add elasticseatch, fluentd, kibana check

author: juanvallejo <jvallejo@redhat.com> 2017-03-22 15:52:35 -0400
committer: juanvallejo <jvallejo@redhat.com> 2017-06-02 16:44:07 -0400
commit: 2e53dbb4c0d9bfe79cd79e0a0ece9db065b286df (patch)
tree: e2bcc99a895fc6647bc7623a8bcbf6c8fd5385f8
parent: 46dca9b8b15ed67adfa2ca617f300e5d1df7c3e0 (diff)
13 files changed, 1575 insertions, 6 deletions
diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py
new file mode 100644
index 000000000..2e60735d6
--- /dev/null
+++ b/roles/openshift_health_checker/library/ocutil.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python
+"""Interface to OpenShift oc command"""
+
+import os
+import shlex
+import shutil
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+ADDITIONAL_PATH_LOOKUPS = ['/usr/local/bin', os.path.expanduser('~/bin')]
+
+
+def locate_oc_binary():
+    """Find and return oc binary file"""
+    # https://github.com/openshift/openshift-ansible/issues/3410
+    # oc can be in /usr/local/bin in some cases, but that may not
+    # be in $PATH due to ansible/sudo
+    paths = os.environ.get("PATH", os.defpath).split(os.pathsep) + ADDITIONAL_PATH_LOOKUPS
+
+    oc_binary = 'oc'
+
+    # Use shutil.which if it is available, otherwise fallback to a naive path search
+    try:
+        which_result = shutil.which(oc_binary, path=os.pathsep.join(paths))
+        if which_result is not None:
+            oc_binary = which_result
+    except AttributeError:
+        for path in paths:
+            if os.path.exists(os.path.join(path, oc_binary)):
+                oc_binary = os.path.join(path, oc_binary)
+                break
+
+    return oc_binary
+
+
+def main():
+    """Module that executes commands on a remote OpenShift cluster"""
+
+    module = AnsibleModule(
+        argument_spec=dict(
+            namespace=dict(type="str", required=True),
+            config_file=dict(type="str", required=True),
+            cmd=dict(type="str", required=True),
+            extra_args=dict(type="list", default=[]),
+        ),
+    )
+
+    cmd = [
+        locate_oc_binary(),
+        '--config', module.params["config_file"],
+        '-n', module.params["namespace"],
+    ] + shlex.split(module.params["cmd"])
+
+    failed = True
+    try:
+        cmd_result = subprocess.check_output(list(cmd), stderr=subprocess.STDOUT)
+        failed = False
+    except subprocess.CalledProcessError as exc:
+        cmd_result = '[rc {}] {}\n{}'.format(exc.returncode, ' '.join(exc.cmd), exc.output)
+    except OSError as exc:
+        # we get this when 'oc' is not there
+        cmd_result = str(exc)
+
+    module.exit_json(
+        changed=False,
+        failed=failed,
+        result=cmd_result,
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..5c9949ced 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -66,16 +66,26 @@ class OpenShiftCheck(object):
 LOADER_EXCLUDES = (
     "__init__.py",
     "mixins.py",
+    "logging.py",
 )
 
 
-def load_checks():
+def load_checks(path=None, subpkg=""):
     """Dynamically import all check modules for the side effect of registering checks."""
-    return [
-        import_module(__package__ + "." + name[:-3])
-        for name in os.listdir(os.path.dirname(__file__))
-        if name.endswith(".py") and name not in LOADER_EXCLUDES
-    ]
+    if path is None:
+        path = os.path.dirname(__file__)
+
+    modules = []
+
+    for name in os.listdir(path):
+        if os.path.isdir(os.path.join(path, name)):
+            modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+            continue
+
+        if name.endswith(".py") and name not in LOADER_EXCLUDES:
+            modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+    return modules
 
 
 def get_var(task_vars, *keys, **kwargs):
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..c9fc59896
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,61 @@
+"""
+Module for performing checks on an Curator logging deployment
+"""
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Curator(LoggingCheck):
+    """Module that checks an integrated logging Curator deployment"""
+
+    name = "curator"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        curator_pods, error = super(Curator, self).get_pods_for_component(
+            self.module_executor,
+            self.logging_namespace,
+            "curator",
+            task_vars
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_curator(curator_pods)
+
+        if check_error:
+            msg = ("The following Curator deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+    def check_curator(self, pods):
+        """Check to see if curator is up and working. Returns: error string"""
+        if not pods:
+            return (
+                "There are no Curator pods for the logging stack,\n"
+                "so nothing will prune Elasticsearch indexes.\n"
+                "Is Curator correctly deployed?"
+            )
+
+        not_running = super(Curator, self).not_running_pods(pods)
+        if len(not_running) == len(pods):
+            return (
+                "The Curator pod is not currently in a running state,\n"
+                "so Elasticsearch indexes may increase without bound."
+            )
+        if len(pods) - len(not_running) > 1:
+            return (
+                "There is more than one Curator pod running. This should not normally happen.\n"
+                "Although this doesn't cause any problems, you may want to investigate."
+            )
+
+        return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..01cb35b81
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,217 @@
+"""
+Module for performing checks on an Elasticsearch logging deployment
+"""
+
+import json
+import re
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+    """Module that checks an integrated logging Elasticsearch deployment"""
+
+    name = "elasticsearch"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "es",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_elasticsearch(es_pods, task_vars)
+
+        if check_error:
+            msg = ("The following Elasticsearch deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
+
+    def _not_running_elasticsearch_pods(self, es_pods):
+        """Returns: list of running pods, list of errors about non-running pods"""
+        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+        if not_running:
+            return not_running, [(
+                'The following Elasticsearch pods are not running:\n'
+                '{pods}'
+                'These pods will not aggregate logs from their nodes.'
+            ).format(pods=''.join(
+                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                for pod in not_running
+            ))]
+        return not_running, []
+
+    def check_elasticsearch(self, es_pods, task_vars):
+        """Various checks for elasticsearch. Returns: error string"""
+        not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
+        running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+        pods_by_name = {
+            pod['metadata']['name']: pod for pod in running_pods
+            # Filter out pods that are not members of a DC
+            if pod['metadata'].get('labels', {}).get('deploymentconfig')
+        }
+        if not pods_by_name:
+            return 'No logging Elasticsearch pods were found. Is logging deployed?'
+        error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
+        error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
+        error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
+        error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
+        return '\n'.join(error_msgs)
+
+    @staticmethod
+    def _build_es_curl_cmd(pod_name, url):
+        base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+        return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+    def _check_elasticsearch_masters(self, pods_by_name, task_vars):
+        """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+        es_master_names = set()
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            # Compare what each ES node reports as master and compare for split brain
+            get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+            master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+            master_names = (master_name_str or '').split(' ')
+            if len(master_names) > 1:
+                es_master_names.add(master_names[1])
+            else:
+                error_msgs.append(
+                    'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+                    '  {response}'.format(pod=pod_name, response=master_name_str)
+                )
+
+        if not es_master_names:
+            error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
+            return '\n'.join(error_msgs)
+
+        if len(es_master_names) > 1:
+            error_msgs.append(
+                'Found multiple Elasticsearch masters according to the pods:\n'
+                '{master_list}\n'
+                'This implies that the masters have "split brain" and are not correctly\n'
+                'replicating data for the logging cluster. Log loss is likely to occur.'
+                .format(master_list='\n'.join('  ' + master for master in es_master_names))
+            )
+
+        return error_msgs
+
+    def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
+        """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+
+        if not pods_by_name:
+            return ['No logging Elasticsearch masters were found. Is logging deployed?']
+
+        # get ES cluster nodes
+        node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+        cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+        try:
+            cluster_nodes = json.loads(cluster_node_data)['nodes']
+        except (ValueError, KeyError):
+            return [
+                'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+                cluster_node_data
+            ]
+
+        # Try to match all ES-reported node hosts to known pods.
+        error_msgs = []
+        for node in cluster_nodes.values():
+            # Note that with 1.4/3.4 the pod IP may be used as the master name
+            if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+                       for pod_name, pod in pods_by_name.items()):
+                error_msgs.append(
+                    'The Elasticsearch cluster reports a member node "{node}"\n'
+                    'that does not correspond to any known ES pod.'.format(node=node['host'])
+                )
+
+        return error_msgs
+
+    def _check_es_cluster_health(self, pods_by_name, task_vars):
+        """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+            cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+            try:
+                health_res = json.loads(cluster_health_data)
+                if not health_res or not health_res.get('status'):
+                    raise ValueError()
+            except ValueError:
+                error_msgs.append(
+                    'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+                    'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+                )
+                continue
+
+            if health_res['status'] not in ['green', 'yellow']:
+                error_msgs.append(
+                    'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+                )
+
+        return error_msgs
+
+    def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+        """
+        Exec into an ES pod and query the diskspace on the persistent volume.
+        Returns: list of errors
+        """
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+            disk_output = self._exec_oc(df_cmd, [], task_vars)
+            lines = disk_output.splitlines()
+            # expecting one header looking like 'IUse% Use%' and one body line
+            body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+            if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+                error_msgs.append(
+                    'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+                    'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+                )
+                continue
+            inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+            inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+            if int(inode_pct) >= int(inode_pct_thresh):
+                error_msgs.append(
+                    'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(inode_pct),
+                        limit=str(inode_pct_thresh),
+                        param='openshift_check_efk_es_inode_pct',
+                    ))
+            disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+            if int(disk_pct) >= int(disk_pct_thresh):
+                error_msgs.append(
+                    'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(disk_pct),
+                        limit=str(disk_pct_thresh),
+                        param='openshift_check_efk_es_storage_pct',
+                    ))
+
+        return error_msgs
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Elasticsearch, self).exec_oc(
+            self.execute_module,
+            self.logging_namespace,
+            cmd_str,
+            extra_args,
+            task_vars,
+        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..627567293
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,170 @@
+"""
+Module for performing checks on an Fluentd logging deployment
+"""
+
+import json
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+    """Module that checks an integrated logging Fluentd deployment"""
+    name = "fluentd"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "fluentd",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_fluentd(fluentd_pods, task_vars)
+
+        if check_error:
+            msg = ("The following Fluentd deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+
+    @staticmethod
+    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+        label, value = node_selector.split('=', 1)
+        fluentd_nodes = {
+            name: node for name, node in nodes_by_name.items()
+            if node['metadata']['labels'].get(label) == value
+        }
+        if not fluentd_nodes:
+            return None, (
+                'There are no nodes with the fluentd label {label}.\n'
+                'This means no logs will be aggregated from the nodes.'
+            ).format(label=node_selector)
+        return fluentd_nodes, None
+
+    @staticmethod
+    def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
+        """Note if nodes are not labeled as expected. Returns: error string"""
+        intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+        if not intended_nodes or '--all' in intended_nodes:
+            intended_nodes = nodes_by_name.keys()
+        nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+        if nodes_missing_labels:
+            return (
+                'The following nodes are supposed to be labeled with {label} but are not:\n'
+                '  {nodes}\n'
+                'Fluentd will not aggregate logs from these nodes.'
+            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
+        return None
+
+    @staticmethod
+    def _check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+        unmatched_nodes = fluentd_nodes.copy()
+        node_names_by_label = {
+            node['metadata']['labels']['kubernetes.io/hostname']: name
+            for name, node in fluentd_nodes.items()
+        }
+        node_names_by_internal_ip = {
+            address['address']: name
+            for name, node in fluentd_nodes.items()
+            for address in node['status']['addresses']
+            if address['type'] == "InternalIP"
+        }
+        for pod in pods:
+            for name in [
+                    pod['spec']['nodeName'],
+                    node_names_by_internal_ip.get(pod['spec']['nodeName']),
+                    node_names_by_label.get(pod.get('spec', {}).get('host')),
+            ]:
+                unmatched_nodes.pop(name, None)
+        if unmatched_nodes:
+            return (
+                'The following nodes are supposed to have a Fluentd pod but do not:\n'
+                '{nodes}'
+                'These nodes will not have their logs aggregated.'
+            ).format(nodes=''.join(
+                "  {}\n".format(name)
+                for name in unmatched_nodes.keys()
+            ))
+        return None
+
+    def _check_fluentd_pods_running(self, pods):
+        """Make sure all fluentd pods are running. Returns: error string"""
+        not_running = super(Fluentd, self).not_running_pods(pods)
+        if not_running:
+            return (
+                'The following Fluentd pods are supposed to be running but are not:\n'
+                '{pods}'
+                'These pods will not aggregate logs from their nodes.'
+            ).format(pods=''.join(
+                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                for pod in not_running
+            ))
+        return None
+
+    def check_fluentd(self, pods, task_vars):
+        """Verify fluentd is running everywhere. Returns: error string"""
+
+        node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
+                                default='logging-infra-fluentd=true')
+
+        nodes_by_name, error = self.get_nodes_by_name(task_vars)
+
+        if error:
+            return error
+        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+        if error:
+            return error
+
+        error_msgs = []
+        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
+        if error:
+            error_msgs.append(error)
+        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
+        if error:
+            error_msgs.append(error)
+        error = self._check_fluentd_pods_running(pods)
+        if error:
+            error_msgs.append(error)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            error_msgs.append(
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            )
+
+        return '\n'.join(error_msgs)
+
+    def get_nodes_by_name(self, task_vars):
+        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
+        nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
+        if not nodes or not nodes.get('items'):  # also should not happen
+            return None, "No nodes appear to be defined according to the API."
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }, None
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Fluentd, self).exec_oc(self.execute_module,
+                                            self.logging_namespace,
+                                            cmd_str,
+                                            extra_args,
+                                            task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..442f407b1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,229 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+    from urllib2 import HTTPError, URLError
+    import urllib2
+except ImportError:
+    from urllib.error import HTTPError, URLError
+    import urllib.request as urllib2
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Kibana(LoggingCheck):
+    """Module that checks an integrated logging Kibana deployment"""
+
+    name = "kibana"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        kibana_pods, error = super(Kibana, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "kibana",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_kibana(kibana_pods)
+
+        if not check_error:
+            check_error = self._check_kibana_route(task_vars)
+
+        if check_error:
+            msg = ("The following Kibana deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+    def _verify_url_internal(self, url, task_vars):
+        """
+        Try to reach a URL from the host.
+        Returns: success (bool), reason (for failure)
+        """
+        args = dict(
+            url=url,
+            follow_redirects='none',
+            validate_certs='no',  # likely to be signed with internal CA
+            # TODO(lmeyer): give users option to validate certs
+            status_code=302,
+        )
+        result = self.execute_module('uri', args, task_vars)
+        if result.get('failed'):
+            return result['msg']
+        return None
+
+    @staticmethod
+    def _verify_url_external(url):
+        """
+        Try to reach a URL from ansible control host.
+        Returns: success (bool), reason (for failure)
+        """
+        # This actually checks from the ansible control host, which may or may not
+        # really be "external" to the cluster.
+
+        # Disable SSL cert validation to work around internally signed certs
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False  # or setting CERT_NONE is refused
+        ctx.verify_mode = ssl.CERT_NONE
+
+        # Verify that the url is returning a valid response
+        try:
+            # We only care if the url connects and responds
+            return_code = urllib2.urlopen(url, context=ctx).getcode()
+        except HTTPError as httperr:
+            return httperr.reason
+        except URLError as urlerr:
+            return str(urlerr)
+
+        # there appears to be no way to prevent urlopen from following redirects
+        if return_code != 200:
+            return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+        return None
+
+    def check_kibana(self, pods):
+        """Check to see if Kibana is up and working. Returns: error string."""
+
+        if not pods:
+            return "There are no Kibana pods deployed, so no access to the logging UI."
+
+        not_running = self.not_running_pods(pods)
+        if len(not_running) == len(pods):
+            return "No Kibana pod is in a running state, so there is no access to the logging UI."
+        elif not_running:
+            return (
+                "The following Kibana pods are not currently in a running state:\n"
+                "{pods}"
+                "However at least one is, so service may not be impacted."
+            ).format(pods="".join("  " + pod['metadata']['name'] + "\n" for pod in not_running))
+
+        return None
+
+    def _get_kibana_url(self, task_vars):
+        """
+        Get kibana route or report error.
+        Returns: url (or empty), reason for failure
+        """
+
+        # Get logging url
+        get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+        if not get_route:
+            return None, 'no_route_exists'
+
+        route = json.loads(get_route)
+
+        # check that the route has been accepted by a router
+        ingress = route["status"]["ingress"]
+        # ingress can be null if there is no router, or empty if not routed
+        if not ingress or not ingress[0]:
+            return None, 'route_not_accepted'
+
+        host = route.get("spec", {}).get("host")
+        if not host:
+            return None, 'route_missing_host'
+
+        return 'https://{}/'.format(host), None
+
+    def _check_kibana_route(self, task_vars):
+        """
+        Check to see if kibana route is up and working.
+        Returns: error string
+        """
+        known_errors = dict(
+            no_route_exists=(
+                'No route is defined for Kibana in the logging namespace,\n'
+                'so the logging stack is not accessible. Is logging deployed?\n'
+                'Did something remove the logging-kibana route?'
+            ),
+            route_not_accepted=(
+                'The logging-kibana route is not being routed by any router.\n'
+                'Is the router deployed and working?'
+            ),
+            route_missing_host=(
+                'The logging-kibana route has no hostname defined,\n'
+                'which should never happen. Did something alter its definition?'
+            ),
+        )
+
+        kibana_url, error = self._get_kibana_url(task_vars)
+        if not kibana_url:
+            return known_errors.get(error, error)
+
+        # first, check that kibana is reachable from the master.
+        error = self._verify_url_internal(kibana_url, task_vars)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                error = (
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'Is kibana running, and is at least one router routing to it?'
+                ).format(url=kibana_url)
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                error = (
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'because the hostname does not resolve.\n'
+                    'Is DNS configured for the Kibana hostname?'
+                ).format(url=kibana_url)
+            elif 'Status code was not' in error:
+                error = (
+                    'A request from this master to the Kibana URL {url}\n'
+                    'did not return the correct status code (302).\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues. The output was:\n'
+                    '  {error}'
+                ).format(url=kibana_url, error=error)
+            return 'Error validating the logging Kibana route:\n' + error
+
+        # in production we would like the kibana route to work from outside the
+        # cluster too; but that may not be the case, so allow disabling just this part.
+        if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
+            return None
+        error = self._verify_url_external(kibana_url)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                error = (
+                    'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+                    'Is the router for the Kibana hostname exposed externally?'
+                ).format(url=kibana_url)
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                error = (
+                    'Failed to resolve the Kibana hostname in {url}\n'
+                    'from the Ansible control host.\n'
+                    'Is DNS configured to resolve this Kibana hostname externally?'
+                ).format(url=kibana_url)
+            elif 'Expected success (200)' in error:
+                error = (
+                    'A request to Kibana at {url}\n'
+                    'returned the wrong error code:\n'
+                    '  {error}\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues.'
+                ).format(url=kibana_url, error=error)
+            error = (
+                'Error validating the logging Kibana route:\n{error}\n'
+                'To disable external Kibana route validation, set in your inventory:\n'
+                '  openshift_check_efk_kibana_external=False'
+            ).format(error=error)
+            return error
+        return None
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Kibana, self).exec_oc(self.execute_module,
+                                           self.logging_namespace,
+                                           cmd_str,
+                                           extra_args,
+                                           task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05b4d300c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,96 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class LoggingCheck(OpenShiftCheck):
+    """Base class for logging component checks"""
+
+    name = "logging"
+
+    @classmethod
+    def is_active(cls, task_vars):
+        return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+
+    @staticmethod
+    def is_first_master(task_vars):
+        """Run only on first master and only when logging is configured. Returns: bool"""
+        logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+        # Note: It would be nice to use membership in oo_first_master group, however for now it
+        # seems best to avoid requiring that setup and just check this is the first master.
+        hostname = get_var(task_vars, "ansible_ssh_host") or [None]
+        masters = get_var(task_vars, "groups", "masters", default=None) or [None]
+        return logging_deployed and masters[0] == hostname
+
+    def run(self, tmp, task_vars):
+        pass
+
+    def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
+        """Get all pods for a given component. Returns: list of pods for component, error string"""
+        pod_output = self.exec_oc(
+            execute_module,
+            namespace,
+            "get pods -l component={} -o json".format(logging_component),
+            [],
+            task_vars
+        )
+        try:
+            pods = json.loads(pod_output)
+            if not pods or not pods.get('items'):
+                raise ValueError()
+        except ValueError:
+            # successful run but non-parsing data generally means there were no pods in the namespace
+            return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+
+        return pods['items'], None
+
+    @staticmethod
+    def not_running_pods(pods):
+        """Returns: list of pods not in a ready and running state"""
+        return [
+            pod for pod in pods
+            if any(
+                container['ready'] is False
+                for container in pod['status']['containerStatuses']
+            ) or not any(
+                condition['type'] == 'Ready' and condition['status'] == 'True'
+                for condition in pod['status']['conditions']
+            )
+        ]
+
+    @staticmethod
+    def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+        """
+        Execute an 'oc' command in the remote host.
+        Returns: output of command and namespace,
+        or raises OpenShiftCheckException on error
+        """
+        config_base = get_var(task_vars, "openshift", "common", "config_base")
+        args = {
+            "namespace": namespace,
+            "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+            "cmd": cmd_str,
+            "extra_args": list(extra_args) if extra_args else [],
+        }
+
+        result = execute_module("ocutil", args, task_vars)
+        if result.get("failed"):
+            msg = (
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'
+            ).format(cmd=args['cmd'], error=result['result'])
+
+            if result['result'] == '[Errno 2] No such file or directory':
+                msg = (
+                    "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+                    "Has an installation been run on this host yet?"
+                )
+            raise OpenShiftCheckException(msg)
+
+        return result.get("result", "")
diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py
new file mode 100644
index 000000000..ae108c96e
--- /dev/null
+++ b/roles/openshift_health_checker/test/curator_test.py
@@ -0,0 +1,68 @@
+import pytest
+
+from openshift_checks.logging.curator import Curator
+
+
+def canned_curator(exec_oc=None):
+    """Create a Curator check object with canned exec_oc method"""
+    check = Curator("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+plain_curator_pod = {
+    "metadata": {
+        "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+        "name": "logging-curator-1",
+    },
+    "status": {
+        "containerStatuses": [{"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "podIP": "10.10.10.10",
+    }
+}
+
+not_running_curator_pod = {
+    "metadata": {
+        "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+        "name": "logging-curator-2",
+    },
+    "status": {
+        "containerStatuses": [{"ready": False}],
+        "conditions": [{"status": "False", "type": "Ready"}],
+        "podIP": "10.10.10.10",
+    }
+}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+    (
+        [],
+        "no Curator pods",
+    ),
+    (
+        [plain_curator_pod],
+        None,
+    ),
+    (
+        [not_running_curator_pod],
+        "not currently in a running state",
+    ),
+    (
+        [plain_curator_pod, plain_curator_pod],
+        "more than one Curator pod",
+    ),
+])
+def test_get_curator_pods(pods, expect_error):
+    check = canned_curator()
+    error = check.check_curator(pods)
+    assert_error(error, expect_error)
diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py
new file mode 100644
index 000000000..b9d375d8c
--- /dev/null
+++ b/roles/openshift_health_checker/test/elasticsearch_test.py
@@ -0,0 +1,180 @@
+import pytest
+import json
+
+from openshift_checks.logging.elasticsearch import Elasticsearch
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+def canned_elasticsearch(exec_oc=None):
+    """Create an Elasticsearch check object with canned exec_oc method"""
+    check = Elasticsearch("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+plain_es_pod = {
+    "metadata": {
+        "labels": {"component": "es", "deploymentconfig": "logging-es"},
+        "name": "logging-es",
+    },
+    "status": {
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "containerStatuses": [{"ready": True}],
+        "podIP": "10.10.10.10",
+    },
+    "_test_master_name_str": "name logging-es",
+}
+
+split_es_pod = {
+    "metadata": {
+        "labels": {"component": "es", "deploymentconfig": "logging-es-2"},
+        "name": "logging-es-2",
+    },
+    "status": {
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "containerStatuses": [{"ready": True}],
+        "podIP": "10.10.10.10",
+    },
+    "_test_master_name_str": "name logging-es-2",
+}
+
+
+def test_check_elasticsearch():
+    assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([], {})
+
+    # canned oc responses to match so all the checks pass
+    def _exec_oc(cmd, args, task_vars):
+        if '_cat/master' in cmd:
+            return 'name logging-es'
+        elif '/_nodes' in cmd:
+            return json.dumps(es_node_list)
+        elif '_cluster/health' in cmd:
+            return '{"status": "green"}'
+        elif ' df ' in cmd:
+            return 'IUse% Use%\n 3%  4%\n'
+        else:
+            raise Exception(cmd)
+
+    assert not canned_elasticsearch(_exec_oc).check_elasticsearch([plain_es_pod], {})
+
+
+def pods_by_name(pods):
+    return {pod['metadata']['name']: pod for pod in pods}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+    (
+        [],
+        'No logging Elasticsearch masters',
+    ),
+    (
+        [plain_es_pod],
+        None,
+    ),
+    (
+        [plain_es_pod, split_es_pod],
+        'Found multiple Elasticsearch masters',
+    ),
+])
+def test_check_elasticsearch_masters(pods, expect_error):
+    test_pods = list(pods)
+    check = canned_elasticsearch(lambda cmd, args, task_vars: test_pods.pop(0)['_test_master_name_str'])
+
+    errors = check._check_elasticsearch_masters(pods_by_name(pods), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
+
+
+es_node_list = {
+    'nodes': {
+        'random-es-name': {
+            'host': 'logging-es',
+        }}}
+
+
+@pytest.mark.parametrize('pods, node_list, expect_error', [
+    (
+        [],
+        {},
+        'No logging Elasticsearch masters',
+    ),
+    (
+        [plain_es_pod],
+        es_node_list,
+        None,
+    ),
+    (
+        [plain_es_pod],
+        {},  # empty list of nodes triggers KeyError
+        "Failed to query",
+    ),
+    (
+        [split_es_pod],
+        es_node_list,
+        'does not correspond to any known ES pod',
+    ),
+])
+def test_check_elasticsearch_node_list(pods, node_list, expect_error):
+    check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(node_list))
+
+    errors = check._check_elasticsearch_node_list(pods_by_name(pods), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
+
+
+@pytest.mark.parametrize('pods, health_data, expect_error', [
+    (
+        [plain_es_pod],
+        [{"status": "green"}],
+        None,
+    ),
+    (
+        [plain_es_pod],
+        [{"no-status": "should bomb"}],
+        'Could not retrieve cluster health status',
+    ),
+    (
+        [plain_es_pod, split_es_pod],
+        [{"status": "green"}, {"status": "red"}],
+        'Elasticsearch cluster health status is RED',
+    ),
+])
+def test_check_elasticsearch_cluster_health(pods, health_data, expect_error):
+    test_health_data = list(health_data)
+    check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(test_health_data.pop(0)))
+
+    errors = check._check_es_cluster_health(pods_by_name(pods), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
+
+
+@pytest.mark.parametrize('disk_data, expect_error', [
+    (
+        'df: /elasticsearch/persistent: No such file or directory\n',
+        'Could not retrieve storage usage',
+    ),
+    (
+        'IUse% Use%\n 3%  4%\n',
+        None,
+    ),
+    (
+        'IUse% Use%\n 95%  40%\n',
+        'Inode percent usage on the storage volume',
+    ),
+    (
+        'IUse% Use%\n 3%  94%\n',
+        'Disk percent usage on the storage volume',
+    ),
+])
+def test_check_elasticsearch_diskspace(disk_data, expect_error):
+    check = canned_elasticsearch(lambda cmd, args, task_vars: disk_data)
+
+    errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod]), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py
new file mode 100644
index 000000000..d151c0b19
--- /dev/null
+++ b/roles/openshift_health_checker/test/fluentd_test.py
@@ -0,0 +1,109 @@
+import pytest
+import json
+
+from openshift_checks.logging.fluentd import Fluentd
+
+
+def canned_fluentd(exec_oc=None):
+    """Create a Fluentd check object with canned exec_oc method"""
+    check = Fluentd("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+fluentd_pod_node1 = {
+    "metadata": {
+        "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+        "name": "logging-fluentd-1",
+    },
+    "spec": {"host": "node1", "nodeName": "node1"},
+    "status": {
+        "containerStatuses": [{"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+fluentd_pod_node2_down = {
+    "metadata": {
+        "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+        "name": "logging-fluentd-2",
+    },
+    "spec": {"host": "node2", "nodeName": "node2"},
+    "status": {
+        "containerStatuses": [{"ready": False}],
+        "conditions": [{"status": "False", "type": "Ready"}],
+    }
+}
+fluentd_node1 = {
+    "metadata": {
+        "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "node1"},
+        "name": "node1",
+    },
+    "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.1"}]},
+}
+fluentd_node2 = {
+    "metadata": {
+        "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "hostname"},
+        "name": "node2",
+    },
+    "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.2"}]},
+}
+fluentd_node3_unlabeled = {
+    "metadata": {
+        "labels": {"kubernetes.io/hostname": "hostname"},
+        "name": "node3",
+    },
+    "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.3"}]},
+}
+
+
+@pytest.mark.parametrize('pods, nodes, expect_error', [
+    (
+        [],
+        [],
+        'No nodes appear to be defined',
+    ),
+    (
+        [],
+        [fluentd_node3_unlabeled],
+        'There are no nodes with the fluentd label',
+    ),
+    (
+        [],
+        [fluentd_node1, fluentd_node3_unlabeled],
+        'Fluentd will not aggregate logs from these nodes.',
+    ),
+    (
+        [],
+        [fluentd_node2],
+        "nodes are supposed to have a Fluentd pod but do not",
+    ),
+    (
+        [fluentd_pod_node1, fluentd_pod_node1],
+        [fluentd_node1],
+        'more Fluentd pods running than nodes labeled',
+    ),
+    (
+        [fluentd_pod_node2_down],
+        [fluentd_node2],
+        "Fluentd pods are supposed to be running",
+    ),
+    (
+        [fluentd_pod_node1],
+        [fluentd_node1],
+        None,
+    ),
+])
+def test_get_fluentd_pods(pods, nodes, expect_error):
+    check = canned_fluentd(lambda cmd, args, task_vars: json.dumps(dict(items=nodes)))
+
+    error = check.check_fluentd(pods, {})
+    assert_error(error, expect_error)
diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py
new file mode 100644
index 000000000..19140a1b6
--- /dev/null
+++ b/roles/openshift_health_checker/test/kibana_test.py
@@ -0,0 +1,218 @@
+import pytest
+import json
+
+try:
+    import urllib2
+    from urllib2 import HTTPError, URLError
+except ImportError:
+    from urllib.error import HTTPError, URLError
+    import urllib.request as urllib2
+
+from openshift_checks.logging.kibana import Kibana
+
+
+def canned_kibana(exec_oc=None):
+    """Create a Kibana check object with canned exec_oc method"""
+    check = Kibana("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+plain_kibana_pod = {
+    "metadata": {
+        "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+        "name": "logging-kibana-1",
+    },
+    "status": {
+        "containerStatuses": [{"ready": True}, {"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+not_running_kibana_pod = {
+    "metadata": {
+        "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+        "name": "logging-kibana-2",
+    },
+    "status": {
+        "containerStatuses": [{"ready": True}, {"ready": False}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+    (
+        [],
+        "There are no Kibana pods deployed",
+    ),
+    (
+        [plain_kibana_pod],
+        None,
+    ),
+    (
+        [not_running_kibana_pod],
+        "No Kibana pod is in a running state",
+    ),
+    (
+        [plain_kibana_pod, not_running_kibana_pod],
+        "The following Kibana pods are not currently in a running state",
+    ),
+])
+def test_check_kibana(pods, expect_error):
+    check = canned_kibana()
+    error = check.check_kibana(pods)
+    assert_error(error, expect_error)
+
+
+@pytest.mark.parametrize('route, expect_url, expect_error', [
+    (
+        None,
+        None,
+        'no_route_exists',
+    ),
+
+    # test route with no ingress
+    (
+        {
+            "metadata": {
+                "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+                "name": "logging-kibana",
+            },
+            "status": {
+                "ingress": [],
+            },
+            "spec": {
+                "host": "hostname",
+            }
+        },
+        None,
+        'route_not_accepted',
+    ),
+
+    # test route with no host
+    (
+        {
+            "metadata": {
+                "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+                "name": "logging-kibana",
+            },
+            "status": {
+                "ingress": [{
+                    "status": True,
+                }],
+            },
+            "spec": {},
+        },
+        None,
+        'route_missing_host',
+    ),
+
+    # test route that looks fine
+    (
+        {
+            "metadata": {
+                "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+                "name": "logging-kibana",
+            },
+            "status": {
+                "ingress": [{
+                    "status": True,
+                }],
+            },
+            "spec": {
+                "host": "hostname",
+            },
+        },
+        "https://hostname/",
+        None,
+    ),
+])
+def test_get_kibana_url(route, expect_url, expect_error):
+    check = canned_kibana(lambda cmd, args, task_vars: json.dumps(route) if route else "")
+
+    url, error = check._get_kibana_url({})
+    if expect_url:
+        assert url == expect_url
+    else:
+        assert not url
+    if expect_error:
+        assert error == expect_error
+    else:
+        assert not error
+
+
+@pytest.mark.parametrize('exec_result, expect', [
+    (
+        'urlopen error [Errno 111] Connection refused',
+        'at least one router routing to it?',
+    ),
+    (
+        'urlopen error [Errno -2] Name or service not known',
+        'DNS configured for the Kibana hostname?',
+    ),
+    (
+        'Status code was not [302]: HTTP Error 500: Server error',
+        'did not return the correct status code',
+    ),
+    (
+        'bork bork bork',
+        'bork bork bork',  # should pass through
+    ),
+])
+def test_verify_url_internal_failure(exec_result, expect):
+    check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result))
+    check._get_kibana_url = lambda task_vars: ('url', None)
+
+    error = check._check_kibana_route({})
+    assert_error(error, expect)
+
+
+@pytest.mark.parametrize('lib_result, expect', [
+    (
+        HTTPError('url', 500, "it broke", hdrs=None, fp=None),
+        'it broke',
+    ),
+    (
+        URLError('it broke'),
+        'it broke',
+    ),
+    (
+        302,
+        'returned the wrong error code',
+    ),
+    (
+        200,
+        None,
+    ),
+])
+def test_verify_url_external_failure(lib_result, expect, monkeypatch):
+
+    class _http_return:
+
+        def __init__(self, code):
+            self.code = code
+
+        def getcode(self):
+            return self.code
+
+    def urlopen(url, context):
+        if type(lib_result) is int:
+            return _http_return(lib_result)
+        raise lib_result
+    monkeypatch.setattr(urllib2, 'urlopen', urlopen)
+
+    check = canned_kibana()
+    check._get_kibana_url = lambda task_vars: ('url', None)
+    check._verify_url_internal = lambda url, task_vars: None
+
+    error = check._check_kibana_route({})
+    assert_error(error, expect)
diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py
new file mode 100644
index 000000000..b6db34fe3
--- /dev/null
+++ b/roles/openshift_health_checker/test/logging_check_test.py
@@ -0,0 +1,137 @@
+import pytest
+import json
+
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+logging_namespace = "logging"
+
+
+def canned_loggingcheck(exec_oc=None):
+    """Create a LoggingCheck object with canned exec_oc method"""
+    check = LoggingCheck("dummy")  # fails if a module is actually invoked
+    check.logging_namespace = 'logging'
+    if exec_oc:
+        check.exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+plain_es_pod = {
+    "metadata": {
+        "labels": {"component": "es", "deploymentconfig": "logging-es"},
+        "name": "logging-es",
+    },
+    "status": {
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "containerStatuses": [{"ready": True}],
+        "podIP": "10.10.10.10",
+    },
+    "_test_master_name_str": "name logging-es",
+}
+
+plain_kibana_pod = {
+    "metadata": {
+        "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+        "name": "logging-kibana-1",
+    },
+    "status": {
+        "containerStatuses": [{"ready": True}, {"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+
+fluentd_pod_node1 = {
+    "metadata": {
+        "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+        "name": "logging-fluentd-1",
+    },
+    "spec": {"host": "node1", "nodeName": "node1"},
+    "status": {
+        "containerStatuses": [{"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+
+plain_curator_pod = {
+    "metadata": {
+        "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+        "name": "logging-curator-1",
+    },
+    "status": {
+        "containerStatuses": [{"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "podIP": "10.10.10.10",
+    }
+}
+
+
+@pytest.mark.parametrize('problem, expect', [
+    ("[Errno 2] No such file or directory", "supposed to be a master"),
+    ("Permission denied", "Unexpected error using `oc`"),
+])
+def test_oc_failure(problem, expect):
+    def execute_module(module_name, args, task_vars):
+        if module_name == "ocutil":
+            return dict(failed=True, result=problem)
+        return dict(changed=False)
+
+    check = LoggingCheck({})
+
+    with pytest.raises(OpenShiftCheckException) as excinfo:
+        check.exec_oc(execute_module, logging_namespace, 'get foo', [], task_vars=task_vars_config_base)
+    assert expect in str(excinfo)
+
+
+groups_with_first_master = dict(masters=['this-host', 'other-host'])
+groups_with_second_master = dict(masters=['other-host', 'this-host'])
+groups_not_a_master = dict(masters=['other-host'])
+
+
+@pytest.mark.parametrize('groups, logging_deployed, is_active', [
+    (groups_with_first_master, True, True),
+    (groups_with_first_master, False, False),
+    (groups_not_a_master, True, False),
+    (groups_with_second_master, True, False),
+    (groups_not_a_master, True, False),
+])
+def test_is_active(groups, logging_deployed, is_active):
+    task_vars = dict(
+        ansible_ssh_host='this-host',
+        groups=groups,
+        openshift_hosted_logging_deploy=logging_deployed,
+    )
+
+    assert LoggingCheck.is_active(task_vars=task_vars) == is_active
+
+
+@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [
+    (
+        'No resources found.',
+        None,
+        'There are no pods in the logging namespace',
+    ),
+    (
+        json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}),
+        [plain_es_pod],
+        None,
+    ),
+])
+def test_get_pods_for_component(pod_output, expect_pods, expect_error):
+    check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: pod_output)
+    pods, error = check.get_pods_for_component(
+        lambda name, args, task_vars: {},
+        logging_namespace,
+        "es",
+        {}
+    )
+    assert_error(error, expect_error)
author	juanvallejo <jvallejo@redhat.com>	2017-03-22 15:52:35 -0400
committer	juanvallejo <jvallejo@redhat.com>	2017-06-02 16:44:07 -0400
commit	2e53dbb4c0d9bfe79cd79e0a0ece9db065b286df (patch)
tree	e2bcc99a895fc6647bc7623a8bcbf6c8fd5385f8
parent	46dca9b8b15ed67adfa2ca617f300e5d1df7c3e0 (diff)