From bf0828bc0f2e3088df20abc77e30a162595e1c22 Mon Sep 17 00:00:00 2001 From: Luke Meyer Date: Thu, 20 Jul 2017 22:25:47 -0400 Subject: openshift_checks: add property to track 'changed' Introduced the 'changed' property for checks that can make changes to track whether they did or not. Rather than the check's own logic having to track this and include it in the result hash, just set the property and have the action plugin insert it in the result hash after running (even if there is an exception). Cleared out a lot of crufty "changed: false" hash entries. --- .../action_plugins/openshift_health_check.py | 10 ++++++---- .../openshift_health_checker/openshift_checks/__init__.py | 2 ++ .../openshift_checks/docker_image_availability.py | 10 ++++------ .../openshift_checks/docker_storage.py | 10 ++++------ .../openshift_checks/etcd_imagedata_size.py | 4 ++-- .../openshift_checks/etcd_volume.py | 2 +- .../openshift_checks/logging/curator.py | 6 +++--- .../openshift_checks/logging/elasticsearch.py | 6 +++--- .../openshift_checks/logging/fluentd.py | 6 +++--- .../openshift_checks/logging/kibana.py | 6 +++--- roles/openshift_health_checker/openshift_checks/mixins.py | 8 ++++---- roles/openshift_health_checker/test/action_plugin_test.py | 14 +++++++++----- 12 files changed, 44 insertions(+), 40 deletions(-) (limited to 'roles') diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index 23da53940..05e53333d 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -68,13 +68,15 @@ class ActionModule(ActionBase): msg=str(e), ) + if check.changed: + r["changed"] = True check_results[check_name] = r - if r.get("failed", False): - result["failed"] = True - result["msg"] = "One or more checks failed" + result["changed"] = any(r.get("changed") for r in check_results.values()) + if any(r.get("failed") for r in check_results.values()): + result["failed"] = True + result["msg"] = "One or more checks failed" - result["changed"] = any(r.get("changed", False) for r in check_results.values()) return result def load_known_checks(self, tmp, task_vars): diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index 85cbc6224..5d289e522 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -34,6 +34,8 @@ class OpenShiftCheck(object): self._execute_module = execute_module self.task_vars = task_vars or {} self.tmp = tmp + # set True when the check makes a change to the host so it can be reported to the user: + self.changed = False @abstractproperty def name(self): diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 77180223e..85a922f86 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -41,11 +41,10 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type def run(self): - msg, failed, changed = self.ensure_dependencies() + msg, failed = self.ensure_dependencies() if failed: return { "failed": True, - "changed": changed, "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg } @@ -54,11 +53,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): # exit early if all images were found locally if not missing_images: - return {"changed": changed} + return {} registries = self.known_docker_registries() if not registries: - return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed} + return {"failed": True, "msg": "Unable to retrieve any docker registries."} available_images = self.available_images(missing_images, registries) unavailable_images = set(missing_images) - set(available_images) @@ -70,10 +69,9 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): "One or more required Docker images are not available:\n {}\n" "Configured registries: {}" ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)), - "changed": changed, } - return {"changed": changed} + return {} def required_images(self): """ diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py index dea15a56e..7ae384bd7 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_storage.py +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -43,21 +43,20 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): ] def run(self): - msg, failed, changed = self.ensure_dependencies() + msg, failed = self.ensure_dependencies() if failed: return { "failed": True, - "changed": changed, "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg } # attempt to get the docker info hash from the API docker_info = self.execute_module("docker_info", {}) if docker_info.get("failed"): - return {"failed": True, "changed": changed, + return {"failed": True, "msg": "Failed to query Docker API. Is docker running on this host?"} if not docker_info.get("info"): # this would be very strange - return {"failed": True, "changed": changed, + return {"failed": True, "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))} docker_info = docker_info["info"] @@ -68,7 +67,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): "Detected unsupported Docker storage driver '{driver}'.\n" "Supported storage drivers are: {drivers}" ).format(driver=driver, drivers=', '.join(self.storage_drivers)) - return {"failed": True, "changed": changed, "msg": msg} + return {"failed": True, "msg": msg} # driver status info is a list of tuples; convert to dict and validate based on driver driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} @@ -81,7 +80,6 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): if driver in ['overlay', 'overlay2']: result = self.check_overlay_support(docker_info, driver_status) - result['changed'] = result.get('changed', False) or changed return result def check_devicemapper_support(self, driver_status): diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py index 28c38504d..ae8460b7e 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py @@ -56,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck): reason = etcdkeysize["module_stderr"] msg = msg.format(host=etcd_host, reason=reason) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} if etcdkeysize["size_limit_exceeded"]: limit = self._to_gigabytes(etcd_imagedata_size_limit) @@ -65,7 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck): "Use the `oadm prune images` command to cleanup unused Docker images.") return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)} - return {"changed": False} + return {} @staticmethod def _get_etcd_mountpath(ansible_mounts): diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index da7d0364a..e55d55e91 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -40,7 +40,7 @@ class EtcdVolume(OpenShiftCheck): ) return {"failed": True, "msg": msg} - return {"changed": False} + return {} def _etcd_mount_info(self): ansible_mounts = self.get_var("ansible_mounts") diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index 32d853d57..32a92c909 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -18,16 +18,16 @@ class Curator(LoggingCheck): "curator", ) if error: - return {"failed": True, "changed": False, "msg": error} + return {"failed": True, "msg": error} check_error = self.check_curator(curator_pods) if check_error: msg = ("The following Curator deployment issue was found:" "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'} + return {"failed": False, "msg": 'No problems found with Curator deployment.'} def check_curator(self, pods): """Check to see if curator is up and working. Returns: error string""" diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index 8bdda1f32..b2e9a8f49 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -21,16 +21,16 @@ class Elasticsearch(LoggingCheck): "es", ) if error: - return {"failed": True, "changed": False, "msg": error} + return {"failed": True, "msg": error} check_error = self.check_elasticsearch(es_pods) if check_error: msg = ("The following Elasticsearch deployment issue was found:" "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'} + return {"failed": False, "msg": 'No problems found with Elasticsearch deployment.'} def _not_running_elasticsearch_pods(self, es_pods): """Returns: list of pods that are not running, list of errors about non-running pods""" diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index b3485bf44..69c7b4392 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -20,16 +20,16 @@ class Fluentd(LoggingCheck): "fluentd", ) if error: - return {"failed": True, "changed": False, "msg": error} + return {"failed": True, "msg": error} check_error = self.check_fluentd(fluentd_pods) if check_error: msg = ("The following Fluentd deployment issue was found:" "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'} + return {"failed": False, "msg": 'No problems found with Fluentd deployment.'} @staticmethod def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index efb14ab42..c600bb47e 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -30,7 +30,7 @@ class Kibana(LoggingCheck): "kibana", ) if error: - return {"failed": True, "changed": False, "msg": error} + return {"failed": True, "msg": error} check_error = self.check_kibana(kibana_pods) if not check_error: @@ -39,10 +39,10 @@ class Kibana(LoggingCheck): if check_error: msg = ("The following Kibana deployment issue was found:" "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'} + return {"failed": False, "msg": 'No problems found with Kibana deployment.'} def _verify_url_internal(self, url): """ diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index 3b2c64e6a..e9bae60a3 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -29,10 +29,10 @@ class DockerHostMixin(object): """ Ensure that docker-related packages exist, but not on atomic hosts (which would not be able to install but should already have them). - Returns: msg, failed, changed + Returns: msg, failed """ if self.get_var("openshift", "common", "is_atomic"): - return "", False, False + return "", False # NOTE: we would use the "package" module but it's actually an action plugin # and it's not clear how to invoke one of those. This is about the same anyway: @@ -49,5 +49,5 @@ class DockerHostMixin(object): " {deps}\n{msg}" ).format(deps=',\n '.join(self.dependencies), msg=msg) failed = result.get("failed", False) or result.get("rc", 0) != 0 - changed = result.get("changed", False) - return msg, failed, changed + self.changed = result.get("changed", False) + return msg, failed diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py index 2d068be3d..f5161d6f5 100644 --- a/roles/openshift_health_checker/test/action_plugin_test.py +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -6,7 +6,7 @@ from openshift_health_check import ActionModule, resolve_checks from openshift_checks import OpenShiftCheckException -def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None): +def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None, changed=False): """Returns a new class that is compatible with OpenShiftCheck for testing.""" _name, _tags = name, tags @@ -14,6 +14,7 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru class FakeCheck(object): name = _name tags = _tags or [] + changed = False def __init__(self, execute_module=None, task_vars=None, tmp=None): pass @@ -22,6 +23,7 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru return is_active def run(self): + self.changed = changed if run_exception is not None: raise run_exception return run_return @@ -135,14 +137,15 @@ def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch): def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch): - check_return_value = {'ok': 'test', 'changed': True} - check_class = fake_check(run_return=check_return_value) + check_return_value = {'ok': 'test'} + check_class = fake_check(run_return=check_return_value, changed=True) monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()}) monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) result = plugin.run(tmp=None, task_vars=task_vars) assert result['checks']['fake_check'] == check_return_value + assert changed(result['checks']['fake_check']) assert not failed(result) assert changed(result) assert not skipped(result) @@ -165,7 +168,7 @@ def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch): def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch): exception_msg = 'fake check has an exception' run_exception = OpenShiftCheckException(exception_msg) - check_class = fake_check(run_exception=run_exception) + check_class = fake_check(run_exception=run_exception, changed=True) monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()}) monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) @@ -173,7 +176,8 @@ def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch): assert failed(result['checks']['fake_check'], msg_has=exception_msg) assert failed(result, msg_has=['failed']) - assert not changed(result) + assert changed(result['checks']['fake_check']) + assert changed(result) assert not skipped(result) -- cgit v1.2.3 From 06a6fb9642a2cc70b1ca65f403b853fe8ce9d4b2 Mon Sep 17 00:00:00 2001 From: Luke Meyer Date: Thu, 20 Jul 2017 23:39:47 -0400 Subject: openshift_checks: refactor logging checks Turn failure messages into exceptions that tests can look for without depending on text meant for humans. Turn logging_namespace property into a method. Get rid of _exec_oc and just use logging.exec_oc. --- .../openshift_checks/__init__.py | 29 ++- .../openshift_checks/logging/curator.py | 32 ++-- .../openshift_checks/logging/elasticsearch.py | 166 ++++++++-------- .../openshift_checks/logging/fluentd.py | 191 +++++++++---------- .../openshift_checks/logging/fluentd_config.py | 17 +- .../openshift_checks/logging/kibana.py | 208 +++++++++++---------- .../openshift_checks/logging/logging.py | 54 ++++-- .../openshift_checks/logging/logging_index_time.py | 75 ++++---- .../openshift_health_checker/test/curator_test.py | 45 ++--- .../test/elasticsearch_test.py | 148 +++++++++------ .../test/fluentd_config_test.py | 15 +- .../openshift_health_checker/test/fluentd_test.py | 55 +++--- roles/openshift_health_checker/test/kibana_test.py | 149 +++++++++------ .../test/logging_check_test.py | 49 ++--- .../test/logging_index_time_test.py | 82 ++++---- 15 files changed, 699 insertions(+), 616 deletions(-) (limited to 'roles') diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index 5d289e522..f26008c9f 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -13,8 +13,30 @@ from ansible.module_utils.six.moves import reduce # pylint: disable=import-erro class OpenShiftCheckException(Exception): - """Raised when a check cannot proceed.""" - pass + """Raised when a check encounters a failure condition.""" + + def __init__(self, name, msg=None): + # msg is for the message the user will see when this is raised. + # name is for test code to identify the error without looking at msg text. + if msg is None: # for parameter backward compatibility + msg = name + name = self.__class__.__name__ + self.name = name + super(OpenShiftCheckException, self).__init__(msg) + + +class OpenShiftCheckExceptionList(OpenShiftCheckException): + """A container for multiple logging errors that may be detected in one check.""" + def __init__(self, errors): + self.errors = errors + super(OpenShiftCheckExceptionList, self).__init__( + 'OpenShiftCheckExceptionList', + '\n'.join(str(msg) for msg in errors) + ) + + # make iterable + def __getitem__(self, index): + return self.errors[index] @six.add_metaclass(ABCMeta) @@ -34,7 +56,8 @@ class OpenShiftCheck(object): self._execute_module = execute_module self.task_vars = task_vars or {} self.tmp = tmp - # set True when the check makes a change to the host so it can be reported to the user: + + # set to True when the check changes the host, for accurate total "changed" count self.changed = False @abstractproperty diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index 32a92c909..b27f97172 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -1,6 +1,6 @@ """Check for an aggregated logging Curator deployment""" -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck class Curator(LoggingCheck): @@ -12,27 +12,17 @@ class Curator(LoggingCheck): def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - curator_pods, error = self.get_pods_for_component( - self.logging_namespace, - "curator", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_curator(curator_pods) - - if check_error: - msg = ("The following Curator deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + curator_pods = self.get_pods_for_component("curator") + self.check_curator(curator_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Curator deployment.'} + + return {} def check_curator(self, pods): """Check to see if curator is up and working. Returns: error string""" if not pods: - return ( + raise OpenShiftCheckException( + "MissingComponentPods", "There are no Curator pods for the logging stack,\n" "so nothing will prune Elasticsearch indexes.\n" "Is Curator correctly deployed?" @@ -40,14 +30,14 @@ class Curator(LoggingCheck): not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return ( + raise OpenShiftCheckException( + "CuratorNotRunning", "The Curator pod is not currently in a running state,\n" "so Elasticsearch indexes may increase without bound." ) if len(pods) - len(not_running) > 1: - return ( + raise OpenShiftCheckException( + "TooManyCurators", "There is more than one Curator pod running. This should not normally happen.\n" "Although this doesn't cause any problems, you may want to investigate." ) - - return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index b2e9a8f49..7fc843fd7 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -3,6 +3,7 @@ import json import re +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck @@ -15,168 +16,178 @@ class Elasticsearch(LoggingCheck): def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - es_pods, error = self.get_pods_for_component( - self.logging_namespace, - "es", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_elasticsearch(es_pods) - - if check_error: - msg = ("The following Elasticsearch deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + es_pods = self.get_pods_for_component("es") + self.check_elasticsearch(es_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Elasticsearch deployment.'} - def _not_running_elasticsearch_pods(self, es_pods): - """Returns: list of pods that are not running, list of errors about non-running pods""" - not_running = self.not_running_pods(es_pods) - if not_running: - return not_running, [( - 'The following Elasticsearch pods are not running:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - ))] - return not_running, [] + return {} def check_elasticsearch(self, es_pods): - """Various checks for elasticsearch. Returns: error string""" - not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) - running_pods = [pod for pod in es_pods if pod not in not_running_pods] + """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors.""" + running_pods, errors = self.running_elasticsearch_pods(es_pods) pods_by_name = { pod['metadata']['name']: pod for pod in running_pods # Filter out pods that are not members of a DC if pod['metadata'].get('labels', {}).get('deploymentconfig') } if not pods_by_name: - return 'No logging Elasticsearch pods were found. Is logging deployed?' - error_msgs += self._check_elasticsearch_masters(pods_by_name) - error_msgs += self._check_elasticsearch_node_list(pods_by_name) - error_msgs += self._check_es_cluster_health(pods_by_name) - error_msgs += self._check_elasticsearch_diskspace(pods_by_name) - return '\n'.join(error_msgs) + # nothing running, cannot run the rest of the check + errors.append(OpenShiftCheckException( + 'NoRunningPods', + 'No logging Elasticsearch pods were found running, so no logs are being aggregated.' + )) + raise OpenShiftCheckExceptionList(errors) + + errors += self.check_elasticsearch_masters(pods_by_name) + errors += self.check_elasticsearch_node_list(pods_by_name) + errors += self.check_es_cluster_health(pods_by_name) + errors += self.check_elasticsearch_diskspace(pods_by_name) + if errors: + raise OpenShiftCheckExceptionList(errors) + + def running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = self.not_running_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running] + if not_running: + return running_pods, [OpenShiftCheckException( + 'PodNotRunning', + 'The following Elasticsearch pods are defined but not running:\n' + '{pods}'.format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + )] + return running_pods, [] @staticmethod def _build_es_curl_cmd(pod_name, url): base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) - def _check_elasticsearch_masters(self, pods_by_name): - """Check that Elasticsearch masters are sane. Returns: list of error strings""" + def check_elasticsearch_masters(self, pods_by_name): + """Check that Elasticsearch masters are sane. Returns: list of errors""" es_master_names = set() - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): # Compare what each ES node reports as master and compare for split brain get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") - master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, []) + master_name_str = self.exec_oc(get_master_cmd, []) master_names = (master_name_str or '').split(' ') if len(master_names) > 1: es_master_names.add(master_names[1]) else: - error_msgs.append( - 'No master? Elasticsearch {pod} returned bad string when asked master name:\n' + errors.append(OpenShiftCheckException( + 'NoMasterName', + 'Elasticsearch {pod} gave unexpected response when asked master name:\n' ' {response}'.format(pod=pod_name, response=master_name_str) - ) + )) if not es_master_names: - error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?') - return '\n'.join(error_msgs) + errors.append(OpenShiftCheckException( + 'NoMasterFound', + 'No logging Elasticsearch masters were found.' + )) + return errors if len(es_master_names) > 1: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'SplitBrainMasters', 'Found multiple Elasticsearch masters according to the pods:\n' '{master_list}\n' 'This implies that the masters have "split brain" and are not correctly\n' 'replicating data for the logging cluster. Log loss is likely to occur.' .format(master_list='\n'.join(' ' + master for master in es_master_names)) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_node_list(self, pods_by_name): - """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" + def check_elasticsearch_node_list(self, pods_by_name): + """Check that reported ES masters are accounted for by pods. Returns: list of errors""" if not pods_by_name: - return ['No logging Elasticsearch masters were found. Is logging deployed?'] + return [OpenShiftCheckException( + 'MissingComponentPods', + 'No logging Elasticsearch pods were found.' + )] # get ES cluster nodes node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') - cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, []) + cluster_node_data = self.exec_oc(node_cmd, []) try: cluster_nodes = json.loads(cluster_node_data)['nodes'] except (ValueError, KeyError): - return [ + return [OpenShiftCheckException( + 'MissingNodeList', 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + cluster_node_data - ] + )] # Try to match all ES-reported node hosts to known pods. - error_msgs = [] + errors = [] for node in cluster_nodes.values(): # Note that with 1.4/3.4 the pod IP may be used as the master name if not any(node['host'] in (pod_name, pod['status'].get('podIP')) for pod_name, pod in pods_by_name.items()): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsPodNodeMismatch', 'The Elasticsearch cluster reports a member node "{node}"\n' 'that does not correspond to any known ES pod.'.format(node=node['host']) - ) + )) - return error_msgs + return errors - def _check_es_cluster_health(self, pods_by_name): + def check_es_cluster_health(self, pods_by_name): """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') - cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, []) + cluster_health_data = self.exec_oc(cluster_health_cmd, []) try: health_res = json.loads(cluster_health_data) if not health_res or not health_res.get('status'): raise ValueError() except ValueError: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadEsResponse', 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) - ) + )) continue if health_res['status'] not in ['green', 'yellow']: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsClusterHealthRed', 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_diskspace(self, pods_by_name): + def check_elasticsearch_diskspace(self, pods_by_name): """ Exec into an ES pod and query the diskspace on the persistent volume. Returns: list of errors """ - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) - disk_output = self.exec_oc(self.logging_namespace, df_cmd, []) + disk_output = self.exec_oc(df_cmd, []) lines = disk_output.splitlines() # expecting one header looking like 'IUse% Use%' and one body line body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadDfResponse', 'Could not retrieve storage usage from logging ES pod "{pod}".\n' 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) - ) + )) continue inode_pct, disk_pct = re.match(body_re, lines[1]).groups() inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90') if int(inode_pct) >= int(inode_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'InodeUsageTooHigh', 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -184,10 +195,11 @@ class Elasticsearch(LoggingCheck): pct=str(inode_pct), limit=str(inode_pct_thresh), param='openshift_check_efk_es_inode_pct', - )) + ))) disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80') if int(disk_pct) >= int(disk_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'DiskUsageTooHigh', 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -195,6 +207,6 @@ class Elasticsearch(LoggingCheck): pct=str(disk_pct), limit=str(disk_pct_thresh), param='openshift_check_efk_es_storage_pct', - )) + ))) - return error_msgs + return errors diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index 69c7b4392..3b192a281 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -2,6 +2,8 @@ import json + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck @@ -12,57 +14,96 @@ class Fluentd(LoggingCheck): tags = ["health", "logging"] def run(self): - """Check various things and gather errors. Returns: result as hash""" + """Check the Fluentd deployment and raise an error if any problems are found.""" + + fluentd_pods = self.get_pods_for_component("fluentd") + self.check_fluentd(fluentd_pods) + return {} + + def check_fluentd(self, pods): + """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found.""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - fluentd_pods, error = super(Fluentd, self).get_pods_for_component( - self.logging_namespace, - "fluentd", + node_selector = self.get_var( + 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true' ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_fluentd(fluentd_pods) - if check_error: - msg = ("The following Fluentd deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} + nodes_by_name = self.get_nodes_by_name() + fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector) - # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Fluentd deployment.'} + errors = [] + errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) + errors += self.check_nodes_have_fluentd(pods, fluentd_nodes) + errors += self.check_fluentd_pods_running(pods) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + errors.append(OpenShiftCheckException( + 'TooManyFluentdPods', + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + )) + + if errors: + raise OpenShiftCheckExceptionList(errors) + + def get_nodes_by_name(self): + """Retrieve all the node definitions. Returns: dict(name: node)""" + nodes_json = self.exec_oc("get nodes -o json", []) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + raise OpenShiftCheckException( + "BadOcNodeList", + "Could not obtain a list of nodes to validate fluentd.\n" + "Output from oc get:\n" + nodes_json + ) + if not nodes or not nodes.get('items'): # also should not happen + raise OpenShiftCheckException( + "NoNodesDefined", + "No nodes appear to be defined according to the API." + ) + return { + node['metadata']['name']: node + for node in nodes['items'] + } @staticmethod - def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): - """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" + def filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node)""" label, value = node_selector.split('=', 1) fluentd_nodes = { name: node for name, node in nodes_by_name.items() if node['metadata']['labels'].get(label) == value } if not fluentd_nodes: - return None, ( + raise OpenShiftCheckException( + 'NoNodesLabeled', 'There are no nodes with the fluentd label {label}.\n' - 'This means no logs will be aggregated from the nodes.' - ).format(label=node_selector) - return fluentd_nodes, None + 'This means no logs will be aggregated from the nodes.'.format(label=node_selector) + ) + return fluentd_nodes - def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): - """Note if nodes are not labeled as expected. Returns: error string""" + def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): + """Note if nodes are not labeled as expected. Returns: error list""" intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all']) if not intended_nodes or '--all' in intended_nodes: intended_nodes = nodes_by_name.keys() nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) if nodes_missing_labels: - return ( + return [OpenShiftCheckException( + 'NodesUnlabeled', 'The following nodes are supposed to be labeled with {label} but are not:\n' ' {nodes}\n' - 'Fluentd will not aggregate logs from these nodes.' - ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) - return None + 'Fluentd will not aggregate logs from these nodes.'.format( + label=node_selector, nodes=', '.join(nodes_missing_labels) + ))] + + return [] @staticmethod - def _check_nodes_have_fluentd(pods, fluentd_nodes): - """Make sure fluentd is on all the labeled nodes. Returns: error string""" + def check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error list""" unmatched_nodes = fluentd_nodes.copy() node_names_by_label = { node['metadata']['labels']['kubernetes.io/hostname']: name @@ -82,80 +123,32 @@ class Fluentd(LoggingCheck): ]: unmatched_nodes.pop(name, None) if unmatched_nodes: - return ( + return [OpenShiftCheckException( + 'MissingFluentdPod', 'The following nodes are supposed to have a Fluentd pod but do not:\n' - '{nodes}' - 'These nodes will not have their logs aggregated.' - ).format(nodes=''.join( - " {}\n".format(name) - for name in unmatched_nodes.keys() - )) - return None + ' {nodes}\n' + 'These nodes will not have their logs aggregated.'.format( + nodes='\n '.join(unmatched_nodes.keys()) + ))] + + return [] - def _check_fluentd_pods_running(self, pods): + def check_fluentd_pods_running(self, pods): """Make sure all fluentd pods are running. Returns: error string""" not_running = super(Fluentd, self).not_running_pods(pods) if not_running: - return ( + return [OpenShiftCheckException( + 'FluentdNotRunning', 'The following Fluentd pods are supposed to be running but are not:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - )) - return None - - def check_fluentd(self, pods): - """Verify fluentd is running everywhere. Returns: error string""" - - node_selector = self.get_var( - 'openshift_logging_fluentd_nodeselector', - default='logging-infra-fluentd=true' - ) - - nodes_by_name, error = self.get_nodes_by_name() - - if error: - return error - fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) - if error: - return error - - error_msgs = [] - error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) - if error: - error_msgs.append(error) - error = self._check_nodes_have_fluentd(pods, fluentd_nodes) - if error: - error_msgs.append(error) - error = self._check_fluentd_pods_running(pods) - if error: - error_msgs.append(error) - - # Make sure there are no extra fluentd pods - if len(pods) > len(fluentd_nodes): - error_msgs.append( - 'There are more Fluentd pods running than nodes labeled.\n' - 'This may not cause problems with logging but it likely indicates something wrong.' - ) - - return '\n'.join(error_msgs) - - def get_nodes_by_name(self): - """Retrieve all the node definitions. Returns: dict(name: node), error string""" - nodes_json = self.exec_oc( - self.logging_namespace, - "get nodes -o json", - [] - ) - try: - nodes = json.loads(nodes_json) - except ValueError: # no valid json - should not happen - return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json - if not nodes or not nodes.get('items'): # also should not happen - return None, "No nodes appear to be defined according to the API." - return { - node['metadata']['name']: node - for node in nodes['items'] - }, None + ' {pods}\n' + 'These pods will not aggregate logs from their nodes.'.format( + pods='\n'.join( + " {name} ({host})".format( + name=pod['metadata']['name'], + host=pod['spec'].get('host', 'None') + ) + for pod in not_running + ) + ))] + + return [] diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py index 0970f0a63..d783e6760 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -24,7 +24,6 @@ class FluentdConfig(LoggingCheck): def run(self): """Check that Fluentd has running pods, and that its logging config matches Docker's logging config.""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) config_error = self.check_logging_config() if config_error: msg = ("The following Fluentd logging configuration problem was found:" @@ -120,19 +119,13 @@ class FluentdConfig(LoggingCheck): def running_fluentd_pods(self): """Return a list of running fluentd pods.""" - fluentd_pods, error = self.get_pods_for_component( - self.logging_namespace, - "fluentd", - ) - if error: - msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error) - raise OpenShiftCheckException(msg) + fluentd_pods = self.get_pods_for_component("fluentd") running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running'] if not running_fluentd_pods: - msg = ('No Fluentd pods were found to be in the "Running" state. ' - 'At least one Fluentd pod is required in order to perform this check.') - - raise OpenShiftCheckException(msg) + raise OpenShiftCheckException( + 'No Fluentd pods were found to be in the "Running" state. ' + 'At least one Fluentd pod is required in order to perform this check.' + ) return running_fluentd_pods diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index c600bb47e..3b1cf8baa 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -12,7 +12,7 @@ except ImportError: from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException class Kibana(LoggingCheck): @@ -24,25 +24,12 @@ class Kibana(LoggingCheck): def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - kibana_pods, error = self.get_pods_for_component( - self.logging_namespace, - "kibana", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_kibana(kibana_pods) - - if not check_error: - check_error = self._check_kibana_route() - - if check_error: - msg = ("The following Kibana deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + kibana_pods = self.get_pods_for_component("kibana") + self.check_kibana(kibana_pods) + self.check_kibana_route() # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Kibana deployment.'} + + return {} def _verify_url_internal(self, url): """ @@ -65,7 +52,7 @@ class Kibana(LoggingCheck): def _verify_url_external(url): """ Try to reach a URL from ansible control host. - Returns: success (bool), reason (for failure) + Raise an OpenShiftCheckException if anything goes wrong. """ # This actually checks from the ansible control host, which may or may not # really be "external" to the cluster. @@ -91,130 +78,149 @@ class Kibana(LoggingCheck): return None def check_kibana(self, pods): - """Check to see if Kibana is up and working. Returns: error string.""" + """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not.""" if not pods: - return "There are no Kibana pods deployed, so no access to the logging UI." + raise OpenShiftCheckException( + "MissingComponentPods", + "There are no Kibana pods deployed, so no access to the logging UI." + ) not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return "No Kibana pod is in a running state, so there is no access to the logging UI." + raise OpenShiftCheckException( + "NoRunningPods", + "No Kibana pod is in a running state, so there is no access to the logging UI." + ) elif not_running: - return ( + raise OpenShiftCheckException( + "PodNotRunning", "The following Kibana pods are not currently in a running state:\n" - "{pods}" - "However at least one is, so service may not be impacted." - ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running)) - - return None + " {pods}\n" + "However at least one is, so service may not be impacted.".format( + pods="\n ".join(pod['metadata']['name'] for pod in not_running) + ) + ) def _get_kibana_url(self): """ Get kibana route or report error. - Returns: url (or empty), reason for failure + Returns: url """ # Get logging url - get_route = self.exec_oc( - self.logging_namespace, - "get route logging-kibana -o json", - [], - ) + get_route = self.exec_oc("get route logging-kibana -o json", []) if not get_route: - return None, 'no_route_exists' + raise OpenShiftCheckException( + 'no_route_exists', + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ) - route = json.loads(get_route) + try: + route = json.loads(get_route) + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + except (ValueError, KeyError): + raise OpenShiftCheckException( + 'get_route_failed', + '"oc get route" returned an unexpected response:\n' + get_route + ) - # check that the route has been accepted by a router - ingress = route["status"]["ingress"] # ingress can be null if there is no router, or empty if not routed if not ingress or not ingress[0]: - return None, 'route_not_accepted' + raise OpenShiftCheckException( + 'route_not_accepted', + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ) host = route.get("spec", {}).get("host") if not host: - return None, 'route_missing_host' + raise OpenShiftCheckException( + 'route_missing_host', + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ) - return 'https://{}/'.format(host), None + return 'https://{}/'.format(host) - def _check_kibana_route(self): + def check_kibana_route(self): """ Check to see if kibana route is up and working. - Returns: error string + Raises exception if not. """ - known_errors = dict( - no_route_exists=( - 'No route is defined for Kibana in the logging namespace,\n' - 'so the logging stack is not accessible. Is logging deployed?\n' - 'Did something remove the logging-kibana route?' - ), - route_not_accepted=( - 'The logging-kibana route is not being routed by any router.\n' - 'Is the router deployed and working?' - ), - route_missing_host=( - 'The logging-kibana route has no hostname defined,\n' - 'which should never happen. Did something alter its definition?' - ), - ) - kibana_url, error = self._get_kibana_url() - if not kibana_url: - return known_errors.get(error, error) + kibana_url = self._get_kibana_url() # first, check that kibana is reachable from the master. error = self._verify_url_internal(kibana_url) if error: if 'urlopen error [Errno 111] Connection refused' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToConnectInternal', 'Failed to connect from this master to Kibana URL {url}\n' - 'Is kibana running, and is at least one router routing to it?' - ).format(url=kibana_url) + 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url) + ) elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToResolveInternal', 'Failed to connect from this master to Kibana URL {url}\n' 'because the hostname does not resolve.\n' - 'Is DNS configured for the Kibana hostname?' - ).format(url=kibana_url) + 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url) + ) elif 'Status code was not' in error: - error = ( + raise OpenShiftCheckException( + 'WrongReturnCodeInternal', 'A request from this master to the Kibana URL {url}\n' 'did not return the correct status code (302).\n' 'This could mean that Kibana is malfunctioning, the hostname is\n' 'resolving incorrectly, or other network issues. The output was:\n' - ' {error}' - ).format(url=kibana_url, error=error) - return 'Error validating the logging Kibana route:\n' + error + ' {error}'.format(url=kibana_url, error=error) + ) + raise OpenShiftCheckException( + 'MiscRouteErrorInternal', + 'Error validating the logging Kibana route internally:\n' + error + ) # in production we would like the kibana route to work from outside the # cluster too; but that may not be the case, so allow disabling just this part. - if not self.get_var("openshift_check_efk_kibana_external", default=True): - return None + if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true": + return error = self._verify_url_external(kibana_url) - if error: - if 'urlopen error [Errno 111] Connection refused' in error: - error = ( - 'Failed to connect from the Ansible control host to Kibana URL {url}\n' - 'Is the router for the Kibana hostname exposed externally?' - ).format(url=kibana_url) - elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( - 'Failed to resolve the Kibana hostname in {url}\n' - 'from the Ansible control host.\n' - 'Is DNS configured to resolve this Kibana hostname externally?' - ).format(url=kibana_url) - elif 'Expected success (200)' in error: - error = ( - 'A request to Kibana at {url}\n' - 'returned the wrong error code:\n' - ' {error}\n' - 'This could mean that Kibana is malfunctioning, the hostname is\n' - 'resolving incorrectly, or other network issues.' - ).format(url=kibana_url, error=error) - error = ( - 'Error validating the logging Kibana route:\n{error}\n' - 'To disable external Kibana route validation, set in your inventory:\n' - ' openshift_check_efk_kibana_external=False' - ).format(error=error) - return error - return None + + if not error: + return + + error_fmt = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set the variable:\n' + ' openshift_check_efk_kibana_external=False' + ) + if 'urlopen error [Errno 111] Connection refused' in error: + msg = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg)) + elif 'urlopen error [Errno -2] Name or service not known' in error: + msg = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg)) + elif 'Expected success (200)' in error: + msg = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg)) + raise OpenShiftCheckException( + 'MiscRouteError', + 'Error validating the logging Kibana route externally:\n' + error + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 43ba6c406..3b7c39760 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -8,6 +8,16 @@ import os from openshift_checks import OpenShiftCheck, OpenShiftCheckException +class MissingComponentPods(OpenShiftCheckException): + """Raised when a component has no pods in the namespace.""" + pass + + +class CouldNotUseOc(OpenShiftCheckException): + """Raised when ocutil has a failure running oc.""" + pass + + class LoggingCheck(OpenShiftCheck): """Base class for OpenShift aggregated logging component checks""" @@ -15,7 +25,6 @@ class LoggingCheck(OpenShiftCheck): # run by itself. name = "logging" - logging_namespace = "logging" def is_active(self): logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) @@ -32,22 +41,24 @@ class LoggingCheck(OpenShiftCheck): def run(self): return {} - def get_pods_for_component(self, namespace, logging_component): - """Get all pods for a given component. Returns: list of pods for component, error string""" + def get_pods_for_component(self, logging_component): + """Get all pods for a given component. Returns: list of pods.""" pod_output = self.exec_oc( - namespace, "get pods -l component={} -o json".format(logging_component), [], ) try: - pods = json.loads(pod_output) - if not pods or not pods.get('items'): + pods = json.loads(pod_output) # raises ValueError if deserialize fails + if not pods or not pods.get('items'): # also a broken response, treat the same raise ValueError() except ValueError: - # successful run but non-parsing data generally means there were no pods in the namespace - return None, 'No pods were found for the "{}" logging component.'.format(logging_component) + # successful run but non-parsing data generally means there were no pods to be found + raise MissingComponentPods( + 'There are no "{}" component pods in the "{}" namespace.\n' + 'Is logging deployed?'.format(logging_component, self.logging_namespace()) + ) - return pods['items'], None + return pods['items'] @staticmethod def not_running_pods(pods): @@ -63,15 +74,19 @@ class LoggingCheck(OpenShiftCheck): ) ] - def exec_oc(self, namespace="logging", cmd_str="", extra_args=None): + def logging_namespace(self): + """Returns the namespace in which logging is configured to deploy.""" + return self.get_var("openshift_logging_namespace", default="logging") + + def exec_oc(self, cmd_str="", extra_args=None): """ Execute an 'oc' command in the remote host. Returns: output of command and namespace, - or raises OpenShiftCheckException on error + or raises CouldNotUseOc on error """ config_base = self.get_var("openshift", "common", "config_base") args = { - "namespace": namespace, + "namespace": self.logging_namespace(), "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), "cmd": cmd_str, "extra_args": list(extra_args) if extra_args else [], @@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck): result = self.execute_module("ocutil", args) if result.get("failed"): - msg = ( - 'Unexpected error using `oc` to validate the logging stack components.\n' - 'Error executing `oc {cmd}`:\n' - '{error}' - ).format(cmd=args['cmd'], error=result['result']) - if result['result'] == '[Errno 2] No such file or directory': - msg = ( + raise CouldNotUseOc( "This host is supposed to be a master but does not have the `oc` command where expected.\n" "Has an installation been run on this host yet?" ) - raise OpenShiftCheckException(msg) + + raise CouldNotUseOc( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}'.format(cmd=args['cmd'], error=result['result']) + ) return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py index b24e88e05..d781db649 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck): name = "logging_index_time" tags = ["health", "logging"] - logging_namespace = "logging" - def run(self): """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.""" try: @@ -28,29 +26,25 @@ class LoggingIndexTime(LoggingCheck): self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS) ) except ValueError: - return { - "failed": True, - "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' - 'Value must be an integer representing an amount in seconds.'), - } + raise OpenShiftCheckException( + 'InvalidTimeout', + 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' + 'Value must be an integer representing an amount in seconds.' + ) running_component_pods = dict() # get all component pods - self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): - pods, error = self.get_pods_for_component(self.logging_namespace, component) - - if error: - msg = 'Unable to retrieve pods for the {} logging component: {}' - return {"failed": True, "changed": False, "msg": msg.format(name, error)} - + pods = self.get_pods_for_component(component) running_pods = self.running_pods(pods) if not running_pods: - msg = ('No {} pods in the "Running" state were found.' - 'At least one pod is required in order to perform this check.') - return {"failed": True, "changed": False, "msg": msg.format(name)} + raise OpenShiftCheckException( + component + 'NoRunningPods', + 'No {} pods in the "Running" state were found.' + 'At least one pod is required in order to perform this check.'.format(name) + ) running_component_pods[component] = running_pods @@ -65,8 +59,11 @@ class LoggingIndexTime(LoggingCheck): interval = 1 while not self.query_es_from_es(es_pod, uuid): if time.time() + interval > deadline: - msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s." - raise OpenShiftCheckException(msg.format(uuid, timeout_secs)) + raise OpenShiftCheckException( + "NoMatchFound", + "expecting match in Elasticsearch for message with uuid {}, " + "but no matches were found after {}s.".format(uuid, timeout_secs) + ) time.sleep(interval) def curl_kibana_with_uuid(self, kibana_pod): @@ -76,22 +73,23 @@ class LoggingIndexTime(LoggingCheck): exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) - error_str = self.exec_oc(self.logging_namespace, exec_cmd, []) + error_str = self.exec_oc(exec_cmd, []) try: error_code = json.loads(error_str)["statusCode"] - except KeyError: - msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) - except ValueError: - msg = ('invalid response returned from Kibana request (Non-JSON output):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'kibanaInvalidResponse', + 'invalid response returned from Kibana request:\n' + 'Command: {}\nResponse: {}'.format(exec_cmd, error_str) + ) if error_code != 404: - msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.' - raise OpenShiftCheckException(msg.format(error_code)) + raise OpenShiftCheckException( + 'kibanaInvalidReturnCode', + 'invalid error code returned from Kibana request.\n' + 'Expecting error code "404", but got "{}" instead.'.format(error_code) + ) return uuid @@ -105,17 +103,18 @@ class LoggingIndexTime(LoggingCheck): "--key /etc/elasticsearch/secret/admin-key " "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" ) - exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid) - result = self.exec_oc(self.logging_namespace, exec_cmd, []) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) + result = self.exec_oc(exec_cmd, []) try: count = json.loads(result)["count"] - except KeyError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) - except ValueError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'esInvalidResponse', + 'Invalid response from Elasticsearch query:\n' + ' {}\n' + 'Response was:\n{}'.format(exec_cmd, result) + ) return count diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py index ae108c96e..62c680b74 100644 --- a/roles/openshift_health_checker/test/curator_test.py +++ b/roles/openshift_health_checker/test/curator_test.py @@ -1,22 +1,6 @@ import pytest -from openshift_checks.logging.curator import Curator - - -def canned_curator(exec_oc=None): - """Create a Curator check object with canned exec_oc method""" - check = Curator("dummy") # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +from openshift_checks.logging.curator import Curator, OpenShiftCheckException plain_curator_pod = { @@ -44,25 +28,30 @@ not_running_curator_pod = { } +def test_get_curator_pods(): + check = Curator() + check.get_pods_for_component = lambda *_: [plain_curator_pod] + result = check.run() + assert "failed" not in result or not result["failed"] + + @pytest.mark.parametrize('pods, expect_error', [ ( [], - "no Curator pods", - ), - ( - [plain_curator_pod], - None, + 'MissingComponentPods', ), ( [not_running_curator_pod], - "not currently in a running state", + 'CuratorNotRunning', ), ( [plain_curator_pod, plain_curator_pod], - "more than one Curator pod", + 'TooManyCurators', ), ]) -def test_get_curator_pods(pods, expect_error): - check = canned_curator() - error = check.check_curator(pods) - assert_error(error, expect_error) +def test_get_curator_pods_fail(pods, expect_error): + check = Curator() + check.get_pods_for_component = lambda *_: pods + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run() + assert excinfo.value.name == expect_error diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py index 67408609a..09bacd9ac 100644 --- a/roles/openshift_health_checker/test/elasticsearch_test.py +++ b/roles/openshift_health_checker/test/elasticsearch_test.py @@ -1,17 +1,26 @@ import pytest import json -from openshift_checks.logging.elasticsearch import Elasticsearch +from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList + task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +def canned_elasticsearch(task_vars=None, exec_oc=None): + """Create an Elasticsearch check object with stubbed exec_oc method""" + check = Elasticsearch(None, task_vars or {}) + if exec_oc: + check.exec_oc = exec_oc + return check + + +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) + + +def pods_by_name(pods): + return {pod['metadata']['name']: pod for pod in pods} plain_es_pod = { @@ -19,6 +28,7 @@ plain_es_pod = { "labels": {"component": "es", "deploymentconfig": "logging-es"}, "name": "logging-es", }, + "spec": {}, "status": { "conditions": [{"status": "True", "type": "Ready"}], "containerStatuses": [{"ready": True}], @@ -32,6 +42,7 @@ split_es_pod = { "labels": {"component": "es", "deploymentconfig": "logging-es-2"}, "name": "logging-es-2", }, + "spec": {}, "status": { "conditions": [{"status": "True", "type": "Ready"}], "containerStatuses": [{"ready": True}], @@ -40,12 +51,28 @@ split_es_pod = { "_test_master_name_str": "name logging-es-2", } +unready_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-3"}, + "name": "logging-es-3", + }, + "spec": {}, + "status": { + "conditions": [{"status": "False", "type": "Ready"}], + "containerStatuses": [{"ready": False}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "BAD_NAME_RESPONSE", +} + def test_check_elasticsearch(): - assert 'No logging Elasticsearch pods' in Elasticsearch().check_elasticsearch([]) + with pytest.raises(OpenShiftCheckExceptionList) as excinfo: + canned_elasticsearch().check_elasticsearch([]) + assert_error_in_list('NoRunningPods', excinfo.value) # canned oc responses to match so all the checks pass - def _exec_oc(ns, cmd, args): + def exec_oc(cmd, args): if '_cat/master' in cmd: return 'name logging-es' elif '/_nodes' in cmd: @@ -57,35 +84,41 @@ def test_check_elasticsearch(): else: raise Exception(cmd) - check = Elasticsearch(None, {}) - check.exec_oc = _exec_oc - assert not check.check_elasticsearch([plain_es_pod]) + check = canned_elasticsearch({}, exec_oc) + check.get_pods_for_component = lambda *_: [plain_es_pod] + assert {} == check.run() -def pods_by_name(pods): - return {pod['metadata']['name']: pod for pod in pods} +def test_check_running_es_pods(): + pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod]) + assert plain_es_pod in pods + assert_error_in_list('PodNotRunning', errors) + + +def test_check_elasticsearch_masters(): + pods = [plain_es_pod] + check = canned_elasticsearch(task_vars_config_base, lambda *_: plain_es_pod['_test_master_name_str']) + assert not check.check_elasticsearch_masters(pods_by_name(pods)) @pytest.mark.parametrize('pods, expect_error', [ ( [], - 'No logging Elasticsearch masters', + 'NoMasterFound', ), ( - [plain_es_pod], - None, + [unready_es_pod], + 'NoMasterName', ), ( [plain_es_pod, split_es_pod], - 'Found multiple Elasticsearch masters', + 'SplitBrainMasters', ), ]) -def test_check_elasticsearch_masters(pods, expect_error): +def test_check_elasticsearch_masters_error(pods, expect_error): test_pods = list(pods) - check = Elasticsearch(None, task_vars_config_base) - check.execute_module = lambda cmd, args: {'result': test_pods.pop(0)['_test_master_name_str']} - errors = check._check_elasticsearch_masters(pods_by_name(pods)) - assert_error(''.join(errors), expect_error) + check = canned_elasticsearch(task_vars_config_base, lambda *_: test_pods.pop(0)['_test_master_name_str']) + assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods))) es_node_list = { @@ -95,83 +128,76 @@ es_node_list = { }}} +def test_check_elasticsearch_node_list(): + check = canned_elasticsearch(task_vars_config_base, lambda *_: json.dumps(es_node_list)) + assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod])) + + @pytest.mark.parametrize('pods, node_list, expect_error', [ ( [], {}, - 'No logging Elasticsearch masters', - ), - ( - [plain_es_pod], - es_node_list, - None, + 'MissingComponentPods', ), ( [plain_es_pod], {}, # empty list of nodes triggers KeyError - "Failed to query", + 'MissingNodeList', ), ( [split_es_pod], es_node_list, - 'does not correspond to any known ES pod', + 'EsPodNodeMismatch', ), ]) -def test_check_elasticsearch_node_list(pods, node_list, expect_error): - check = Elasticsearch(None, task_vars_config_base) - check.execute_module = lambda cmd, args: {'result': json.dumps(node_list)} +def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error): + check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: json.dumps(node_list)) + assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods))) - errors = check._check_elasticsearch_node_list(pods_by_name(pods)) - assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_cluster_health(): + test_health_data = [{"status": "green"}] + check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) + assert not check.check_es_cluster_health(pods_by_name([plain_es_pod])) @pytest.mark.parametrize('pods, health_data, expect_error', [ - ( - [plain_es_pod], - [{"status": "green"}], - None, - ), ( [plain_es_pod], [{"no-status": "should bomb"}], - 'Could not retrieve cluster health status', + 'BadEsResponse', ), ( [plain_es_pod, split_es_pod], [{"status": "green"}, {"status": "red"}], - 'Elasticsearch cluster health status is RED', + 'EsClusterHealthRed', ), ]) -def test_check_elasticsearch_cluster_health(pods, health_data, expect_error): +def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error): test_health_data = list(health_data) - check = Elasticsearch(None, task_vars_config_base) - check.execute_module = lambda cmd, args: {'result': json.dumps(test_health_data.pop(0))} + check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) + assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods))) - errors = check._check_es_cluster_health(pods_by_name(pods)) - assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_diskspace(): + check = canned_elasticsearch(exec_oc=lambda *_: 'IUse% Use%\n 3% 4%\n') + assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) @pytest.mark.parametrize('disk_data, expect_error', [ ( 'df: /elasticsearch/persistent: No such file or directory\n', - 'Could not retrieve storage usage', - ), - ( - 'IUse% Use%\n 3% 4%\n', - None, + 'BadDfResponse', ), ( 'IUse% Use%\n 95% 40%\n', - 'Inode percent usage on the storage volume', + 'InodeUsageTooHigh', ), ( 'IUse% Use%\n 3% 94%\n', - 'Disk percent usage on the storage volume', + 'DiskUsageTooHigh', ), ]) -def test_check_elasticsearch_diskspace(disk_data, expect_error): - check = Elasticsearch(None, task_vars_config_base) - check.execute_module = lambda cmd, args: {'result': disk_data} - - errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) - assert_error(''.join(errors), expect_error) +def test_check_elasticsearch_diskspace_errors(disk_data, expect_error): + check = canned_elasticsearch(exec_oc=lambda *_: disk_data) + assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))) diff --git a/roles/openshift_health_checker/test/fluentd_config_test.py b/roles/openshift_health_checker/test/fluentd_config_test.py index 8a2d8b72b..10db253bc 100644 --- a/roles/openshift_health_checker/test/fluentd_config_test.py +++ b/roles/openshift_health_checker/test/fluentd_config_test.py @@ -198,12 +198,9 @@ def test_check_logging_config_master(name, pods, logging_driver, extra_words): ), ) - def get_pods(namespace, logging_component): - return pods, None - check = FluentdConfig(execute_module, task_vars) check.execute_module = execute_module - check.get_pods_for_component = get_pods + check.get_pods_for_component = lambda _: pods error = check.check_logging_config() assert error is None @@ -283,12 +280,9 @@ def test_check_logging_config_master_failed(name, pods, logging_driver, words): ), ) - def get_pods(namespace, logging_component): - return pods, None - check = FluentdConfig(execute_module, task_vars) check.execute_module = execute_module - check.get_pods_for_component = get_pods + check.get_pods_for_component = lambda _: pods error = check.check_logging_config() assert error is not None @@ -343,11 +337,8 @@ def test_check_logging_config_master_fails_on_unscheduled_deployment(name, pods, ), ) - def get_pods(namespace, logging_component): - return pods, None - check = FluentdConfig(execute_module, task_vars) - check.get_pods_for_component = get_pods + check.get_pods_for_component = lambda _: pods with pytest.raises(OpenShiftCheckException) as error: check.check_logging_config() diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py index a84d89cef..e7bf9818b 100644 --- a/roles/openshift_health_checker/test/fluentd_test.py +++ b/roles/openshift_health_checker/test/fluentd_test.py @@ -1,15 +1,11 @@ import pytest import json -from openshift_checks.logging.fluentd import Fluentd +from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) fluentd_pod_node1 = { @@ -57,45 +53,60 @@ fluentd_node3_unlabeled = { } +def test_get_fluentd_pods(): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1])) + check.get_pods_for_component = lambda *_: [fluentd_pod_node1] + assert not check.run() + + @pytest.mark.parametrize('pods, nodes, expect_error', [ ( [], [], - 'No nodes appear to be defined', + 'NoNodesDefined', ), ( [], [fluentd_node3_unlabeled], - 'There are no nodes with the fluentd label', + 'NoNodesLabeled', ), ( [], [fluentd_node1, fluentd_node3_unlabeled], - 'Fluentd will not aggregate logs from these nodes.', + 'NodesUnlabeled', ), ( [], [fluentd_node2], - "nodes are supposed to have a Fluentd pod but do not", + 'MissingFluentdPod', ), ( [fluentd_pod_node1, fluentd_pod_node1], [fluentd_node1], - 'more Fluentd pods running than nodes labeled', + 'TooManyFluentdPods', ), ( [fluentd_pod_node2_down], [fluentd_node2], - "Fluentd pods are supposed to be running", - ), - ( - [fluentd_pod_node1], - [fluentd_node1], - None, + 'FluentdNotRunning', ), ]) -def test_get_fluentd_pods(pods, nodes, expect_error): +def test_get_fluentd_pods_errors(pods, nodes, expect_error): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=nodes)) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_fluentd(pods) + if isinstance(excinfo.value, OpenShiftCheckExceptionList): + assert_error_in_list(expect_error, excinfo.value) + else: + assert expect_error == excinfo.value.name + + +def test_bad_oc_node_list(): check = Fluentd() - check.exec_oc = lambda ns, cmd, args: json.dumps(dict(items=nodes)) - error = check.check_fluentd(pods) - assert_error(error, expect_error) + check.exec_oc = lambda *_: "this isn't even json" + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_nodes_by_name() + assert 'BadOcNodeList' == excinfo.value.name diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py index 0bf492511..04a5e89c4 100644 --- a/roles/openshift_health_checker/test/kibana_test.py +++ b/roles/openshift_health_checker/test/kibana_test.py @@ -8,15 +8,7 @@ except ImportError: from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks.logging.kibana import Kibana - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException plain_kibana_pod = { @@ -41,39 +33,45 @@ not_running_kibana_pod = { } +def test_check_kibana(): + # should run without exception: + Kibana().check_kibana([plain_kibana_pod]) + + @pytest.mark.parametrize('pods, expect_error', [ ( [], - "There are no Kibana pods deployed", - ), - ( - [plain_kibana_pod], - None, + "MissingComponentPods", ), ( [not_running_kibana_pod], - "No Kibana pod is in a running state", + "NoRunningPods", ), ( [plain_kibana_pod, not_running_kibana_pod], - "The following Kibana pods are not currently in a running state", + "PodNotRunning", ), ]) -def test_check_kibana(pods, expect_error): - check = Kibana() - error = check.check_kibana(pods) - assert_error(error, expect_error) +def test_check_kibana_error(pods, expect_error): + with pytest.raises(OpenShiftCheckException) as excinfo: + Kibana().check_kibana(pods) + assert expect_error == excinfo.value.name -@pytest.mark.parametrize('route, expect_url, expect_error', [ +@pytest.mark.parametrize('comment, route, expect_error', [ ( + "No route returned", None, - None, - 'no_route_exists', + "no_route_exists", ), - # test route with no ingress ( + "broken route response", + {"status": {}}, + "get_route_failed", + ), + ( + "route with no ingress", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -86,12 +84,11 @@ def test_check_kibana(pods, expect_error): "host": "hostname", } }, - None, - 'route_not_accepted', + "route_not_accepted", ), - # test route with no host ( + "route with no host", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -104,12 +101,21 @@ def test_check_kibana(pods, expect_error): }, "spec": {}, }, - None, - 'route_missing_host', + "route_missing_host", ), +]) +def test_get_kibana_url_error(comment, route, expect_error): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) if route else "" + + with pytest.raises(OpenShiftCheckException) as excinfo: + check._get_kibana_url() + assert excinfo.value.name == expect_error - # test route that looks fine + +@pytest.mark.parametrize('comment, route, expect_url', [ ( + "test route that looks fine", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -125,62 +131,57 @@ def test_check_kibana(pods, expect_error): }, }, "https://hostname/", - None, ), ]) -def test_get_kibana_url(route, expect_url, expect_error): +def test_get_kibana_url(comment, route, expect_url): check = Kibana() - check.exec_oc = lambda ns, cmd, args: json.dumps(route) if route else "" - - url, error = check._get_kibana_url() - if expect_url: - assert url == expect_url - else: - assert not url - if expect_error: - assert error == expect_error - else: - assert not error + check.exec_oc = lambda *_: json.dumps(route) + assert expect_url == check._get_kibana_url() @pytest.mark.parametrize('exec_result, expect', [ ( 'urlopen error [Errno 111] Connection refused', - 'at least one router routing to it?', + 'FailedToConnectInternal', ), ( 'urlopen error [Errno -2] Name or service not known', - 'DNS configured for the Kibana hostname?', + 'FailedToResolveInternal', ), ( 'Status code was not [302]: HTTP Error 500: Server error', - 'did not return the correct status code', + 'WrongReturnCodeInternal', ), ( 'bork bork bork', - 'bork bork bork', # should pass through + 'MiscRouteErrorInternal', ), ]) def test_verify_url_internal_failure(exec_result, expect): check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result)) - check._get_kibana_url = lambda: ('url', None) + check._get_kibana_url = lambda: 'url' - error = check._check_kibana_route() - assert_error(error, expect) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name @pytest.mark.parametrize('lib_result, expect', [ ( - HTTPError('url', 500, "it broke", hdrs=None, fp=None), - 'it broke', + HTTPError('url', 500, 'it broke', hdrs=None, fp=None), + 'MiscRouteError', + ), + ( + URLError('urlopen error [Errno 111] Connection refused'), + 'FailedToConnect', ), ( - URLError('it broke'), - 'it broke', + URLError('urlopen error [Errno -2] Name or service not known'), + 'FailedToResolve', ), ( 302, - 'returned the wrong error code', + 'WrongReturnCode', ), ( 200, @@ -204,8 +205,40 @@ def test_verify_url_external_failure(lib_result, expect, monkeypatch): monkeypatch.setattr(urllib2, 'urlopen', urlopen) check = Kibana() - check._get_kibana_url = lambda: ('url', None) + check._get_kibana_url = lambda: 'url' check._verify_url_internal = lambda url: None - error = check._check_kibana_route() - assert_error(error, expect) + if not expect: + check.check_kibana_route() + return + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name + + +def test_verify_url_external_skip(): + check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false")) + check._get_kibana_url = lambda: 'url' + check.check_kibana_route() + + +# this is kind of silly but it adds coverage for the run() method... +def test_run(): + pods = ["foo"] + ran = dict(check_kibana=False, check_route=False) + + def check_kibana(pod_list): + ran["check_kibana"] = True + assert pod_list == pods + + def check_kibana_route(): + ran["check_route"] = True + + check = Kibana() + check.get_pods_for_component = lambda *_: pods + check.check_kibana = check_kibana + check.check_kibana_route = check_kibana_route + + check.run() + assert ran["check_kibana"] and ran["check_route"] diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py index 6f1697ee6..1a1c190f6 100644 --- a/roles/openshift_health_checker/test/logging_check_test.py +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -1,18 +1,14 @@ import pytest import json -from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -logging_namespace = "logging" - - -def canned_loggingcheck(exec_oc=None): +def canned_loggingcheck(exec_oc=None, execute_module=None): """Create a LoggingCheck object with canned exec_oc method""" - check = LoggingCheck() # fails if a module is actually invoked - check.logging_namespace = 'logging' + check = LoggingCheck(execute_module) if exec_oc: check.exec_oc = exec_oc return check @@ -97,8 +93,8 @@ def test_oc_failure(problem, expect): check = LoggingCheck(execute_module, task_vars_config_base) - with pytest.raises(OpenShiftCheckException) as excinfo: - check.exec_oc(logging_namespace, 'get foo', []) + with pytest.raises(CouldNotUseOc) as excinfo: + check.exec_oc('get foo', []) assert expect in str(excinfo) @@ -124,25 +120,32 @@ def test_is_active(groups, logging_deployed, is_active): assert LoggingCheck(None, task_vars).is_active() == is_active -@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [ +@pytest.mark.parametrize('pod_output, expect_pods', [ + ( + json.dumps({'items': [plain_es_pod]}), + [plain_es_pod], + ), +]) +def test_get_pods_for_component(pod_output, expect_pods): + check = canned_loggingcheck(lambda *_: pod_output) + pods = check.get_pods_for_component("es") + assert pods == expect_pods + + +@pytest.mark.parametrize('exec_oc_output, expect_error', [ ( 'No resources found.', - None, - 'No pods were found for the "es"', + MissingComponentPods, ), ( - json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}), - [plain_es_pod], - None, + '{"items": null}', + MissingComponentPods, ), ]) -def test_get_pods_for_component(pod_output, expect_pods, expect_error): - check = canned_loggingcheck(lambda namespace, cmd, args: pod_output) - pods, error = check.get_pods_for_component( - logging_namespace, - "es", - ) - assert_error(error, expect_error) +def test_get_pods_for_component_fail(exec_oc_output, expect_error): + check = canned_loggingcheck(lambda *_: exec_oc_output) + with pytest.raises(expect_error): + check.get_pods_for_component("es") @pytest.mark.parametrize('name, pods, expected_pods', [ @@ -159,7 +162,7 @@ def test_get_pods_for_component(pod_output, expect_pods, expect_error): ], ids=lambda argvals: argvals[0]) def test_get_not_running_pods_no_container_status(name, pods, expected_pods): - check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: '') + check = canned_loggingcheck(lambda *_: '') result = check.not_running_pods(pods) assert result == expected_pods diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py index 178d7cd84..22566b295 100644 --- a/roles/openshift_health_checker/test/logging_index_time_test.py +++ b/roles/openshift_health_checker/test/logging_index_time_test.py @@ -69,7 +69,29 @@ def test_check_running_pods(pods, expect_pods): assert pods == expect_pods -@pytest.mark.parametrize('name, json_response, uuid, timeout, extra_words', [ +def test_bad_config_param(): + with pytest.raises(OpenShiftCheckException) as error: + LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run() + assert 'InvalidTimeout' == error.value.name + + +def test_no_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [not_running_kibana_pod] + with pytest.raises(OpenShiftCheckException) as error: + check.run() + assert 'kibanaNoRunningPods' == error.value.name + + +def test_with_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod] + check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID + check.wait_until_cmd_or_err = lambda *_: None + assert not check.run().get("failed") + + +@pytest.mark.parametrize('name, json_response, uuid, timeout', [ ( 'valid count in response', { @@ -77,94 +99,72 @@ def test_check_running_pods(pods, expect_pods): }, SAMPLE_UUID, 0.001, - [], ), ], ids=lambda argval: argval[0]) -def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout, extra_words): +def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout): check = canned_loggingindextime(lambda *_: json.dumps(json_response)) check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) -@pytest.mark.parametrize('name, json_response, uuid, timeout, extra_words', [ +@pytest.mark.parametrize('name, json_response, timeout, expect_error', [ ( 'invalid json response', { "invalid_field": 1, }, - SAMPLE_UUID, 0.001, - ["invalid response", "Elasticsearch"], + 'esInvalidResponse', ), ( 'empty response', {}, - SAMPLE_UUID, 0.001, - ["invalid response", "Elasticsearch"], + 'esInvalidResponse', ), ( 'valid response but invalid match count', { "count": 0, }, - SAMPLE_UUID, 0.005, - ["expecting match", SAMPLE_UUID, "0.005s"], + 'NoMatchFound', ) ], ids=lambda argval: argval[0]) -def test_wait_until_cmd_or_err(name, json_response, uuid, timeout, extra_words): +def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error): check = canned_loggingindextime(lambda *_: json.dumps(json_response)) with pytest.raises(OpenShiftCheckException) as error: - check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) + check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout) - for word in extra_words: - assert word in str(error) + assert expect_error == error.value.name -@pytest.mark.parametrize('name, json_response, uuid, extra_words', [ - ( - 'correct response code, found unique id is returned', - { - "statusCode": 404, - }, - "sample unique id", - ["sample unique id"], - ), -], ids=lambda argval: argval[0]) -def test_curl_kibana_with_uuid(name, json_response, uuid, extra_words): - check = canned_loggingindextime(lambda *_: json.dumps(json_response)) - check.generate_uuid = lambda: uuid - - result = check.curl_kibana_with_uuid(plain_running_kibana_pod) - - for word in extra_words: - assert word in result +def test_curl_kibana_with_uuid(): + check = canned_loggingindextime(lambda *_: json.dumps({"statusCode": 404})) + check.generate_uuid = lambda: SAMPLE_UUID + assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod) -@pytest.mark.parametrize('name, json_response, uuid, extra_words', [ +@pytest.mark.parametrize('name, json_response, expect_error', [ ( 'invalid json response', { "invalid_field": "invalid", }, - SAMPLE_UUID, - ["invalid response returned", 'Missing "statusCode" key'], + 'kibanaInvalidResponse', ), ( 'wrong error code in response', { "statusCode": 500, }, - SAMPLE_UUID, - ["Expecting error code", "500"], + 'kibanaInvalidReturnCode', ), ], ids=lambda argval: argval[0]) -def test_failed_curl_kibana_with_uuid(name, json_response, uuid, extra_words): +def test_failed_curl_kibana_with_uuid(name, json_response, expect_error): check = canned_loggingindextime(lambda *_: json.dumps(json_response)) - check.generate_uuid = lambda: uuid + check.generate_uuid = lambda: SAMPLE_UUID with pytest.raises(OpenShiftCheckException) as error: check.curl_kibana_with_uuid(plain_running_kibana_pod) - for word in extra_words: - assert word in str(error) + assert expect_error == error.value.name -- cgit v1.2.3