Diffstat (limited to 'roles/openshift_health_checker')
-rw-r--r--  roles/openshift_health_checker/action_plugins/openshift_health_check.py  40
-rw-r--r--  roles/openshift_health_checker/callback_plugins/zz_failure_summary.py  38
-rw-r--r-- [-rwxr-xr-x]  roles/openshift_health_checker/library/aos_version.py  6
-rw-r--r-- [-rwxr-xr-x]  roles/openshift_health_checker/library/check_yum_update.py  0
-rw-r--r--  roles/openshift_health_checker/library/docker_info.py  2
-rw-r--r--  roles/openshift_health_checker/library/rpm_version.py  14
-rw-r--r--  roles/openshift_health_checker/library/search_journalctl.py  150
-rw-r--r--  roles/openshift_health_checker/meta/main.yml  4
-rw-r--r--  roles/openshift_health_checker/openshift_checks/__init__.py  203
-rw-r--r--  roles/openshift_health_checker/openshift_checks/disk_availability.py  84
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_image_availability.py  80
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_storage.py  173
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py  46
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_traffic.py  44
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_volume.py  41
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/curator.py  48
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py  199
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd.py  208
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py  131
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/kibana.py  225
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging.py  89
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py  129
-rw-r--r--  roles/openshift_health_checker/openshift_checks/memory_availability.py  21
-rw-r--r--  roles/openshift_health_checker/openshift_checks/mixins.py  33
-rw-r--r--  roles/openshift_health_checker/openshift_checks/ovs_version.py  54
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_availability.py  21
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_update.py  8
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_version.py  125
-rw-r--r--  roles/openshift_health_checker/test/action_plugin_test.py  31
-rw-r--r--  roles/openshift_health_checker/test/curator_test.py  45
-rw-r--r--  roles/openshift_health_checker/test/disk_availability_test.py  108
-rw-r--r--  roles/openshift_health_checker/test/docker_image_availability_test.py  37
-rw-r--r--  roles/openshift_health_checker/test/docker_storage_test.py  125
-rw-r--r--  roles/openshift_health_checker/test/elasticsearch_test.py  143
-rw-r--r--  roles/openshift_health_checker/test/etcd_imagedata_size_test.py  25
-rw-r--r--  roles/openshift_health_checker/test/etcd_traffic_test.py  72
-rw-r--r--  roles/openshift_health_checker/test/etcd_volume_test.py  14
-rw-r--r--  roles/openshift_health_checker/test/fluentd_config_test.py  348
-rw-r--r--  roles/openshift_health_checker/test/fluentd_test.py  63
-rw-r--r--  roles/openshift_health_checker/test/kibana_test.py  164
-rw-r--r--  roles/openshift_health_checker/test/logging_check_test.py  57
-rw-r--r--  roles/openshift_health_checker/test/logging_index_time_test.py  170
-rw-r--r--  roles/openshift_health_checker/test/memory_availability_test.py  8
-rw-r--r--  roles/openshift_health_checker/test/mixins_test.py  4
-rw-r--r--  roles/openshift_health_checker/test/openshift_check_test.py  65
-rw-r--r--  roles/openshift_health_checker/test/ovs_version_test.py  21
-rw-r--r--  roles/openshift_health_checker/test/package_availability_test.py  7
-rw-r--r--  roles/openshift_health_checker/test/package_update_test.py  5
-rw-r--r--  roles/openshift_health_checker/test/package_version_test.py  74
-rw-r--r--  roles/openshift_health_checker/test/rpm_version_test.py  6
-rw-r--r--  roles/openshift_health_checker/test/search_journalctl_test.py  157
51 files changed, 2777 insertions(+), 1188 deletions(-)
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
index 0390dc82e..05e53333d 100644
--- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -13,6 +13,7 @@ except ImportError:
display = Display()
from ansible.plugins.action import ActionBase
+from ansible.module_utils.six import string_types
# Augment sys.path so that we can import checks from a directory relative to
# this callback plugin.
@@ -37,9 +38,10 @@ class ActionModule(ActionBase):
return result
try:
- known_checks = self.load_known_checks()
+ known_checks = self.load_known_checks(tmp, task_vars)
args = self._task.args
- resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
+ requested_checks = normalize(args.get('checks', []))
+ resolved_checks = resolve_checks(requested_checks, known_checks.values())
except OpenShiftCheckException as e:
result["failed"] = True
result["msg"] = str(e)
@@ -47,38 +49,37 @@ class ActionModule(ActionBase):
result["checks"] = check_results = {}
- user_disabled_checks = [
- check.strip()
- for check in task_vars.get("openshift_disable_check", "").split(",")
- ]
+ user_disabled_checks = normalize(task_vars.get('openshift_disable_check', []))
for check_name in resolved_checks:
display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"]))
check = known_checks[check_name]
- if not check.is_active(task_vars):
+ if not check.is_active():
r = dict(skipped=True, skipped_reason="Not active for this host")
elif check_name in user_disabled_checks:
r = dict(skipped=True, skipped_reason="Disabled by user request")
else:
try:
- r = check.run(tmp, task_vars)
+ r = check.run()
except OpenShiftCheckException as e:
r = dict(
failed=True,
msg=str(e),
)
+ if check.changed:
+ r["changed"] = True
check_results[check_name] = r
- if r.get("failed", False):
- result["failed"] = True
- result["msg"] = "One or more checks failed"
+ result["changed"] = any(r.get("changed") for r in check_results.values())
+ if any(r.get("failed") for r in check_results.values()):
+ result["failed"] = True
+ result["msg"] = "One or more checks failed"
- result["changed"] = any(r.get("changed", False) for r in check_results.values())
return result
- def load_known_checks(self):
+ def load_known_checks(self, tmp, task_vars):
load_checks()
known_checks = {}
@@ -91,7 +92,7 @@ class ActionModule(ActionBase):
check_name,
cls.__module__, cls.__name__,
other_cls.__module__, other_cls.__name__))
- known_checks[check_name] = cls(execute_module=self._execute_module)
+ known_checks[check_name] = cls(execute_module=self._execute_module, tmp=tmp, task_vars=task_vars)
return known_checks
@@ -134,3 +135,14 @@ def resolve_checks(names, all_checks):
resolved.update(tag_to_checks[tag])
return resolved
+
+
+def normalize(checks):
+ """Return a clean list of check names.
+
+ The input may be a comma-separated string or a sequence. Leading and
+ trailing whitespace characters are removed. Empty items are discarded.
+ """
+ if isinstance(checks, string_types):
+ checks = checks.split(',')
+ return [name.strip() for name in checks if name.strip()]
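The new normalize() helper accepts either form of user input (a comma-separated string or a list). A minimal standalone sketch of its behavior; plain str stands in here for the six.string_types the plugin imports for Python 2/3 compatibility:

    def normalize(checks):
        """Return a clean list of check names from a string or sequence."""
        if isinstance(checks, str):  # the plugin checks six.string_types
            checks = checks.split(',')
        return [name.strip() for name in checks if name.strip()]

    # comma-separated input, stray whitespace, and empty items are all handled
    assert normalize("docker_storage, disk_availability,") == [
        "docker_storage", "disk_availability"]
    assert normalize(["memory_availability ", ""]) == ["memory_availability"]
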
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
index 443b76ea1..d10200719 100644
--- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -1,6 +1,6 @@
-'''
-Ansible callback plugin.
-'''
+"""
+Ansible callback plugin to give a nicely formatted summary of failures.
+"""
# Reason: In several locations below we disable pylint protected-access
# for Ansible objects that do not give us any public way
@@ -16,11 +16,11 @@ from ansible.utils.color import stringc
class CallbackModule(CallbackBase):
- '''
+ """
This callback plugin stores task results and summarizes failures.
The file name is prefixed with `zz_` to make this plugin be loaded last by
Ansible, thus making its output the last thing that users see.
- '''
+ """
CALLBACK_VERSION = 2.0
CALLBACK_TYPE = 'aggregate'
@@ -48,7 +48,7 @@ class CallbackModule(CallbackBase):
self._print_failure_details(self.__failures)
def _print_failure_details(self, failures):
- '''Print a summary of failed tasks or checks.'''
+ """Print a summary of failed tasks or checks."""
self._display.display(u'\nFailure summary:\n')
width = len(str(len(failures)))
@@ -69,7 +69,9 @@ class CallbackModule(CallbackBase):
playbook_context = None
# re: result attrs see top comment # pylint: disable=protected-access
for failure in failures:
- # get context from check task result since callback plugins cannot access task vars
+ # Get context from check task result since callback plugins cannot access task vars.
+ # NOTE: thus context is not known unless checks run. Failures prior to checks running
+ # don't have playbook_context in the results. But we only use it now when checks fail.
playbook_context = playbook_context or failure['result']._result.get('playbook_context')
failed_checks.update(
name
@@ -81,8 +83,11 @@ class CallbackModule(CallbackBase):
def _print_check_failure_summary(self, failed_checks, context):
checks = ','.join(sorted(failed_checks))
- # NOTE: context is not set if all failures occurred prior to checks task
- summary = (
+ # The purpose of specifying context is to vary the output depending on what the user was
+ # expecting to happen (based on which playbook they ran). The only use currently is to
+ # vary the message depending on whether the user was deliberately running checks or was
+ # trying to install/upgrade and checks are just included. Other use cases may arise.
+ summary = ( # default to explaining what checks are in the first place
'\n'
'The execution of "{playbook}"\n'
'includes checks designed to fail early if the requirements\n'
@@ -94,27 +99,26 @@ class CallbackModule(CallbackBase):
'Some checks may be configurable by variables if your requirements\n'
'are different from the defaults; consult check documentation.\n'
'Variables can be set in the inventory or passed on the\n'
- 'command line using the -e flag to ansible-playbook.\n'
+ 'command line using the -e flag to ansible-playbook.\n\n'
).format(playbook=self._playbook_file, checks=checks)
if context in ['pre-install', 'health']:
- summary = (
+ summary = ( # user was expecting to run checks, less explanation needed
'\n'
'You may choose to configure or disable failing checks by\n'
'setting Ansible variables. To disable those above:\n\n'
' openshift_disable_check={checks}\n\n'
'Consult check documentation for configurable variables.\n'
'Variables can be set in the inventory or passed on the\n'
- 'command line using the -e flag to ansible-playbook.\n'
+ 'command line using the -e flag to ansible-playbook.\n\n'
).format(checks=checks)
- # other expected contexts: install, upgrade
self._display.display(summary)
# re: result attrs see top comment # pylint: disable=protected-access
def _format_failure(failure):
- '''Return a list of pretty-formatted text entries describing a failure, including
+ """Return a list of pretty-formatted text entries describing a failure, including
relevant information about it. Expect that the list of text entries will be joined
- by a newline separator when output to the user.'''
+ by a newline separator when output to the user."""
result = failure['result']
host = result._host.get_name()
play = _get_play(result._task)
@@ -135,7 +139,7 @@ def _format_failure(failure):
def _format_failed_checks(checks):
- '''Return pretty-formatted text describing checks that failed.'''
+ """Return pretty-formatted text describing checks that failed."""
failed_check_msgs = []
for check, body in checks.items():
if body.get('failed', False): # only show the failed checks
@@ -150,7 +154,7 @@ def _format_failed_checks(checks):
# This is inspired by ansible.playbook.base.Base.dump_me.
# re: play/task/block attrs see top comment # pylint: disable=protected-access
def _get_play(obj):
- '''Given a task or block, recursively tries to find its parent play.'''
+ """Given a task or block, recursively try to find its parent play."""
if hasattr(obj, '_play'):
return obj._play
if getattr(obj, '_parent'):
diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py
index 4f43ee751..f9babebb9 100755..100644
--- a/roles/openshift_health_checker/library/aos_version.py
+++ b/roles/openshift_health_checker/library/aos_version.py
@@ -1,5 +1,5 @@
#!/usr/bin/python
-'''
+"""
Ansible module for yum-based systems determining if multiple releases
of an OpenShift package are available, and if the release requested
(if any) is available down to the given precision.
@@ -16,7 +16,7 @@ of release availability already. Without duplicating all that, we would
like the user to have a helpful error message if we detect things will
not work out right. Note that if openshift_release is not specified in
the inventory, the version comparison checks just pass.
-'''
+"""
from ansible.module_utils.basic import AnsibleModule
# NOTE: because of the dependency on yum (Python 2-only), this module does not
@@ -32,7 +32,7 @@ except ImportError as err:
class AosVersionException(Exception):
- '''Base exception class for package version problems'''
+ """Base exception class for package version problems"""
def __init__(self, message, problem_pkgs=None):
Exception.__init__(self, message)
self.problem_pkgs = problem_pkgs
diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py
index 433795b67..433795b67 100755..100644
--- a/roles/openshift_health_checker/library/check_yum_update.py
+++ b/roles/openshift_health_checker/library/check_yum_update.py
diff --git a/roles/openshift_health_checker/library/docker_info.py b/roles/openshift_health_checker/library/docker_info.py
index 7f712bcff..0d0ddae8b 100644
--- a/roles/openshift_health_checker/library/docker_info.py
+++ b/roles/openshift_health_checker/library/docker_info.py
@@ -1,4 +1,3 @@
-# pylint: disable=missing-docstring
"""
Ansible module for determining information about the docker host.
@@ -13,6 +12,7 @@ from ansible.module_utils.docker_common import AnsibleDockerClient
def main():
+ """Entrypoint for running an Ansible module."""
client = AnsibleDockerClient()
client.module.exit_json(
diff --git a/roles/openshift_health_checker/library/rpm_version.py b/roles/openshift_health_checker/library/rpm_version.py
index 8ea223055..c24fbba3b 100644
--- a/roles/openshift_health_checker/library/rpm_version.py
+++ b/roles/openshift_health_checker/library/rpm_version.py
@@ -4,6 +4,7 @@ Ansible module for rpm-based systems determining existing package version inform
"""
from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.six import string_types
IMPORT_EXCEPTION = None
try:
@@ -82,11 +83,16 @@ def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict):
continue
found_versions = [_parse_version(version) for version in found_pkgs_dict[pkg_name]]
- expected_version = _parse_version(pkg["version"])
- if expected_version not in found_versions:
+
+ if isinstance(pkg["version"], string_types):
+ expected_versions = [_parse_version(pkg["version"])]
+ else:
+ expected_versions = [_parse_version(version) for version in pkg["version"]]
+
+ if not set(expected_versions) & set(found_versions):
invalid_pkg_versions[pkg_name] = {
"found_versions": found_versions,
- "required_version": expected_version,
+ "required_versions": expected_versions,
}
if not_found_pkgs:
@@ -106,7 +112,7 @@ def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict):
"The following packages were found to be installed with an incorrect version: {}".format('\n'.join([
" \n{}\n Required version: {}\n Found versions: {}".format(
pkg_name,
- pkg["required_version"],
+ ', '.join(pkg["required_versions"]),
', '.join([version for version in pkg["found_versions"]]))
for pkg_name, pkg in invalid_pkg_versions.items()
]))
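With this change, pkg["version"] may be a single string or a list, and a package passes if any expected version intersects the versions found installed. A sketch of the new comparison, using a simplified stand-in for _parse_version (the real implementation is not shown in this diff) and plain str in place of six.string_types:

    def _parse_version(version):
        # hypothetical stand-in: compare on "major.minor" only
        return '.'.join(version.split('.')[:2])

    found_versions = [_parse_version(v) for v in ["3.6.173", "3.6.1"]]

    expected = ["3.6", "3.7"]  # inventory may now supply a list of versions
    if isinstance(expected, str):
        expected_versions = [_parse_version(expected)]
    else:
        expected_versions = [_parse_version(v) for v in expected]

    assert set(expected_versions) & set(found_versions)  # at least one match
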
diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py
new file mode 100644
index 000000000..3631f71c8
--- /dev/null
+++ b/roles/openshift_health_checker/library/search_journalctl.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+"""Interface to journalctl."""
+
+from time import time
+import json
+import re
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+class InvalidMatcherRegexp(Exception):
+ """Exception class for invalid matcher regexp."""
+ pass
+
+
+class InvalidLogEntry(Exception):
+ """Exception class for invalid / non-json log entries."""
+ pass
+
+
+class LogInputSubprocessError(Exception):
+ """Exception class for errors that occur while executing a subprocess."""
+ pass
+
+
+def main():
+ """Scan a given list of "log_matchers" for journalctl messages containing given patterns.
+ "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching:
+ 'start_regexp', 'regexp', and 'unit'.
+
+ Sample "log_matchers" list:
+
+ [
+ {
+ 'start_regexp': r'Beginning of systemd unit',
+ 'regexp': r'the specific log message to find',
+ 'unit': 'etcd',
+ }
+ ]
+ """
+ module = AnsibleModule(
+ argument_spec=dict(
+ log_count_limit=dict(type="int", default=500),
+ log_matchers=dict(type="list", required=True),
+ ),
+ )
+
+ timestamp_limit_seconds = time() - 60 * 60 # 1 hour
+
+ log_count_limit = module.params["log_count_limit"]
+ log_matchers = module.params["log_matchers"]
+
+ matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds)
+
+ module.exit_json(
+ changed=False,
+ failed=bool(errors),
+ errors=errors,
+ matched=matched_regexp,
+ )
+
+
+def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds):
+ """Return a list of up to log_count_limit matches for each matcher.
+
+ Log entries are only considered if newer than timestamp_limit_seconds.
+ """
+ matched_regexp = []
+ errors = []
+
+ for matcher in matchers:
+ try:
+ log_output = get_log_output(matcher)
+ except LogInputSubprocessError as err:
+ errors.append(str(err))
+ continue
+
+ try:
+ matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds)
+ if matched:
+ matched_regexp.append(matcher.get("regexp", ""))
+ except InvalidMatcherRegexp as err:
+ errors.append(str(err))
+ except InvalidLogEntry as err:
+ errors.append(str(err))
+
+ return matched_regexp, errors
+
+
+def get_log_output(matcher):
+ """Return an iterator on the logs of a given matcher."""
+ try:
+ cmd_output = subprocess.Popen(list([
+ '/bin/journalctl',
+ '-ru', matcher.get("unit", ""),
+ '--output', 'json',
+ ]), stdout=subprocess.PIPE)
+
+ return iter(cmd_output.stdout.readline, '')
+
+ except subprocess.CalledProcessError as exc:
+ msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}"
+ raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc)))
+ except OSError as exc:
+ raise LogInputSubprocessError(str(exc))
+
+
+def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds):
+ """Return log messages matched in iterable log_output by a given matcher.
+
+ Ignore any log_output items older than timestamp_limit_seconds.
+ """
+ try:
+ regexp = re.compile(matcher.get("regexp", ""))
+ start_regexp = re.compile(matcher.get("start_regexp", ""))
+ except re.error as err:
+ msg = "A log matcher object was provided with an invalid regular expression: {}"
+ raise InvalidMatcherRegexp(msg.format(str(err)))
+
+ matched = None
+
+ for log_count, line in enumerate(log_output):
+ if log_count >= log_count_limit:
+ break
+
+ try:
+ obj = json.loads(line)
+
+ # don't need to look past the most recent service restart
+ if start_regexp.match(obj["MESSAGE"]):
+ break
+
+ log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000
+ if log_timestamp_seconds < timestamp_limit_seconds:
+ break
+
+ if regexp.match(obj["MESSAGE"]):
+ matched = line
+ break
+
+ except ValueError:
+ msg = "Log entry for systemd unit {} contained invalid json syntax: {}"
+ raise InvalidLogEntry(msg.format(matcher.get("unit"), line))
+
+ return matched
+
+
+if __name__ == '__main__':
+ main()
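The matching logic in find_matches() can be exercised without journalctl by feeding it fabricated JSON entries; note that __REALTIME_TIMESTAMP is in microseconds, which is why the module divides by 1,000,000. A self-contained sketch of the core loop:

    import json
    import re
    from time import time

    # Fabricated entries; real ones come from `journalctl -ru <unit> --output json`.
    now_usec = int(time() * 1000000)
    log_output = [
        json.dumps({"MESSAGE": "etcd: sync duration of 2.3s, expected less than 1s",
                    "__REALTIME_TIMESTAMP": str(now_usec)}),
        json.dumps({"MESSAGE": "Starting Etcd Server",
                    "__REALTIME_TIMESTAMP": str(now_usec - 1000000)}),
    ]
    matcher = {
        "start_regexp": r"Starting Etcd Server",
        "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
        "unit": "etcd",
    }
    regexp = re.compile(matcher["regexp"])
    start_regexp = re.compile(matcher["start_regexp"])

    matched = None
    for line in log_output:  # newest first, as journalctl -r emits
        obj = json.loads(line)
        if start_regexp.match(obj["MESSAGE"]):  # stop at the last service start
            break
        if regexp.match(obj["MESSAGE"]):
            matched = line
            break
    assert matched is not None
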
diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml
index 4d141974c..bc8e7bdcf 100644
--- a/roles/openshift_health_checker/meta/main.yml
+++ b/roles/openshift_health_checker/meta/main.yml
@@ -1,5 +1,3 @@
---
dependencies:
- - role: openshift_facts
- - role: openshift_repos
- - role: openshift_version
+- role: openshift_facts
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 5c9949ced..07ec6f7ef 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -10,24 +10,56 @@ from importlib import import_module
from ansible.module_utils import six
from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin
+from ansible.plugins.filter.core import to_bool as ansible_to_bool
class OpenShiftCheckException(Exception):
- """Raised when a check cannot proceed."""
- pass
+ """Raised when a check encounters a failure condition."""
+
+ def __init__(self, name, msg=None):
+ # msg is for the message the user will see when this is raised.
+ # name is for test code to identify the error without looking at msg text.
+ if msg is None: # for parameter backward compatibility
+ msg = name
+ name = self.__class__.__name__
+ self.name = name
+ super(OpenShiftCheckException, self).__init__(msg)
+
+
+class OpenShiftCheckExceptionList(OpenShiftCheckException):
+ """A container for multiple logging errors that may be detected in one check."""
+ def __init__(self, errors):
+ self.errors = errors
+ super(OpenShiftCheckExceptionList, self).__init__(
+ 'OpenShiftCheckExceptionList',
+ '\n'.join(str(msg) for msg in errors)
+ )
+
+ # make iterable
+ def __getitem__(self, index):
+ return self.errors[index]
@six.add_metaclass(ABCMeta)
class OpenShiftCheck(object):
- """A base class for defining checks for an OpenShift cluster environment."""
+ """
+ A base class for defining checks for an OpenShift cluster environment.
+
+ Expect optional params: method execute_module, dict task_vars, and string tmp.
+ execute_module is expected to have a signature compatible with _execute_module
+ from ansible plugins/action/__init__.py, e.g.:
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
+ This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
+ which provides the check's stored task_vars and tmp.
+ """
+
+ def __init__(self, execute_module=None, task_vars=None, tmp=None):
+ self._execute_module = execute_module
+ self.task_vars = task_vars or {}
+ self.tmp = tmp
- def __init__(self, execute_module=None, module_executor=None):
- if execute_module is module_executor is None:
- raise TypeError(
- "__init__() takes either execute_module (recommended) "
- "or module_executor (deprecated), none given")
- self.execute_module = execute_module or module_executor
- self.module_executor = self.execute_module
+ # set to True when the check changes the host, for accurate total "changed" count
+ self.changed = False
@abstractproperty
def name(self):
@@ -43,13 +75,13 @@ class OpenShiftCheck(object):
"""
return []
- @classmethod
- def is_active(cls, task_vars): # pylint: disable=unused-argument
+ @staticmethod
+ def is_active():
"""Returns true if this check applies to the ansible-playbook run."""
return True
@abstractmethod
- def run(self, tmp, task_vars):
+ def run(self):
"""Executes a check, normally implemented as a module."""
return {}
@@ -62,6 +94,134 @@ class OpenShiftCheck(object):
for subclass in subclass.subclasses():
yield subclass
+ def execute_module(self, module_name=None, module_args=None):
+ """Invoke an Ansible module from a check.
+
+ Invoke stored _execute_module, normally copied from the action
+ plugin, with its params and the task_vars and tmp given at
+ check initialization. No positional parameters beyond these
+ are specified. If it's necessary to specify any of the other
+ parameters to _execute_module then that should just be invoked
+ directly (with awareness of changes in method signature per
+ Ansible version).
+
+ So e.g. check.execute_module("foo", dict(arg1=...))
+ Return: result hash from module execution.
+ """
+ if self._execute_module is None:
+ raise NotImplementedError(
+ self.__class__.__name__ +
+ " invoked execute_module without providing the method at initialization."
+ )
+ return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+
+ def get_var(self, *keys, **kwargs):
+ """Get deeply nested values from task_vars.
+
+ Ansible task_vars structures are Python dicts, often mapping strings to
+ other dicts. This helper makes it easier to get a nested value, raising
+ OpenShiftCheckException when a key is not found.
+
+ Keyword args:
+ default:
+ On missing key, return this as default value instead of raising exception.
+ convert:
+ Supply a function to apply to normalize the value before returning it.
+ None is the default (return as-is).
+ This function should raise ValueError if the user has provided a value
+ that cannot be converted, or OpenShiftCheckException if some other
+ problem needs to be described to the user.
+ """
+ if len(keys) == 1:
+ keys = keys[0].split(".")
+
+ try:
+ value = reduce(operator.getitem, keys, self.task_vars)
+ except (KeyError, TypeError):
+ if "default" not in kwargs:
+ raise OpenShiftCheckException(
+ "This check expects the '{}' inventory variable to be defined\n"
+ "in order to proceed, but it is undefined. There may be a bug\n"
+ "in Ansible, the checks, or their dependencies."
+ "".format(".".join(map(str, keys)))
+ )
+ value = kwargs["default"]
+
+ convert = kwargs.get("convert", None)
+ try:
+ if convert is None:
+ return value
+ elif convert is bool: # interpret bool as Ansible does, instead of python truthiness
+ return ansible_to_bool(value)
+ else:
+ return convert(value)
+
+ except ValueError as error: # user error in specifying value
+ raise OpenShiftCheckException(
+ 'Cannot convert inventory variable to expected type:\n'
+ ' "{var}={value}"\n'
+ '{error}'.format(var=".".join(keys), value=value, error=error)
+ )
+
+ except OpenShiftCheckException: # some other check-specific problem
+ raise
+
+ except Exception as error: # probably a bug in the function
+ raise OpenShiftCheckException(
+ 'There is a bug in this check. While trying to convert variable \n'
+ ' "{var}={value}"\n'
+ 'the given converter cannot be used or failed unexpectedly:\n'
+ '{error}'.format(var=".".join(keys), value=value, error=error)
+ )
+
+ @staticmethod
+ def get_major_minor_version(openshift_image_tag):
+ """Parse and return the deployed version of OpenShift as a tuple."""
+ if openshift_image_tag and openshift_image_tag[0] == 'v':
+ openshift_image_tag = openshift_image_tag[1:]
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ components = openshift_image_tag.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+ if components[0] in openshift_major_release_version:
+ components[0] = openshift_major_release_version[components[0]]
+
+ components = tuple(int(x) for x in components[:2])
+ return components
+
+ def find_ansible_mount(self, path):
+ """Return the mount point for path from ansible_mounts."""
+
+ # reorganize list of mounts into dict by path
+ mount_for_path = {
+ mount['mount']: mount
+ for mount
+ in self.get_var('ansible_mounts')
+ }
+
+ # NOTE: including base cases '/' and '' to ensure the loop ends
+ mount_targets = set(mount_for_path.keys()) | {'/', ''}
+ mount_point = path
+ while mount_point not in mount_targets:
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ return mount_for_path[mount_point]
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
+ raise OpenShiftCheckException(
+ 'Unable to determine mount point for path "{}".\n'
+ 'Known mount points: {}.'.format(path, known_mounts or 'none')
+ )
+
LOADER_EXCLUDES = (
"__init__.py",
@@ -86,20 +246,3 @@ def load_checks(path=None, subpkg=""):
modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
return modules
-
-
-def get_var(task_vars, *keys, **kwargs):
- """Helper function to get deeply nested values from task_vars.
-
- Ansible task_vars structures are Python dicts, often mapping strings to
- other dicts. This helper makes it easier to get a nested value, raising
- OpenShiftCheckException when a key is not found or returning a default value
- provided as a keyword argument.
- """
- try:
- value = reduce(operator.getitem, keys, task_vars)
- except (KeyError, TypeError):
- if "default" in kwargs:
- return kwargs["default"]
- raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
- return value
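The instance method get_var() replaces the old module-level get_var(); it still reduces operator.getitem over the nested task_vars dicts, but now also accepts a single dotted string and an optional convert callable. A sketch of the lookup semantics, with hypothetical task_vars:

    import operator
    from functools import reduce

    task_vars = {"openshift": {"master": {"etcd_port": "2379"}}}

    def lookup(*keys):
        if len(keys) == 1:  # "a.b.c" and ("a", "b", "c") are equivalent
            keys = keys[0].split(".")
        return reduce(operator.getitem, keys, task_vars)

    assert lookup("openshift", "master", "etcd_port") == "2379"
    assert lookup("openshift.master.etcd_port") == "2379"
    # in a check: self.get_var("openshift.master.etcd_port", convert=int) -> 2379
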
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index e93e81efa..6d1dea9ce 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -1,9 +1,8 @@
"""Check that there is enough disk space in predefined paths."""
-import os.path
import tempfile
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
class DiskAvailability(OpenShiftCheck):
@@ -35,22 +34,27 @@ class DiskAvailability(OpenShiftCheck):
},
}
- @classmethod
- def is_active(cls, task_vars):
+ # recommended disk space for each location under an upgrade context
+ recommended_disk_upgrade_bytes = {
+ '/var': {
+ 'masters': 10 * 10**9,
+ 'nodes': 5 * 10 ** 9,
+ 'etcd': 5 * 10 ** 9,
+ },
+ }
+
+ def is_active(self):
"""Skip hosts that do not have recommended disk space requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
+ group_names = self.get_var("group_names", default=[])
active_groups = set()
- for recommendation in cls.recommended_disk_space_bytes.values():
+ for recommendation in self.recommended_disk_space_bytes.values():
active_groups.update(recommendation.keys())
has_disk_space_recommendation = bool(active_groups.intersection(group_names))
- return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
-
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- ansible_mounts = get_var(task_vars, "ansible_mounts")
- ansible_mounts = {mount['mount']: mount for mount in ansible_mounts}
+ return super(DiskAvailability, self).is_active() and has_disk_space_recommendation
- user_config = get_var(task_vars, "openshift_check_min_host_disk_gb", default={})
+ def run(self):
+ group_names = self.get_var("group_names")
+ user_config = self.get_var("openshift_check_min_host_disk_gb", default={})
try:
# For backwards-compatibility, if openshift_check_min_host_disk_gb
# is a number, then it overrides the required config for '/var'.
@@ -72,7 +76,7 @@ class DiskAvailability(OpenShiftCheck):
# not part of the official recommendation but present in the user
# configuration.
for path, recommendation in self.recommended_disk_space_bytes.items():
- free_bytes = self.free_bytes(path, ansible_mounts)
+ free_bytes = self.free_bytes(path)
recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
config = user_config.get(path, {})
@@ -81,9 +85,34 @@ class DiskAvailability(OpenShiftCheck):
config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
recommended_bytes = config_bytes or recommended_bytes
+ # if an "upgrade" context is set, update the minimum disk requirement
+ # as this signifies an in-place upgrade - the node might have the
+ # required total disk space, but some of that space may already be
+ # in use by the existing OpenShift deployment.
+ context = self.get_var("r_openshift_health_checker_playbook_context", default="")
+ if context == "upgrade":
+ recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {})
+ if recommended_upgrade_paths:
+ recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0)
+ for name in group_names)
+
if free_bytes < recommended_bytes:
free_gb = float(free_bytes) / 10**9
recommended_gb = float(recommended_bytes) / 10**9
+ msg = (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+
+ # warn if check failed under an "upgrade" context
+ # due to limits imposed by the user config
+ if config_bytes and context == "upgrade":
+ msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n'
+ 'due to an existing OpenShift deployment. Please check the value of\n'
+ ' openshift_check_min_host_disk_gb={}\n'
+ 'in your Ansible inventory, and lower the recommended disk space availability\n'
+ 'if necessary for this upgrade.').format(config_bytes)
+
return {
'failed': True,
'msg': (
@@ -94,22 +123,17 @@ class DiskAvailability(OpenShiftCheck):
return {}
- @staticmethod
- def free_bytes(path, ansible_mounts):
+ def free_bytes(self, path):
"""Return the size available in path based on ansible_mounts."""
- mount_point = path
- # arbitry value to prevent an infinite loop, in the unlike case that '/'
- # is not in ansible_mounts.
- max_depth = 32
- while mount_point not in ansible_mounts and max_depth > 0:
- mount_point = os.path.dirname(mount_point)
- max_depth -= 1
-
+ mount = self.find_ansible_mount(path)
try:
- free_bytes = ansible_mounts[mount_point]['size_available']
+ return mount['size_available']
except KeyError:
- known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(ansible_mounts)) or 'none'
- msg = 'Unable to determine disk availability for "{}". Known mount points: {}.'
- raise OpenShiftCheckException(msg.format(path, known_mounts))
-
- return free_bytes
+ raise OpenShiftCheckException(
+ 'Unable to retrieve disk availability for "{path}".\n'
+ 'Ansible facts included a matching mount point for this path:\n'
+ ' {mount}\n'
+ 'however it is missing the size_available field.\n'
+ 'To investigate, you can inspect the output of `ansible -m setup <host>`'
+ ''.format(path=path, mount=mount)
+ )
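free_bytes() now delegates to the shared find_ansible_mount() helper, which walks a path upward to the deepest matching mount point. A sketch of that walk with fabricated ansible_mounts facts:

    import os.path

    ansible_mounts = [
        {"mount": "/", "size_available": 40 * 10**9},
        {"mount": "/var", "size_available": 8 * 10**9},
    ]
    mount_for_path = {m["mount"]: m for m in ansible_mounts}
    mount_targets = set(mount_for_path) | {"/", ""}  # base cases end the loop

    path = "/var/lib/origin"
    while path not in mount_targets:
        path = os.path.dirname(path)

    assert path == "/var"  # the deepest mount containing the path wins
    assert mount_for_path[path]["size_available"] / 10.0**9 == 8.0  # decimal GB
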
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index bde81ad2c..85a922f86 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,6 +1,6 @@
"""Check that required Docker images are available."""
-from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import DockerHostMixin
@@ -22,44 +22,44 @@ DEPLOYMENT_IMAGE_INFO = {
class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
- This check attempts to ensure that required docker images are
- either present locally, or able to be pulled down from available
- registries defined in a host machine.
+ Determine docker images that an install would require and check that they
+ are either present in the host's docker index, or available for the host to pull
+ with known registries as defined in our inventory file (or defaults).
"""
name = "docker_image_availability"
tags = ["preflight"]
- dependencies = ["skopeo", "python-docker-py"]
+ # we use python-docker-py to check local docker for images, and skopeo
+ # to look for images available remotely without waiting to pull them.
+ dependencies = ["python-docker-py", "skopeo"]
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts with unsupported deployment types."""
- deployment_type = get_var(task_vars, "openshift_deployment_type")
+ deployment_type = self.get_var("openshift_deployment_type")
has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO
- return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type
+ return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
- def run(self, tmp, task_vars):
- msg, failed, changed = self.ensure_dependencies(task_vars)
+ def run(self):
+ msg, failed = self.ensure_dependencies()
if failed:
return {
"failed": True,
- "changed": changed,
"msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+ required_images = self.required_images()
+ missing_images = set(required_images) - set(self.local_images(required_images))
# exit early if all images were found locally
if not missing_images:
- return {"changed": changed}
+ return {}
- registries = self.known_docker_registries(task_vars)
+ registries = self.known_docker_registries()
if not registries:
- return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+ return {"failed": True, "msg": "Unable to retrieve any docker registries."}
- available_images = self.available_images(missing_images, registries, task_vars)
+ available_images = self.available_images(missing_images, registries)
unavailable_images = set(missing_images) - set(available_images)
if unavailable_images:
@@ -69,13 +69,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"One or more required Docker images are not available:\n {}\n"
"Configured registries: {}"
).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)),
- "changed": changed,
}
- return {"changed": changed}
+ return {}
- @staticmethod
- def required_images(task_vars):
+ def required_images(self):
"""
Determine which images we expect to need for this host.
Returns: a set of required images like 'openshift/origin:v3.6'
@@ -92,17 +90,17 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
Registry is not included in constructed images. It may be in oreg_url or etcd image.
"""
required = set()
- deployment_type = get_var(task_vars, "openshift_deployment_type")
- host_groups = get_var(task_vars, "group_names")
+ deployment_type = self.get_var("openshift_deployment_type")
+ host_groups = self.get_var("group_names")
# containerized etcd may not have openshift_image_tag, see bz 1466622
- image_tag = get_var(task_vars, "openshift_image_tag", default="latest")
+ image_tag = self.get_var("openshift_image_tag", default="latest")
image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
if not image_info:
return required
# template for images that run on top of OpenShift
image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
- image_url = get_var(task_vars, "oreg_url", default="") or image_url
+ image_url = self.get_var("oreg_url", default="") or image_url
if 'nodes' in host_groups:
for suffix in NODE_IMAGE_SUFFIXES:
required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
@@ -112,7 +110,7 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
required.add(image_info["registry_console_image"])
# images for containerized components
- if get_var(task_vars, "openshift", "common", "is_containerized"):
+ if self.get_var("openshift", "common", "is_containerized"):
components = set()
if 'nodes' in host_groups:
components.update(["node", "openvswitch"])
@@ -125,28 +123,27 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
return required
- def local_images(self, images, task_vars):
+ def local_images(self, images):
"""Filter a list of images and return those available locally."""
return [
image for image in images
- if self.is_image_local(image, task_vars)
+ if self.is_image_local(image)
]
- def is_image_local(self, image, task_vars):
+ def is_image_local(self, image):
"""Check if image is already in local docker index."""
- result = self.execute_module("docker_image_facts", {"name": image}, task_vars=task_vars)
+ result = self.execute_module("docker_image_facts", {"name": image})
if result.get("failed", False):
return False
return bool(result.get("images", []))
- @staticmethod
- def known_docker_registries(task_vars):
+ def known_docker_registries(self):
"""Build a list of docker registries available according to inventory vars."""
- docker_facts = get_var(task_vars, "openshift", "docker")
+ docker_facts = self.get_var("openshift", "docker")
regs = set(docker_facts["additional_registries"])
- deployment_type = get_var(task_vars, "openshift_deployment_type")
+ deployment_type = self.get_var("openshift_deployment_type")
if deployment_type == "origin":
regs.update(["docker.io"])
elif "enterprise" in deployment_type:
@@ -154,24 +151,25 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
return list(regs)
- def available_images(self, images, registries, task_vars):
- """Inspect existing images using Skopeo and return all images successfully inspected."""
+ def available_images(self, images, default_registries):
+ """Search remotely for images. Returns: list of images found."""
return [
image for image in images
- if self.is_available_skopeo_image(image, registries, task_vars)
+ if self.is_available_skopeo_image(image, default_registries)
]
- def is_available_skopeo_image(self, image, registries, task_vars):
+ def is_available_skopeo_image(self, image, default_registries):
"""Use Skopeo to determine if required image exists in known registry(s)."""
+ registries = default_registries
- # if image does already includes a registry, just use that
+ # if image already includes a registry, only use that
if image.count("/") > 1:
registry, image = image.split("/", 1)
registries = [registry]
for registry in registries:
args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)}
- result = self.execute_module("command", args, task_vars=task_vars)
+ result = self.execute_module("command", args)
if result.get("rc", 0) == 0 and not result.get("failed"):
return True
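is_available_skopeo_image() narrows the registry list whenever the image name itself carries a registry, i.e. contains more than one "/". A small sketch of that branch:

    image = "registry.access.redhat.com/openshift3/ose-pod:v3.6.0"  # hypothetical
    registries = ["docker.io"]  # default registries from inventory facts

    if image.count("/") > 1:  # image already names a registry; query only that one
        registry, image = image.split("/", 1)
        registries = [registry]

    assert registries == ["registry.access.redhat.com"]
    assert image == "openshift3/ose-pod:v3.6.0"
    # each candidate is then probed with:
    #   skopeo inspect --tls-verify=false docker://<registry>/<image>
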
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index e80691ef3..0558ddf14 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,7 +1,7 @@
"""Check Docker storage driver and usage."""
import json
import re
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import DockerHostMixin
@@ -20,59 +20,82 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
storage_drivers = ["devicemapper", "overlay", "overlay2"]
max_thinpool_data_usage_percent = 90.0
max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
- # pylint: disable=too-many-return-statements
- # Reason: permanent stylistic exception;
- # it is clearer to return on failures and there are just many ways to fail here.
- def run(self, tmp, task_vars):
- msg, failed, changed = self.ensure_dependencies(task_vars)
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
+
+ def run(self):
+ msg, failed = self.ensure_dependencies()
if failed:
return {
"failed": True,
- "changed": changed,
"msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
}
# attempt to get the docker info hash from the API
- info = self.execute_module("docker_info", {}, task_vars=task_vars)
- if info.get("failed"):
- return {"failed": True, "changed": changed,
+ docker_info = self.execute_module("docker_info", {})
+ if docker_info.get("failed"):
+ return {"failed": True,
"msg": "Failed to query Docker API. Is docker running on this host?"}
- if not info.get("info"): # this would be very strange
- return {"failed": True, "changed": changed,
- "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
- info = info["info"]
+ if not docker_info.get("info"): # this would be very strange
+ return {"failed": True,
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
# check if the storage driver we saw is valid
- driver = info.get("Driver", "[NONE]")
+ driver = docker_info.get("Driver", "[NONE]")
if driver not in self.storage_drivers:
msg = (
"Detected unsupported Docker storage driver '{driver}'.\n"
"Supported storage drivers are: {drivers}"
).format(driver=driver, drivers=', '.join(self.storage_drivers))
- return {"failed": True, "changed": changed, "msg": msg}
+ return {"failed": True, "msg": msg}
# driver status info is a list of tuples; convert to dict and validate based on driver
- driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
if driver == "devicemapper":
- if driver_status.get("Data loop file"):
- msg = (
- "Use of loopback devices with the Docker devicemapper storage driver\n"
- "(the default storage configuration) is unsupported in production.\n"
- "Please use docker-storage-setup to configure a backing storage volume.\n"
- "See http://red.ht/2rNperO for further information."
- )
- return {"failed": True, "changed": changed, "msg": msg}
- result = self._check_dm_usage(driver_status, task_vars)
- result['changed'] = result.get('changed', False) or changed
- return result
+ result = self.check_devicemapper_support(driver_status)
- # TODO(lmeyer): determine how to check usage for overlay2
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status)
- return {"changed": changed}
+ return result
- def _check_dm_usage(self, driver_status, task_vars):
- """
+ def check_devicemapper_support(self, driver_status):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status)
+ return result
+
+ def check_dm_usage(self, driver_status):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
devicemapper storage. The LV is "thin" because it does not use all available storage
@@ -83,7 +106,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
could run out of space first; so we check both.
"""
vals = dict(
- vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+ vg_free=self.get_vg_free(driver_status.get("Pool Name")),
data_used=driver_status.get("Data Space Used"),
data_total=driver_status.get("Data Space Total"),
metadata_used=driver_status.get("Metadata Space Used"),
@@ -93,7 +116,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
# convert all human-readable strings to bytes
for key, value in vals.copy().items():
try:
- vals[key + "_bytes"] = self._convert_to_bytes(value)
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
except ValueError as err: # unlikely to hit this from API info, but just to be safe
return {
"failed": True,
@@ -104,7 +127,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
# determine the threshold percentages which usage should not exceed
for name, default in [("data", self.max_thinpool_data_usage_percent),
("metadata", self.max_thinpool_meta_usage_percent)]:
- percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
+ percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default)
try:
vals[name + "_threshold"] = float(percent)
except ValueError:
@@ -131,10 +154,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
return vals
- def _get_vg_free(self, pool, task_vars):
- # Determine which VG to examine according to the pool name, the only indicator currently
- # available from the Docker API driver info. We assume a name that looks like
- # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+ def get_vg_free(self, pool):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
if not match: # unlikely, but... be clear if we assumed wrong
raise OpenShiftCheckException(
@@ -146,7 +171,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
# should return free space like " 12.00g" if the VG exists; empty if it does not
- ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars=task_vars)
+ ret = self.execute_module("command", {"_raw_params": vgs_cmd})
if ret.get("failed") or ret.get("rc", 0) != 0:
raise OpenShiftCheckException(
"Is LVM installed? Failed to run /sbin/vgs "
@@ -163,7 +188,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return size
@staticmethod
- def _convert_to_bytes(string):
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
units = dict(
b=1,
k=1024,
@@ -183,3 +209,68 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
raise ValueError("Cannot convert to a byte size: " + string)
return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info)
+
+ def check_overlay_usage(self, docker_info):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path)
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
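The new overlay support check compares kernel versions as lists of integers, which Python orders lexicographically, so the RHEL build number decides the outcome:

    import re

    kernel = "3.10.0-327.el7.x86_64"  # a hypothetical pre-7.3 RHEL kernel
    kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
    assert kernel_arr == [3, 10, 0, 327, 7, 86, 64]

    # lists compare element by element, so 327 < 514 fails the check
    assert kernel_arr < [3, 10, 0, 514]           # overlay unsupported (rhel < 7.3)
    assert [3, 10, 0, 514, 2] >= [3, 10, 0, 514]  # 7.3+ kernels pass
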
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
index c04a69765..f4296753a 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -2,7 +2,7 @@
Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
"""
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck
class EtcdImageDataSize(OpenShiftCheck):
@@ -11,24 +11,25 @@ class EtcdImageDataSize(OpenShiftCheck):
name = "etcd_imagedata_size"
tags = ["etcd"]
- def run(self, tmp, task_vars):
- etcd_mountpath = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts"))
+ def run(self):
+ etcd_mountpath = self.find_ansible_mount("/var/lib/etcd")
etcd_avail_diskspace = etcd_mountpath["size_available"]
etcd_total_diskspace = etcd_mountpath["size_total"]
- etcd_imagedata_size_limit = get_var(task_vars,
- "etcd_max_image_data_size_bytes",
- default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)))
+ etcd_imagedata_size_limit = self.get_var(
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace))
+ )
- etcd_is_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False)
- etcd_port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379)
- etcd_hosts = get_var(task_vars, "openshift", "master", "etcd_hosts")
+ etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = self.get_var("openshift", "master", "etcd_hosts")
- config_base = get_var(task_vars, "openshift", "common", "config_base")
+ config_base = self.get_var("openshift", "common", "config_base")
- cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt")
- key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key")
- ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt")
+ cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt")
+ key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key")
+ ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt")
for etcd_host in list(etcd_hosts):
args = {
@@ -46,7 +47,7 @@ class EtcdImageDataSize(OpenShiftCheck):
},
}
- etcdkeysize = self.module_executor("etcdkeysize", args, task_vars)
+ etcdkeysize = self.execute_module("etcdkeysize", args)
if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
@@ -55,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck):
reason = etcdkeysize["module_stderr"]
msg = msg.format(host=etcd_host, reason=reason)
- return {"failed": True, "changed": False, "msg": msg}
+ return {"failed": True, "msg": msg}
if etcdkeysize["size_limit_exceeded"]:
limit = self._to_gigabytes(etcd_imagedata_size_limit)
@@ -64,20 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck):
"Use the `oadm prune images` command to cleanup unused Docker images.")
return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
- return {"changed": False}
-
- @staticmethod
- def _get_etcd_mountpath(ansible_mounts):
- valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
-
- mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts}
- for path in valid_etcd_mount_paths:
- if path in mount_for_path:
- return mount_for_path[path]
-
- paths = ', '.join(sorted(mount_for_path)) or 'none'
- msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths)
- raise OpenShiftCheckException(msg)
+ return {}
@staticmethod
def _to_gigabytes(byte_size):
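
Note the default limit above: half of the space currently consumed on the etcd mount. A small sketch of that arithmetic; _to_gigabytes is not shown in this hunk, so the conversion below is an assumption:

```python
# Default etcd image-data size limit, as computed in run() above:
# half of the bytes already used on the etcd mount.
def default_imagedata_limit(size_total, size_available):
    return int(0.5 * float(size_total - size_available))

def to_gigabytes(byte_size):  # assumed equivalent of _to_gigabytes
    return byte_size / float(2**30)

limit = default_imagedata_limit(40 * 2**30, 30 * 2**30)
print(to_gigabytes(limit))  # 5.0 -- half of the 10 GiB in use
```
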
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..b4c8957e9
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,44 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ def is_active(self):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = self.get_var("group_names", default=[])
+ valid_group_names = "etcd" in group_names
+
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ valid_version = version in ((3, 4), (3, 5))
+
+ return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version
+
+ def run(self):
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {"log_matchers": log_matchers})
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
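
The search_journalctl module invoked here is not shown in this hunk; the sketch below is only one plausible reading of how a log_matchers entry is applied, namely scanning unit logs that follow the most recent "start" line for the symptom pattern:

```python
import re

# Hypothetical application of a log_matchers entry to journal lines.
matcher = {
    "start_regexp": r"Starting Etcd Server",
    "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
    "unit": "etcd",
}
lines = [
    "Starting Etcd Server",
    "etcd: sync duration of 2.3s, expected less than 1s",
]

start = re.compile(matcher["start_regexp"])
symptom = re.compile(matcher["regexp"])
started = matched = False
for line in lines:
    if start.search(line):
        started, matched = True, False  # only consider the latest start
    elif started and symptom.search(line):
        matched = True
print(matched)  # True
```
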
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
index 7452c9cc1..e5d93ff3f 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -1,6 +1,6 @@
"""A health check for OpenShift clusters."""
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck
class EtcdVolume(OpenShiftCheck):
@@ -11,24 +11,21 @@ class EtcdVolume(OpenShiftCheck):
# Default device usage threshold. Value should be in the range [0, 100].
default_threshold_percent = 90
- # Where to find ectd data, higher priority first.
- supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
-
- @classmethod
- def is_active(cls, task_vars):
- etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters",
- default=[]) or []
- is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts
- return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host
-
- def run(self, tmp, task_vars):
- mount_info = self._etcd_mount_info(task_vars)
+ # Where to find etcd data
+ etcd_mount_path = "/var/lib/etcd"
+
+ def is_active(self):
+ etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or []
+ is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts
+ return super(EtcdVolume, self).is_active() and is_etcd_host
+
+ def run(self):
+ mount_info = self.find_ansible_mount(self.etcd_mount_path)
available = mount_info["size_available"]
total = mount_info["size_total"]
used = total - available
- threshold = get_var(
- task_vars,
+ threshold = self.get_var(
"etcd_device_usage_threshold_percent",
default=self.default_threshold_percent
)
@@ -43,16 +40,4 @@ class EtcdVolume(OpenShiftCheck):
)
return {"failed": True, "msg": msg}
- return {"changed": False}
-
- def _etcd_mount_info(self, task_vars):
- ansible_mounts = get_var(task_vars, "ansible_mounts")
- mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
-
- for path in self.supported_mount_paths:
- if path in mounts:
- return mounts[path]
-
- paths = ', '.join(sorted(mounts)) or 'none'
- msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
- raise OpenShiftCheckException(msg)
+ return {}
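
find_ansible_mount() replaces the per-check lookup deleted here; its implementation is not part of this hunk. A plausible sketch is to walk up from the requested path to the deepest ansible_mounts entry:

```python
import os

# Hypothetical resolution of the mount that holds a given path.
def find_mount(path, ansible_mounts):
    mounts = {m["mount"]: m for m in ansible_mounts}
    candidate = path
    while candidate not in mounts and candidate != "/":
        candidate = os.path.dirname(candidate)
    return mounts.get(candidate)

mounts = [{"mount": "/", "size_total": 100}, {"mount": "/var", "size_total": 50}]
print(find_mount("/var/lib/etcd", mounts)["mount"])  # /var
```
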
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
index c9fc59896..b27f97172 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/curator.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -1,61 +1,43 @@
-"""
-Module for performing checks on an Curator logging deployment
-"""
+"""Check for an aggregated logging Curator deployment"""
-from openshift_checks import get_var
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
class Curator(LoggingCheck):
- """Module that checks an integrated logging Curator deployment"""
+ """Check for an aggregated logging Curator deployment"""
name = "curator"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- curator_pods, error = super(Curator, self).get_pods_for_component(
- self.module_executor,
- self.logging_namespace,
- "curator",
- task_vars
- )
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_curator(curator_pods)
-
- if check_error:
- msg = ("The following Curator deployment issue was found:"
- "\n-------\n"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
-
+ curator_pods = self.get_pods_for_component("curator")
+ self.check_curator(curator_pods)
# TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+ return {}
def check_curator(self, pods):
"""Check to see if curator is up and working. Returns: error string"""
if not pods:
- return (
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
"There are no Curator pods for the logging stack,\n"
"so nothing will prune Elasticsearch indexes.\n"
"Is Curator correctly deployed?"
)
- not_running = super(Curator, self).not_running_pods(pods)
+ not_running = self.not_running_pods(pods)
if len(not_running) == len(pods):
- return (
+ raise OpenShiftCheckException(
+ "CuratorNotRunning",
"The Curator pod is not currently in a running state,\n"
"so Elasticsearch indexes may increase without bound."
)
if len(pods) - len(not_running) > 1:
- return (
+ raise OpenShiftCheckException(
+ "TooManyCurators",
"There is more than one Curator pod running. This should not normally happen.\n"
"Although this doesn't cause any problems, you may want to investigate."
)
-
- return None
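
The checks now raise named exceptions instead of returning error strings. A minimal sketch of the (name, message) pattern assumed by this diff; the real OpenShiftCheckException lives in openshift_checks/__init__.py and may differ in detail:

```python
class OpenShiftCheckException(Exception):
    """Sketch: carries a short symbolic name alongside the message."""
    def __init__(self, name, msg=None):
        if msg is None:  # single-argument form: message only
            name, msg = self.__class__.__name__, name
        self.name = name
        super(OpenShiftCheckException, self).__init__(msg)

try:
    raise OpenShiftCheckException("TooManyCurators",
                                  "More than one Curator pod is running.")
except OpenShiftCheckException as exc:
    print(exc.name, "-", exc)  # TooManyCurators - More than one Curator pod ...
```
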
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 01cb35b81..7fc843fd7 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -1,190 +1,193 @@
-"""
-Module for performing checks on an Elasticsearch logging deployment
-"""
+"""Check for an aggregated logging Elasticsearch deployment"""
import json
import re
-from openshift_checks import get_var
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
from openshift_checks.logging.logging import LoggingCheck
class Elasticsearch(LoggingCheck):
- """Module that checks an integrated logging Elasticsearch deployment"""
+ """Check for an aggregated logging Elasticsearch deployment"""
name = "elasticsearch"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- es_pods, error = super(Elasticsearch, self).get_pods_for_component(
- self.execute_module,
- self.logging_namespace,
- "es",
- task_vars,
- )
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_elasticsearch(es_pods, task_vars)
-
- if check_error:
- msg = ("The following Elasticsearch deployment issue was found:"
- "\n-------\n"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
-
+ es_pods = self.get_pods_for_component("es")
+ self.check_elasticsearch(es_pods)
# TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
- def _not_running_elasticsearch_pods(self, es_pods):
- """Returns: list of running pods, list of errors about non-running pods"""
- not_running = super(Elasticsearch, self).not_running_pods(es_pods)
- if not_running:
- return not_running, [(
- 'The following Elasticsearch pods are not running:\n'
- '{pods}'
- 'These pods will not aggregate logs from their nodes.'
- ).format(pods=''.join(
- " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
- for pod in not_running
- ))]
- return not_running, []
-
- def check_elasticsearch(self, es_pods, task_vars):
- """Various checks for elasticsearch. Returns: error string"""
- not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
- running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+ return {}
+
+ def check_elasticsearch(self, es_pods):
+ """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+ running_pods, errors = self.running_elasticsearch_pods(es_pods)
pods_by_name = {
pod['metadata']['name']: pod for pod in running_pods
# Filter out pods that are not members of a DC
if pod['metadata'].get('labels', {}).get('deploymentconfig')
}
if not pods_by_name:
- return 'No logging Elasticsearch pods were found. Is logging deployed?'
- error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
- error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
- error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
- error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
- return '\n'.join(error_msgs)
+ # nothing running, cannot run the rest of the check
+ errors.append(OpenShiftCheckException(
+ 'NoRunningPods',
+ 'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+ ))
+ raise OpenShiftCheckExceptionList(errors)
+
+ errors += self.check_elasticsearch_masters(pods_by_name)
+ errors += self.check_elasticsearch_node_list(pods_by_name)
+ errors += self.check_es_cluster_health(pods_by_name)
+ errors += self.check_elasticsearch_diskspace(pods_by_name)
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = self.not_running_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running]
+ if not_running:
+ return running_pods, [OpenShiftCheckException(
+ 'PodNotRunning',
+ 'The following Elasticsearch pods are defined but not running:\n'
+ '{pods}'.format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ )]
+ return running_pods, []
@staticmethod
def _build_es_curl_cmd(pod_name, url):
base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
- def _check_elasticsearch_masters(self, pods_by_name, task_vars):
- """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+ def check_elasticsearch_masters(self, pods_by_name):
+ """Check that Elasticsearch masters are sane. Returns: list of errors"""
es_master_names = set()
- error_msgs = []
+ errors = []
for pod_name in pods_by_name.keys():
# Compare what each ES node reports as master and compare for split brain
get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
- master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+ master_name_str = self.exec_oc(get_master_cmd, [])
master_names = (master_name_str or '').split(' ')
if len(master_names) > 1:
es_master_names.add(master_names[1])
else:
- error_msgs.append(
- 'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+ errors.append(OpenShiftCheckException(
+ 'NoMasterName',
+ 'Elasticsearch {pod} gave unexpected response when asked master name:\n'
' {response}'.format(pod=pod_name, response=master_name_str)
- )
+ ))
if not es_master_names:
- error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
- return '\n'.join(error_msgs)
+ errors.append(OpenShiftCheckException(
+ 'NoMasterFound',
+ 'No logging Elasticsearch masters were found.'
+ ))
+ return errors
if len(es_master_names) > 1:
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'SplitBrainMasters',
'Found multiple Elasticsearch masters according to the pods:\n'
'{master_list}\n'
'This implies that the masters have "split brain" and are not correctly\n'
'replicating data for the logging cluster. Log loss is likely to occur.'
.format(master_list='\n'.join(' ' + master for master in es_master_names))
- )
+ ))
- return error_msgs
+ return errors
- def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
- """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+ def check_elasticsearch_node_list(self, pods_by_name):
+ """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
if not pods_by_name:
- return ['No logging Elasticsearch masters were found. Is logging deployed?']
+ return [OpenShiftCheckException(
+ 'MissingComponentPods',
+ 'No logging Elasticsearch pods were found.'
+ )]
# get ES cluster nodes
node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
- cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+ cluster_node_data = self.exec_oc(node_cmd, [])
try:
cluster_nodes = json.loads(cluster_node_data)['nodes']
except (ValueError, KeyError):
- return [
+ return [OpenShiftCheckException(
+ 'MissingNodeList',
'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
cluster_node_data
- ]
+ )]
# Try to match all ES-reported node hosts to known pods.
- error_msgs = []
+ errors = []
for node in cluster_nodes.values():
# Note that with 1.4/3.4 the pod IP may be used as the master name
if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
for pod_name, pod in pods_by_name.items()):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'EsPodNodeMismatch',
'The Elasticsearch cluster reports a member node "{node}"\n'
'that does not correspond to any known ES pod.'.format(node=node['host'])
- )
+ ))
- return error_msgs
+ return errors
- def _check_es_cluster_health(self, pods_by_name, task_vars):
+ def check_es_cluster_health(self, pods_by_name):
"""Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
- error_msgs = []
+ errors = []
for pod_name in pods_by_name.keys():
cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
- cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [])
try:
health_res = json.loads(cluster_health_data)
if not health_res or not health_res.get('status'):
raise ValueError()
except ValueError:
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'BadEsResponse',
'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
- )
+ ))
continue
if health_res['status'] not in ['green', 'yellow']:
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'EsClusterHealthRed',
'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
- )
+ ))
- return error_msgs
+ return errors
- def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+ def check_elasticsearch_diskspace(self, pods_by_name):
"""
Exec into an ES pod and query the diskspace on the persistent volume.
Returns: list of errors
"""
- error_msgs = []
+ errors = []
for pod_name in pods_by_name.keys():
df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
- disk_output = self._exec_oc(df_cmd, [], task_vars)
+ disk_output = self.exec_oc(df_cmd, [])
lines = disk_output.splitlines()
# expecting one header looking like 'IUse% Use%' and one body line
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'BadDfResponse',
'Could not retrieve storage usage from logging ES pod "{pod}".\n'
'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
- )
+ ))
continue
inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
- inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+ inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
if int(inode_pct) >= int(inode_pct_thresh):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'InodeUsageTooHigh',
'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
' is {pct}, greater than threshold {limit}.\n'
' Note: threshold can be specified in inventory with {param}'.format(
@@ -192,10 +195,11 @@ class Elasticsearch(LoggingCheck):
pct=str(inode_pct),
limit=str(inode_pct_thresh),
param='openshift_check_efk_es_inode_pct',
- ))
- disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+ )))
+ disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
if int(disk_pct) >= int(disk_pct_thresh):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'DiskUsageTooHigh',
'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
' is {pct}, greater than threshold {limit}.\n'
' Note: threshold can be specified in inventory with {param}'.format(
@@ -203,15 +207,6 @@ class Elasticsearch(LoggingCheck):
pct=str(disk_pct),
limit=str(disk_pct_thresh),
param='openshift_check_efk_es_storage_pct',
- ))
-
- return error_msgs
-
- def _exec_oc(self, cmd_str, extra_args, task_vars):
- return super(Elasticsearch, self).exec_oc(
- self.execute_module,
- self.logging_namespace,
- cmd_str,
- extra_args,
- task_vars,
- )
+ )))
+
+ return errors
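
The df parsing in check_elasticsearch_diskspace() is self-contained enough to try directly against sample output:

```python
import re

# `df --output=ipcent,pcent` yields one header ("IUse% Use%") and one body line.
disk_output = "IUse% Use%\n    4%   71%\n"
lines = disk_output.splitlines()
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
assert len(lines) == 2 and len(lines[0].split()) == 2
inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
print(inode_pct, disk_pct)  # 4 71
```
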
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 627567293..3b192a281 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -1,76 +1,109 @@
-"""
-Module for performing checks on an Fluentd logging deployment
-"""
+"""Check for an aggregated logging Fluentd deployment"""
import json
-from openshift_checks import get_var
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
from openshift_checks.logging.logging import LoggingCheck
class Fluentd(LoggingCheck):
- """Module that checks an integrated logging Fluentd deployment"""
+ """Check for an aggregated logging Fluentd deployment"""
+
name = "fluentd"
tags = ["health", "logging"]
- logging_namespace = None
+ def run(self):
+ """Check the Fluentd deployment and raise an error if any problems are found."""
+
+ fluentd_pods = self.get_pods_for_component("fluentd")
+ self.check_fluentd(fluentd_pods)
+ return {}
- def run(self, tmp, task_vars):
- """Check various things and gather errors. Returns: result as hash"""
+ def check_fluentd(self, pods):
+ """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
- self.execute_module,
- self.logging_namespace,
- "fluentd",
- task_vars,
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
)
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_fluentd(fluentd_pods, task_vars)
- if check_error:
- msg = ("The following Fluentd deployment issue was found:"
- "\n-------\n"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
+ nodes_by_name = self.get_nodes_by_name()
+ fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+
+ errors = []
+ errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+ errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+ errors += self.check_fluentd_pods_running(pods)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ errors.append(OpenShiftCheckException(
+ 'TooManyFluentdPods',
+ 'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but likely indicates that something is wrong.'
+ ))
+
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
- # TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+ def get_nodes_by_name(self):
+ """Retrieve all the node definitions. Returns: dict(name: node)"""
+ nodes_json = self.exec_oc("get nodes -o json", [])
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ raise OpenShiftCheckException(
+ "BadOcNodeList",
+ "Could not obtain a list of nodes to validate fluentd.\n"
+ "Output from oc get:\n" + nodes_json
+ )
+ if not nodes or not nodes.get('items'): # also should not happen
+ raise OpenShiftCheckException(
+ "NoNodesDefined",
+ "No nodes appear to be defined according to the API."
+ )
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }
@staticmethod
- def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
- """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+ def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node)"""
label, value = node_selector.split('=', 1)
fluentd_nodes = {
name: node for name, node in nodes_by_name.items()
if node['metadata']['labels'].get(label) == value
}
if not fluentd_nodes:
- return None, (
+ raise OpenShiftCheckException(
+ 'NoNodesLabeled',
'There are no nodes with the fluentd label {label}.\n'
- 'This means no logs will be aggregated from the nodes.'
- ).format(label=node_selector)
- return fluentd_nodes, None
+ 'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+ )
+ return fluentd_nodes
- @staticmethod
- def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
- """Note if nodes are not labeled as expected. Returns: error string"""
- intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+ def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+ """Note if nodes are not labeled as expected. Returns: error list"""
+ intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
if not intended_nodes or '--all' in intended_nodes:
intended_nodes = nodes_by_name.keys()
nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
if nodes_missing_labels:
- return (
+ return [OpenShiftCheckException(
+ 'NodesUnlabeled',
'The following nodes are supposed to be labeled with {label} but are not:\n'
' {nodes}\n'
- 'Fluentd will not aggregate logs from these nodes.'
- ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
- return None
+ 'Fluentd will not aggregate logs from these nodes.'.format(
+ label=node_selector, nodes=', '.join(nodes_missing_labels)
+ ))]
+
+ return []
@staticmethod
- def _check_nodes_have_fluentd(pods, fluentd_nodes):
- """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+ def check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error list"""
unmatched_nodes = fluentd_nodes.copy()
node_names_by_label = {
node['metadata']['labels']['kubernetes.io/hostname']: name
@@ -90,81 +123,32 @@ class Fluentd(LoggingCheck):
]:
unmatched_nodes.pop(name, None)
if unmatched_nodes:
- return (
+ return [OpenShiftCheckException(
+ 'MissingFluentdPod',
'The following nodes are supposed to have a Fluentd pod but do not:\n'
- '{nodes}'
- 'These nodes will not have their logs aggregated.'
- ).format(nodes=''.join(
- " {}\n".format(name)
- for name in unmatched_nodes.keys()
- ))
- return None
+ ' {nodes}\n'
+ 'These nodes will not have their logs aggregated.'.format(
+ nodes='\n '.join(unmatched_nodes.keys())
+ ))]
+
+ return []
- def _check_fluentd_pods_running(self, pods):
+ def check_fluentd_pods_running(self, pods):
"""Make sure all fluentd pods are running. Returns: error string"""
not_running = super(Fluentd, self).not_running_pods(pods)
if not_running:
- return (
+ return [OpenShiftCheckException(
+ 'FluentdNotRunning',
'The following Fluentd pods are supposed to be running but are not:\n'
- '{pods}'
- 'These pods will not aggregate logs from their nodes.'
- ).format(pods=''.join(
- " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
- for pod in not_running
- ))
- return None
-
- def check_fluentd(self, pods, task_vars):
- """Verify fluentd is running everywhere. Returns: error string"""
-
- node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
- default='logging-infra-fluentd=true')
-
- nodes_by_name, error = self.get_nodes_by_name(task_vars)
-
- if error:
- return error
- fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
- if error:
- return error
-
- error_msgs = []
- error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
- if error:
- error_msgs.append(error)
- error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
- if error:
- error_msgs.append(error)
- error = self._check_fluentd_pods_running(pods)
- if error:
- error_msgs.append(error)
-
- # Make sure there are no extra fluentd pods
- if len(pods) > len(fluentd_nodes):
- error_msgs.append(
- 'There are more Fluentd pods running than nodes labeled.\n'
- 'This may not cause problems with logging but it likely indicates something wrong.'
- )
-
- return '\n'.join(error_msgs)
-
- def get_nodes_by_name(self, task_vars):
- """Retrieve all the node definitions. Returns: dict(name: node), error string"""
- nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
- try:
- nodes = json.loads(nodes_json)
- except ValueError: # no valid json - should not happen
- return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
- if not nodes or not nodes.get('items'): # also should not happen
- return None, "No nodes appear to be defined according to the API."
- return {
- node['metadata']['name']: node
- for node in nodes['items']
- }, None
-
- def _exec_oc(self, cmd_str, extra_args, task_vars):
- return super(Fluentd, self).exec_oc(self.execute_module,
- self.logging_namespace,
- cmd_str,
- extra_args,
- task_vars)
+ ' {pods}\n'
+ 'These pods will not aggregate logs from their nodes.'.format(
+ pods='\n'.join(
+ " {name} ({host})".format(
+ name=pod['metadata']['name'],
+ host=pod['spec'].get('host', 'None')
+ )
+ for pod in not_running
+ )
+ ))]
+
+ return []
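
The label filtering in filter_fluentd_labeled_nodes() reduces to a single dict comprehension over the selector. A toy run with made-up node records:

```python
# A selector such as "logging-infra-fluentd=true" is split once on "="
# and compared against each node's labels.
node_selector = "logging-infra-fluentd=true"
label, value = node_selector.split("=", 1)

nodes_by_name = {
    "node1": {"metadata": {"labels": {"logging-infra-fluentd": "true"}}},
    "node2": {"metadata": {"labels": {}}},
}
fluentd_nodes = {
    name: node for name, node in nodes_by_name.items()
    if node["metadata"]["labels"].get(label) == value
}
print(sorted(fluentd_nodes))  # ['node1']
```
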
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..d783e6760
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,131 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+ """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+ name = "fluentd_config"
+ tags = ["health"]
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+ try:
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ except ValueError:
+ # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+ return logging_deployed
+
+ return logging_deployed and version < (3, 6)
+
+ def run(self):
+ """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+ config_error = self.check_logging_config()
+ if config_error:
+ msg = ("The following Fluentd logging configuration problem was found:"
+ "\n{}".format(config_error))
+ return {"failed": True, "msg": msg}
+
+ return {}
+
+ def check_logging_config(self):
+ """Ensure that the configured Docker logging driver matches fluentd settings.
+ This means that, at least for now, if the following condition is met:
+
+ openshift_logging_fluentd_use_journal == True
+
+ then the value of the configured Docker logging driver should be "journald".
+ Otherwise, the value of the Docker logging driver should be "json-file".
+ Returns an error string if the above condition is not met, or None otherwise."""
+ use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+ # if check is running on a master, retrieve all running pods
+ # and check any pod's container for the env var "USE_JOURNAL"
+ group_names = self.get_var("group_names")
+ if "masters" in group_names:
+ use_journald = self.check_fluentd_env_var()
+
+ docker_info = self.execute_module("docker_info", {})
+ try:
+ logging_driver = docker_info["info"]["LoggingDriver"]
+ except KeyError:
+ return "Unable to determine Docker logging driver."
+
+ recommended_logging_driver = "journald"
+ error = None
+
+ # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+ # option as an inventory file variable, or adding the log driver value as part of the
+ # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+ # can be passed to the Docker binary; the only other recommendation that can be made, would be
+ # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+ # individual containers.
+ if use_journald and logging_driver != "journald":
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+ 'This differs from your Docker configuration, which has been set to use "{driver}" '
+ 'as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+ elif not use_journald and logging_driver != "json-file":
+ recommended_logging_driver = "json-file"
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+                     'individual json log files per container.\n'
+ 'This differs from your Docker configuration, which has been set to use '
+ '"{driver}" as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+
+ if error:
+ error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+ ' openshift_docker_options="--log-driver={driver}"\n\n'
+                      'Alternatively, you can add the following option to your Docker configuration, located in '
+ '"/etc/docker/daemon.json":\n\n'
+ '{{ "log-driver": "{driver}" }}\n\n'
+ 'See https://docs.docker.com/engine/admin/logging/json-file '
+ 'for more information.').format(driver=recommended_logging_driver)
+
+ return error
+
+ def check_fluentd_env_var(self):
+ """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+ running_pods = self.running_fluentd_pods()
+
+ try:
+ pod_containers = running_pods[0]["spec"]["containers"]
+ except KeyError:
+ return "Unable to detect running containers on selected Fluentd pod."
+
+ if not pod_containers:
+ msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+ 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+ raise OpenShiftCheckException(msg)
+
+ pod_env = pod_containers[0].get("env")
+ if not pod_env:
+ msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+ 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+ raise OpenShiftCheckException(msg)
+
+ for env in pod_env:
+ if env["name"] == "USE_JOURNAL":
+ return env.get("value", "false") != "false"
+
+ return False
+
+ def running_fluentd_pods(self):
+ """Return a list of running fluentd pods."""
+ fluentd_pods = self.get_pods_for_component("fluentd")
+
+ running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+ if not running_fluentd_pods:
+ raise OpenShiftCheckException(
+ 'No Fluentd pods were found to be in the "Running" state. '
+ 'At least one Fluentd pod is required in order to perform this check.'
+ )
+
+ return running_fluentd_pods
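
One subtlety in the USE_JOURNAL handling above: only the literal string "false" (or an unset value, via the default) maps to json-file mode; any other value, even an empty string, is treated as journald. A small sketch:

```python
# Mirrors the env-var scan in check_fluentd_env_var() above.
def use_journal_from_env(pod_env):
    for env in pod_env:
        if env["name"] == "USE_JOURNAL":
            return env.get("value", "false") != "false"
    return False

print(use_journal_from_env([{"name": "USE_JOURNAL", "value": "true"}]))   # True
print(use_journal_from_env([{"name": "USE_JOURNAL", "value": "false"}]))  # False
print(use_journal_from_env([{"name": "OTHER_VAR", "value": "x"}]))        # False
```
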
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 551e8dfa0..3b1cf8baa 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -12,8 +12,7 @@ except ImportError:
from urllib.error import HTTPError, URLError
import urllib.request as urllib2
-from openshift_checks import get_var
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
class Kibana(LoggingCheck):
@@ -22,35 +21,17 @@ class Kibana(LoggingCheck):
name = "kibana"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- kibana_pods, error = super(Kibana, self).get_pods_for_component(
- self.execute_module,
- self.logging_namespace,
- "kibana",
- task_vars,
- )
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_kibana(kibana_pods)
-
- if not check_error:
- check_error = self._check_kibana_route(task_vars)
-
- if check_error:
- msg = ("The following Kibana deployment issue was found:"
- "\n-------\n"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
-
+ kibana_pods = self.get_pods_for_component("kibana")
+ self.check_kibana(kibana_pods)
+ self.check_kibana_route()
# TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
- def _verify_url_internal(self, url, task_vars):
+ return {}
+
+ def _verify_url_internal(self, url):
"""
Try to reach a URL from the host.
Returns: success (bool), reason (for failure)
@@ -62,7 +43,7 @@ class Kibana(LoggingCheck):
# TODO(lmeyer): give users option to validate certs
status_code=302,
)
- result = self.execute_module('uri', args, None, task_vars)
+ result = self.execute_module('uri', args)
if result.get('failed'):
return result['msg']
return None
@@ -71,7 +52,7 @@ class Kibana(LoggingCheck):
def _verify_url_external(url):
"""
Try to reach a URL from ansible control host.
- Returns: success (bool), reason (for failure)
+        Returns: None on success, or an error string on failure.
"""
# This actually checks from the ansible control host, which may or may not
# really be "external" to the cluster.
@@ -97,133 +78,149 @@ class Kibana(LoggingCheck):
return None
def check_kibana(self, pods):
- """Check to see if Kibana is up and working. Returns: error string."""
+ """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
if not pods:
- return "There are no Kibana pods deployed, so no access to the logging UI."
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Kibana pods deployed, so no access to the logging UI."
+ )
not_running = self.not_running_pods(pods)
if len(not_running) == len(pods):
- return "No Kibana pod is in a running state, so there is no access to the logging UI."
+ raise OpenShiftCheckException(
+ "NoRunningPods",
+ "No Kibana pod is in a running state, so there is no access to the logging UI."
+ )
elif not_running:
- return (
+ raise OpenShiftCheckException(
+ "PodNotRunning",
"The following Kibana pods are not currently in a running state:\n"
- "{pods}"
- "However at least one is, so service may not be impacted."
- ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running))
+ " {pods}\n"
+ "However at least one is, so service may not be impacted.".format(
+ pods="\n ".join(pod['metadata']['name'] for pod in not_running)
+ )
+ )
- return None
-
- def _get_kibana_url(self, task_vars):
+ def _get_kibana_url(self):
"""
Get kibana route or report error.
- Returns: url (or empty), reason for failure
+ Returns: url
"""
# Get logging url
- get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+ get_route = self.exec_oc("get route logging-kibana -o json", [])
if not get_route:
- return None, 'no_route_exists'
+ raise OpenShiftCheckException(
+ 'no_route_exists',
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ )
- route = json.loads(get_route)
+ try:
+ route = json.loads(get_route)
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ except (ValueError, KeyError):
+ raise OpenShiftCheckException(
+ 'get_route_failed',
+ '"oc get route" returned an unexpected response:\n' + get_route
+ )
- # check that the route has been accepted by a router
- ingress = route["status"]["ingress"]
# ingress can be null if there is no router, or empty if not routed
if not ingress or not ingress[0]:
- return None, 'route_not_accepted'
+ raise OpenShiftCheckException(
+ 'route_not_accepted',
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ )
host = route.get("spec", {}).get("host")
if not host:
- return None, 'route_missing_host'
+ raise OpenShiftCheckException(
+ 'route_missing_host',
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ )
- return 'https://{}/'.format(host), None
+ return 'https://{}/'.format(host)
- def _check_kibana_route(self, task_vars):
+ def check_kibana_route(self):
"""
Check to see if kibana route is up and working.
- Returns: error string
+ Raises exception if not.
"""
- known_errors = dict(
- no_route_exists=(
- 'No route is defined for Kibana in the logging namespace,\n'
- 'so the logging stack is not accessible. Is logging deployed?\n'
- 'Did something remove the logging-kibana route?'
- ),
- route_not_accepted=(
- 'The logging-kibana route is not being routed by any router.\n'
- 'Is the router deployed and working?'
- ),
- route_missing_host=(
- 'The logging-kibana route has no hostname defined,\n'
- 'which should never happen. Did something alter its definition?'
- ),
- )
- kibana_url, error = self._get_kibana_url(task_vars)
- if not kibana_url:
- return known_errors.get(error, error)
+ kibana_url = self._get_kibana_url()
# first, check that kibana is reachable from the master.
- error = self._verify_url_internal(kibana_url, task_vars)
+ error = self._verify_url_internal(kibana_url)
if error:
if 'urlopen error [Errno 111] Connection refused' in error:
- error = (
+ raise OpenShiftCheckException(
+ 'FailedToConnectInternal',
'Failed to connect from this master to Kibana URL {url}\n'
- 'Is kibana running, and is at least one router routing to it?'
- ).format(url=kibana_url)
+ 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+ )
elif 'urlopen error [Errno -2] Name or service not known' in error:
- error = (
+ raise OpenShiftCheckException(
+ 'FailedToResolveInternal',
'Failed to connect from this master to Kibana URL {url}\n'
'because the hostname does not resolve.\n'
- 'Is DNS configured for the Kibana hostname?'
- ).format(url=kibana_url)
+ 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+ )
elif 'Status code was not' in error:
- error = (
+ raise OpenShiftCheckException(
+ 'WrongReturnCodeInternal',
'A request from this master to the Kibana URL {url}\n'
'did not return the correct status code (302).\n'
'This could mean that Kibana is malfunctioning, the hostname is\n'
'resolving incorrectly, or other network issues. The output was:\n'
- ' {error}'
- ).format(url=kibana_url, error=error)
- return 'Error validating the logging Kibana route:\n' + error
+ ' {error}'.format(url=kibana_url, error=error)
+ )
+ raise OpenShiftCheckException(
+ 'MiscRouteErrorInternal',
+ 'Error validating the logging Kibana route internally:\n' + error
+ )
# in production we would like the kibana route to work from outside the
# cluster too; but that may not be the case, so allow disabling just this part.
- if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
- return None
+ if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+ return
error = self._verify_url_external(kibana_url)
- if error:
- if 'urlopen error [Errno 111] Connection refused' in error:
- error = (
- 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
- 'Is the router for the Kibana hostname exposed externally?'
- ).format(url=kibana_url)
- elif 'urlopen error [Errno -2] Name or service not known' in error:
- error = (
- 'Failed to resolve the Kibana hostname in {url}\n'
- 'from the Ansible control host.\n'
- 'Is DNS configured to resolve this Kibana hostname externally?'
- ).format(url=kibana_url)
- elif 'Expected success (200)' in error:
- error = (
- 'A request to Kibana at {url}\n'
- 'returned the wrong error code:\n'
- ' {error}\n'
- 'This could mean that Kibana is malfunctioning, the hostname is\n'
- 'resolving incorrectly, or other network issues.'
- ).format(url=kibana_url, error=error)
- error = (
- 'Error validating the logging Kibana route:\n{error}\n'
- 'To disable external Kibana route validation, set in your inventory:\n'
- ' openshift_check_efk_kibana_external=False'
- ).format(error=error)
- return error
- return None
- def _exec_oc(self, cmd_str, extra_args, task_vars):
- return super(Kibana, self).exec_oc(self.execute_module,
- self.logging_namespace,
- cmd_str,
- extra_args,
- task_vars)
+ if not error:
+ return
+
+ error_fmt = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set the variable:\n'
+ ' openshift_check_efk_kibana_external=False'
+ )
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ msg = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ msg = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+ elif 'Expected success (200)' in error:
+ msg = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+ raise OpenShiftCheckException(
+ 'MiscRouteError',
+ 'Error validating the logging Kibana route externally:\n' + error
+ )
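
_verify_url_external itself is unchanged by this hunk, so only fragments of its error strings are visible above. A hedged sketch of a probe that produces strings in that style; this is not the file's actual implementation:

```python
try:                                                   # Python 2
    from urllib2 import urlopen, HTTPError, URLError
except ImportError:                                    # Python 3
    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError

def verify_url_external(url):
    """Return None on success (HTTP 200), or an error string."""
    try:
        code = urlopen(url, timeout=10).getcode()
        if code != 200:
            return 'Expected success (200) but got return code {}'.format(code)
    except HTTPError as httperr:
        return 'Expected success (200) but got return code {}'.format(httperr.code)
    except URLError as urlerr:
        return str(urlerr)  # e.g. '<urlopen error [Errno 111] Connection refused>'
    return None
```
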
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 6e951e82c..ecd8adb64 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -5,49 +5,60 @@ Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana st
import json
import os
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class MissingComponentPods(OpenShiftCheckException):
+ """Raised when a component has no pods in the namespace."""
+ pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+ """Raised when ocutil has a failure running oc."""
+ pass
class LoggingCheck(OpenShiftCheck):
- """Base class for logging component checks"""
+ """Base class for OpenShift aggregated logging component checks"""
+
+ # FIXME: this should not be listed as a check, since it is not meant to be
+ # run by itself.
name = "logging"
- @classmethod
- def is_active(cls, task_vars):
- return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
+ return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
- @staticmethod
- def is_first_master(task_vars):
- """Run only on first master and only when logging is configured. Returns: bool"""
- logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+ def is_first_master(self):
+ """Determine if running on first master. Returns: bool"""
# Note: It would be nice to use membership in oo_first_master group, however for now it
# seems best to avoid requiring that setup and just check this is the first master.
- hostname = get_var(task_vars, "ansible_ssh_host") or [None]
- masters = get_var(task_vars, "groups", "masters", default=None) or [None]
- return logging_deployed and masters[0] == hostname
+ hostname = self.get_var("ansible_ssh_host") or [None]
+ masters = self.get_var("groups", "masters", default=None) or [None]
+ return masters[0] == hostname
- def run(self, tmp, task_vars):
- pass
+ def run(self):
+ return {}
- def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
- """Get all pods for a given component. Returns: list of pods for component, error string"""
+ def get_pods_for_component(self, logging_component):
+ """Get all pods for a given component. Returns: list of pods."""
pod_output = self.exec_oc(
- execute_module,
- namespace,
"get pods -l component={} -o json".format(logging_component),
[],
- task_vars
)
try:
- pods = json.loads(pod_output)
- if not pods or not pods.get('items'):
+ pods = json.loads(pod_output) # raises ValueError if deserialize fails
+ if not pods or not pods.get('items'): # also a broken response, treat the same
raise ValueError()
except ValueError:
- # successful run but non-parsing data generally means there were no pods in the namespace
- return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+ # successful run but non-parsing data generally means there were no pods to be found
+ raise MissingComponentPods(
+ 'There are no "{}" component pods in the "{}" namespace.\n'
+ 'Is logging deployed?'.format(logging_component, self.logging_namespace())
+ )
- return pods['items'], None
+ return pods['items']
@staticmethod
def not_running_pods(pods):
@@ -63,34 +74,36 @@ class LoggingCheck(OpenShiftCheck):
)
]
- @staticmethod
- def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+ def logging_namespace(self):
+ """Returns the namespace in which logging is configured to deploy."""
+ return self.get_var("openshift_logging_namespace", default="logging")
+
+ def exec_oc(self, cmd_str="", extra_args=None):
"""
Execute an 'oc' command in the remote host.
Returns: output of command and namespace,
- or raises OpenShiftCheckException on error
+ or raises CouldNotUseOc on error
"""
- config_base = get_var(task_vars, "openshift", "common", "config_base")
+ config_base = self.get_var("openshift", "common", "config_base")
args = {
- "namespace": namespace,
+ "namespace": self.logging_namespace(),
"config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
"cmd": cmd_str,
"extra_args": list(extra_args) if extra_args else [],
}
- result = execute_module("ocutil", args, None, task_vars)
+ result = self.execute_module("ocutil", args)
if result.get("failed"):
- msg = (
- 'Unexpected error using `oc` to validate the logging stack components.\n'
- 'Error executing `oc {cmd}`:\n'
- '{error}'
- ).format(cmd=args['cmd'], error=result['result'])
-
if result['result'] == '[Errno 2] No such file or directory':
- msg = (
+ raise CouldNotUseOc(
"This host is supposed to be a master but does not have the `oc` command where expected.\n"
"Has an installation been run on this host yet?"
)
- raise OpenShiftCheckException(msg)
+
+ raise CouldNotUseOc(
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'.format(cmd=args['cmd'], error=result['result'])
+ )
return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..d781db649
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,129 @@
+"""
+Check that logs from pods can be queried within a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+ """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
+ try:
+ log_index_timeout = int(
+ self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ raise OpenShiftCheckException(
+ 'InvalidTimeout',
+ 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'
+ )
+
+ running_component_pods = dict()
+
+ # get all component pods
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods = self.get_pods_for_component(component)
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+ raise OpenShiftCheckException(
+ component + 'NoRunningPods',
+                    'No {} pods in the "Running" state were found. '
+ 'At least one pod is required in order to perform this check.'.format(name)
+ )
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+ """Retry an Elasticsearch query every second until query success, or a defined
+ length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid):
+ if time.time() + interval > deadline:
+ raise OpenShiftCheckException(
+ "NoMatchFound",
+ "expecting match in Elasticsearch for message with uuid {}, "
+ "but no matches were found after {}s.".format(uuid, timeout_secs)
+ )
+ time.sleep(interval)
+
+ def curl_kibana_with_uuid(self, kibana_pod):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(exec_cmd, [])
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'kibanaInvalidResponse',
+ 'invalid response returned from Kibana request:\n'
+ 'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+ )
+
+ if error_code != 404:
+ raise OpenShiftCheckException(
+ 'kibanaInvalidReturnCode',
+ 'invalid error code returned from Kibana request.\n'
+ 'Expecting error code "404", but got "{}" instead.'.format(error_code)
+ )
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+ result = self.exec_oc(exec_cmd, [])
+
+ try:
+ count = json.loads(result)["count"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'esInvalidResponse',
+ 'Invalid response from Elasticsearch query:\n'
+ ' {}\n'
+ 'Response was:\n{}'.format(exec_cmd, result)
+ )
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
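
The polling loop in wait_until_cmd_or_err() gives up as soon as one more interval would overshoot the deadline, rather than after the last sleep. The same pattern in isolation, with a stand-in predicate:

```python
import time

def wait_until(predicate, timeout_secs, interval=1):
    """Poll once per interval; raise if the deadline would be passed."""
    deadline = time.time() + timeout_secs
    while not predicate():
        if time.time() + interval > deadline:
            raise RuntimeError("no match found after {}s".format(timeout_secs))
        time.sleep(interval)

hits = iter([0, 0, 1])  # stand-in for query_es_from_es: succeeds on poll 3
wait_until(lambda: next(hits), timeout_secs=30)
print("matched")
```
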
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index f4e31065f..765ba072d 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,5 +1,5 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that recommended memory is available."""
+from openshift_checks import OpenShiftCheck
MIB = 2**20
GIB = 2**30
@@ -21,19 +21,18 @@ class MemoryAvailability(OpenShiftCheck):
# https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
memtotal_adjustment = 1 * GIB
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have recommended memory requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- has_memory_recommendation = bool(set(group_names).intersection(cls.recommended_memory_bytes))
- return super(MemoryAvailability, cls).is_active(task_vars) and has_memory_recommendation
+ group_names = self.get_var("group_names", default=[])
+ has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes))
+ return super(MemoryAvailability, self).is_active() and has_memory_recommendation
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB
+ def run(self):
+ group_names = self.get_var("group_names")
+ total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB
recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
- configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB
+ configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB
min_memory_bytes = configured_min or recommended_min
if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 2cb2e21aa..e9bae60a3 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -2,19 +2,16 @@
Mixin classes meant to be used with subclasses of OpenShiftCheck.
"""
-from openshift_checks import get_var
-
class NotContainerizedMixin(object):
"""Mixin for checks that are only active when not in containerized mode."""
# permanent # pylint: disable=too-few-public-methods
# Reason: The mixin is not intended to stand on its own as a class.
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Only run on non-containerized hosts."""
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ return super(NotContainerizedMixin, self).is_active() and not is_containerized
class DockerHostMixin(object):
@@ -22,28 +19,26 @@ class DockerHostMixin(object):
dependencies = []
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Only run on hosts that depend on Docker."""
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- is_node = "nodes" in get_var(task_vars, "group_names", default=[])
- return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ is_node = "nodes" in self.get_var("group_names", default=[])
+ return super(DockerHostMixin, self).is_active() and (is_containerized or is_node)
- def ensure_dependencies(self, task_vars):
+ def ensure_dependencies(self):
"""
Ensure that docker-related packages exist, but not on atomic hosts
(which cannot install packages but should already have them).
- Returns: msg, failed, changed
+ Returns: msg, failed
"""
- if get_var(task_vars, "openshift", "common", "is_atomic"):
- return "", False, False
+ if self.get_var("openshift", "common", "is_atomic"):
+ return "", False
# NOTE: we would use the "package" module but it's actually an action plugin
# and it's not clear how to invoke one of those. This is about the same anyway:
result = self.execute_module(
- get_var(task_vars, "ansible_pkg_mgr", default="yum"),
+ self.get_var("ansible_pkg_mgr", default="yum"),
{"name": self.dependencies, "state": "present"},
- task_vars=task_vars,
)
msg = result.get("msg", "")
if result.get("failed"):
@@ -54,5 +49,5 @@ class DockerHostMixin(object):
" {deps}\n{msg}"
).format(deps=',\n '.join(self.dependencies), msg=msg)
failed = result.get("failed", False) or result.get("rc", 0) != 0
- changed = result.get("changed", False)
- return msg, failed, changed
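+ # record "changed" on the check instance; the action plugin reports it,
+ # so it no longer travels in the return value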
+ self.changed = result.get("changed", False)
+ return msg, failed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
index 2dd045f1f..363c12def 100644
--- a/roles/openshift_health_checker/openshift_checks/ovs_version.py
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -3,7 +3,7 @@ Ansible module for determining if an installed version of Open vSwitch is incomp
currently installed version of OpenShift.
"""
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -16,63 +16,39 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
tags = ["health"]
openshift_to_ovs_version = {
- "3.6": "2.6",
- "3.5": "2.6",
+ "3.6": ["2.6", "2.7"],
+ "3.5": ["2.6", "2.7"],
"3.4": "2.4",
}
- # map major release versions across releases
- # to a common major version
- openshift_major_release_version = {
- "1": "3",
- }
-
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have package requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
+ group_names = self.get_var("group_names", default=[])
master_or_node = 'masters' in group_names or 'nodes' in group_names
- return super(OvsVersion, cls).is_active(task_vars) and master_or_node
+ return super(OvsVersion, self).is_active() and master_or_node
- def run(self, tmp, task_vars):
+ def run(self):
args = {
"package_list": [
{
"name": "openvswitch",
- "version": self.get_required_ovs_version(task_vars),
+ "version": self.get_required_ovs_version(),
},
],
}
- return self.execute_module("rpm_version", args, task_vars=task_vars)
+ return self.execute_module("rpm_version", args)
- def get_required_ovs_version(self, task_vars):
+ def get_required_ovs_version(self):
"""Return the correct Open vSwitch version for the current OpenShift version"""
- openshift_version = self._get_openshift_version(task_vars)
+ openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag"))
- if float(openshift_version) < 3.5:
+ if openshift_version_tuple < (3, 5):
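+ # releases older than 3.5 all share the 3.4 requirement (OVS 2.4)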
return self.openshift_to_ovs_version["3.4"]
- ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+ openshift_version = ".".join(str(x) for x in openshift_version_tuple)
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
if ovs_version:
- return self.openshift_to_ovs_version[str(openshift_version)]
+ return self.openshift_to_ovs_version[openshift_version]
msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
raise OpenShiftCheckException(msg.format(openshift_version))
-
- def _get_openshift_version(self, task_vars):
- openshift_version = get_var(task_vars, "openshift_image_tag")
- if openshift_version and openshift_version[0] == 'v':
- openshift_version = openshift_version[1:]
-
- return self._parse_version(openshift_version)
-
- def _parse_version(self, version):
- components = version.split(".")
- if not components or len(components) < 2:
- msg = "An invalid version of OpenShift was found for this host: {}"
- raise OpenShiftCheckException(msg.format(version))
-
- if components[0] in self.openshift_major_release_version:
- components[0] = self.openshift_major_release_version[components[0]]
-
- return '.'.join(components[:2])
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index 0dd2b1286..a86180b00 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -1,5 +1,6 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that required RPM packages are available."""
+
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,13 +10,13 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
name = "package_availability"
tags = ["preflight"]
- @classmethod
- def is_active(cls, task_vars):
- return super(PackageAvailability, cls).is_active(task_vars) and task_vars["ansible_pkg_mgr"] == "yum"
+ def is_active(self):
+ """Run only when yum is the package manager as the code is specific to it."""
+ return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum"
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- group_names = get_var(task_vars, "group_names", default=[])
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ group_names = self.get_var("group_names", default=[])
packages = set()
@@ -25,10 +26,11 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args, tmp=tmp, task_vars=task_vars)
+ return self.execute_module("check_yum_update", args)
@staticmethod
def master_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a master install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix),
@@ -44,6 +46,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
@staticmethod
def node_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a node install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-node".format(rpm_prefix=rpm_prefix),
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index f432380c6..1e9aecbe0 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -1,14 +1,14 @@
-# pylint: disable=missing-docstring
+"""Check that a yum update would not run into conflicts with available packages."""
from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
- """Check that there are no conflicts in RPM packages."""
+ """Check that a yum update would not run into conflicts with available packages."""
name = "package_update"
tags = ["preflight"]
- def run(self, tmp, task_vars):
+ def run(self):
args = {"packages": []}
- return self.execute_module("check_yum_update", args, tmp=tmp, task_vars=task_vars)
+ return self.execute_module("check_yum_update", args)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 204752bd0..8b780114f 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,8 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+"""Check that available RPM packages match the required versions."""
+
+import re
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,48 +12,49 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
name = "package_version"
tags = ["preflight"]
+ # NOTE: versions before the earliest entry map to it; versions past the latest map to the latest
openshift_to_ovs_version = {
- "3.6": ["2.6", "2.7"],
- "3.5": ["2.6", "2.7"],
- "3.4": "2.4",
+ (3, 4): "2.4",
+ (3, 5): ["2.6", "2.7"],
+ (3, 6): ["2.6", "2.7"],
}
openshift_to_docker_version = {
- "3.1": "1.8",
- "3.2": "1.10",
- "3.3": "1.10",
- "3.4": "1.12",
+ (3, 1): "1.8",
+ (3, 2): "1.10",
+ (3, 3): "1.10",
+ (3, 4): "1.12",
+ (3, 5): "1.12",
+ (3, 6): "1.12",
}
- # map major release versions across releases
- # to a common major version
- openshift_major_release_version = {
- "1": "3",
+ # map equivalent major release versions (e.g. 1.x) onto a common major version (3.x)
+ map_major_release_version = {
+ 1: 3,
}
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have package requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
+ group_names = self.get_var("group_names", default=[])
master_or_node = 'masters' in group_names or 'nodes' in group_names
- return super(PackageVersion, cls).is_active(task_vars) and master_or_node
+ return super(PackageVersion, self).is_active() and master_or_node
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- openshift_release = get_var(task_vars, "openshift_release", default='')
- deployment_type = get_var(task_vars, "openshift_deployment_type")
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ openshift_release = self.get_var("openshift_release", default='')
+ deployment_type = self.get_var("openshift_deployment_type")
check_multi_minor_release = deployment_type in ['openshift-enterprise']
args = {
"package_list": [
{
"name": "openvswitch",
- "version": self.get_required_ovs_version(task_vars),
+ "version": self.get_required_ovs_version(),
"check_multi": False,
},
{
"name": "docker",
- "version": self.get_required_docker_version(task_vars),
+ "version": self.get_required_docker_version(),
"check_multi": False,
},
{
@@ -71,55 +75,52 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
],
}
- return self.execute_module("aos_version", args, tmp=tmp, task_vars=task_vars)
+ return self.execute_module("aos_version", args)
- def get_required_ovs_version(self, task_vars):
- """Return the correct Open vSwitch version for the current OpenShift version.
- If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6,
- Else ensure Open vSwitch version 2.4"""
- openshift_version = self.get_openshift_version(task_vars)
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
- if float(openshift_version) < 3.5:
- return self.openshift_to_ovs_version["3.4"]
+ earliest = min(self.openshift_to_ovs_version)
+ latest = max(self.openshift_to_ovs_version)
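+ # clamp versions outside the known range to the nearest known release rather than failing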
+ if openshift_version < earliest:
+ return self.openshift_to_ovs_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_ovs_version[latest]
- ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
- if ovs_version:
- return ovs_version
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
+ if not ovs_version:
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
- msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
- raise OpenShiftCheckException(msg.format(openshift_version))
+ return ovs_version
- def get_required_docker_version(self, task_vars):
- """Return the correct Docker version for the current OpenShift version.
- If the OpenShift version is 3.1, ensure Docker version 1.8.
- If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10.
- If the current OpenShift version is >= 3.4, ensure Docker version 1.12."""
- openshift_version = self.get_openshift_version(task_vars)
+ def get_required_docker_version(self):
+ """Return the correct Docker version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
- if float(openshift_version) >= 3.4:
- return self.openshift_to_docker_version["3.4"]
+ earliest = min(self.openshift_to_docker_version)
+ latest = max(self.openshift_to_docker_version)
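+ # as with OVS, clamp out-of-range versions to the nearest known release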
+ if openshift_version < earliest:
+ return self.openshift_to_docker_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_docker_version[latest]
- docker_version = self.openshift_to_docker_version.get(str(openshift_version))
- if docker_version:
- return docker_version
+ docker_version = self.openshift_to_docker_version.get(openshift_version)
+ if not docker_version:
+ msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
- msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
- raise OpenShiftCheckException(msg.format(openshift_version))
+ return docker_version
- def get_openshift_version(self, task_vars):
- openshift_version = get_var(task_vars, "openshift_image_tag")
- if openshift_version and openshift_version[0] == 'v':
- openshift_version = openshift_version[1:]
+ def get_openshift_version_tuple(self):
+ """Return received image tag as a normalized (X, Y) minor version tuple."""
+ version = self.get_var("openshift_image_tag")
+ comps = [int(component) for component in re.findall(r'\d+', version)]
- return self.parse_version(openshift_version)
-
- def parse_version(self, version):
- components = version.split(".")
- if not components or len(components) < 2:
+ if len(comps) < 2:
msg = "An invalid version of OpenShift was found for this host: {}"
raise OpenShiftCheckException(msg.format(version))
- if components[0] in self.openshift_major_release_version:
- components[0] = self.openshift_major_release_version[components[0]]
-
- return '.'.join(components[:2])
+ comps[0] = self.map_major_release_version.get(comps[0], comps[0])
+ return tuple(comps[0:2])
diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py
index 9383b233c..f5161d6f5 100644
--- a/roles/openshift_health_checker/test/action_plugin_test.py
+++ b/roles/openshift_health_checker/test/action_plugin_test.py
@@ -6,7 +6,7 @@ from openshift_health_check import ActionModule, resolve_checks
from openshift_checks import OpenShiftCheckException
-def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None):
+def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None, changed=False):
"""Returns a new class that is compatible with OpenShiftCheck for testing."""
_name, _tags = name, tags
@@ -14,15 +14,16 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru
class FakeCheck(object):
name = _name
tags = _tags or []
+ changed = False
- def __init__(self, execute_module=None):
+ def __init__(self, execute_module=None, task_vars=None, tmp=None):
pass
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
return is_active
- def run(self, tmp, task_vars):
+ def run(self):
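+ # mimic the real check API: run() records changes on self.changed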
+ self.changed = changed
if run_exception is not None:
raise run_exception
return run_return
@@ -124,7 +125,7 @@ def test_action_plugin_skip_disabled_checks(plugin, task_vars, monkeypatch):
def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch):
check_return_value = {'ok': 'test'}
check_class = fake_check(run_return=check_return_value)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
@@ -136,14 +137,15 @@ def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch):
def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch):
- check_return_value = {'ok': 'test', 'changed': True}
- check_class = fake_check(run_return=check_return_value)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ check_return_value = {'ok': 'test'}
+ check_class = fake_check(run_return=check_return_value, changed=True)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
assert result['checks']['fake_check'] == check_return_value
+ assert changed(result['checks']['fake_check'])
assert not failed(result)
assert changed(result)
assert not skipped(result)
@@ -152,7 +154,7 @@ def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch):
def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch):
check_return_value = {'failed': True}
check_class = fake_check(run_return=check_return_value)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
@@ -166,20 +168,21 @@ def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch):
def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):
exception_msg = 'fake check has an exception'
run_exception = OpenShiftCheckException(exception_msg)
- check_class = fake_check(run_exception=run_exception)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ check_class = fake_check(run_exception=run_exception, changed=True)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
assert failed(result['checks']['fake_check'], msg_has=exception_msg)
assert failed(result, msg_has=['failed'])
- assert not changed(result)
+ assert changed(result['checks']['fake_check'])
+ assert changed(result)
assert not skipped(result)
def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch):
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {})
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {})
result = plugin.run(tmp=None, task_vars=task_vars)
diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py
index ae108c96e..62c680b74 100644
--- a/roles/openshift_health_checker/test/curator_test.py
+++ b/roles/openshift_health_checker/test/curator_test.py
@@ -1,22 +1,6 @@
import pytest
-from openshift_checks.logging.curator import Curator
-
-
-def canned_curator(exec_oc=None):
- """Create a Curator check object with canned exec_oc method"""
- check = Curator("dummy") # fails if a module is actually invoked
- if exec_oc:
- check._exec_oc = exec_oc
- return check
-
-
-def assert_error(error, expect_error):
- if expect_error:
- assert error
- assert expect_error in error
- else:
- assert not error
+from openshift_checks.logging.curator import Curator, OpenShiftCheckException
plain_curator_pod = {
@@ -44,25 +28,30 @@ not_running_curator_pod = {
}
+def test_get_curator_pods():
+ check = Curator()
+ check.get_pods_for_component = lambda *_: [plain_curator_pod]
+ result = check.run()
+ assert "failed" not in result or not result["failed"]
+
+
@pytest.mark.parametrize('pods, expect_error', [
(
[],
- "no Curator pods",
- ),
- (
- [plain_curator_pod],
- None,
+ 'MissingComponentPods',
),
(
[not_running_curator_pod],
- "not currently in a running state",
+ 'CuratorNotRunning',
),
(
[plain_curator_pod, plain_curator_pod],
- "more than one Curator pod",
+ 'TooManyCurators',
),
])
-def test_get_curator_pods(pods, expect_error):
- check = canned_curator()
- error = check.check_curator(pods)
- assert_error(error, expect_error)
+def test_get_curator_pods_fail(pods, expect_error):
+ check = Curator()
+ check.get_pods_for_component = lambda *_: pods
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run()
+ assert excinfo.value.name == expect_error
diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py
index 945b9eafc..f4fd2dfed 100644
--- a/roles/openshift_health_checker/test/disk_availability_test.py
+++ b/roles/openshift_health_checker/test/disk_availability_test.py
@@ -17,26 +17,37 @@ def test_is_active(group_names, is_active):
task_vars = dict(
group_names=group_names,
)
- assert DiskAvailability.is_active(task_vars=task_vars) == is_active
+ assert DiskAvailability(None, task_vars).is_active() == is_active
-@pytest.mark.parametrize('ansible_mounts,extra_words', [
- ([], ['none']), # empty ansible_mounts
- ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
- ([{'mount': '/var'}], ['/var']), # missing size_available
+@pytest.mark.parametrize('desc, ansible_mounts, expect_chunks', [
+ (
+ 'empty ansible_mounts',
+ [],
+ ['determine mount point', 'none'],
+ ),
+ (
+ 'missing relevant mount paths',
+ [{'mount': '/mnt'}],
+ ['determine mount point', '/mnt'],
+ ),
+ (
+ 'missing size_available',
+ [{'mount': '/var'}, {'mount': '/usr'}, {'mount': '/tmp'}],
+ ['missing', 'size_available'],
+ ),
])
-def test_cannot_determine_available_disk(ansible_mounts, extra_words):
+def test_cannot_determine_available_disk(desc, ansible_mounts, expect_chunks):
task_vars = dict(
group_names=['masters'],
ansible_mounts=ansible_mounts,
)
- check = DiskAvailability(execute_module=fake_execute_module)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ DiskAvailability(fake_execute_module, task_vars).run()
- for word in 'determine disk availability'.split() + extra_words:
- assert word in str(excinfo.value)
+ for chunk in expect_chunks:
+ assert chunk in str(excinfo.value)
@pytest.mark.parametrize('group_names,configured_min,ansible_mounts', [
@@ -93,14 +104,14 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib
ansible_mounts=ansible_mounts,
)
- check = DiskAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = DiskAvailability(fake_execute_module, task_vars).run()
assert not result.get('failed', False)
-@pytest.mark.parametrize('group_names,configured_min,ansible_mounts,extra_words', [
+@pytest.mark.parametrize('name,group_names,configured_min,ansible_mounts,expect_chunks', [
(
+ 'test with no space available',
['masters'],
0,
[{
@@ -110,6 +121,7 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib
['0.0 GB'],
),
(
+ 'test with a higher configured required value',
['masters'],
100, # set a higher threshold
[{
@@ -119,6 +131,7 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib
['100.0 GB'],
),
(
+ 'test with 1GB available, but "0" GB space requirement',
['nodes'],
0,
[{
@@ -128,6 +141,7 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib
['1.0 GB'],
),
(
+ 'test with no space available, but "0" GB space requirement',
['etcd'],
0,
[{
@@ -137,16 +151,17 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib
['0.0 GB'],
),
(
+ 'test with enough space for a node, but not for a master',
['nodes', 'masters'],
0,
[{
'mount': '/',
- # enough space for a node, not enough for a master
'size_available': 15 * 10**9 + 1,
}],
['15.0 GB'],
),
(
+ 'test failure with enough space on "/", but not enough on "/var"',
['etcd'],
0,
[{
@@ -160,20 +175,73 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib
}],
['0.0 GB'],
),
-])
-def test_fails_with_insufficient_disk_space(group_names, configured_min, ansible_mounts, extra_words):
+], ids=lambda argval: argval[0])
+def test_fails_with_insufficient_disk_space(name, group_names, configured_min, ansible_mounts, expect_chunks):
task_vars = dict(
group_names=group_names,
openshift_check_min_host_disk_gb=configured_min,
ansible_mounts=ansible_mounts,
)
- check = DiskAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = DiskAvailability(fake_execute_module, task_vars).run()
assert result['failed']
- for word in 'below recommended'.split() + extra_words:
- assert word in result['msg']
+ for chunk in 'below recommended'.split() + expect_chunks:
+ assert chunk in result.get('msg', '')
+
+
+@pytest.mark.parametrize('name,group_names,context,ansible_mounts,failed,extra_words', [
+ (
+ 'test without enough space for master under "upgrade" context',
+ ['nodes', 'masters'],
+ "upgrade",
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9 + 1,
+ 'size_total': 21 * 10**9 + 1,
+ }],
+ True,
+ ["1.0 GB"],
+ ),
+ (
+ 'test with enough space for master under "upgrade" context',
+ ['nodes', 'masters'],
+ "upgrade",
+ [{
+ 'mount': '/',
+ 'size_available': 10 * 10**9 + 1,
+ 'size_total': 21 * 10**9 + 1,
+ }],
+ False,
+ [],
+ ),
+ (
+ 'test with not enough space for master, and non-upgrade context',
+ ['nodes', 'masters'],
+ "health",
+ [{
+ 'mount': '/',
+ # not enough space for a master,
+ # "health" context should not lower requirement
+ 'size_available': 20 * 10**9 + 1,
+ }],
+ True,
+ ["20.0 GB", "below minimum"],
+ ),
+], ids=lambda argval: argval[0])
+def test_min_required_space_changes_with_upgrade_context(name, group_names, context, ansible_mounts, failed, extra_words):
+ task_vars = dict(
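+ # the playbook context can lower the disk requirement (e.g. during upgrade)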
+ r_openshift_health_checker_playbook_context=context,
+ group_names=group_names,
+ ansible_mounts=ansible_mounts,
+ )
+
+ check = DiskAvailability(fake_execute_module, task_vars)
+ result = check.run()
+
+ assert result.get("failed", False) == failed
+ for word in extra_words:
+ assert word in result.get('msg', '')
def fake_execute_module(*args):
diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py
index 3b9e097fb..8d0a53df9 100644
--- a/roles/openshift_health_checker/test/docker_image_availability_test.py
+++ b/roles/openshift_health_checker/test/docker_image_availability_test.py
@@ -21,7 +21,7 @@ def test_is_active(deployment_type, is_containerized, group_names, expect_active
openshift_deployment_type=deployment_type,
group_names=group_names,
)
- assert DockerImageAvailability.is_active(task_vars=task_vars) == expect_active
+ assert DockerImageAvailability(None, task_vars).is_active() == expect_active
@pytest.mark.parametrize("is_containerized,is_atomic", [
@@ -31,7 +31,7 @@ def test_is_active(deployment_type, is_containerized, group_names, expect_active
(False, True),
])
def test_all_images_available_locally(is_containerized, is_atomic):
- def execute_module(module_name, module_args, task_vars):
+ def execute_module(module_name, module_args, *_):
if module_name == "yum":
return {"changed": True}
@@ -42,7 +42,7 @@ def test_all_images_available_locally(is_containerized, is_atomic):
'images': [module_args['name']],
}
- result = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ result = DockerImageAvailability(execute_module, task_vars=dict(
openshift=dict(
common=dict(
service_type='origin',
@@ -54,7 +54,7 @@ def test_all_images_available_locally(is_containerized, is_atomic):
openshift_deployment_type='origin',
openshift_image_tag='3.4',
group_names=['nodes', 'masters'],
- ))
+ )).run()
assert not result.get('failed', False)
@@ -64,12 +64,12 @@ def test_all_images_available_locally(is_containerized, is_atomic):
True,
])
def test_all_images_available_remotely(available_locally):
- def execute_module(module_name, module_args, task_vars):
+ def execute_module(module_name, *_):
if module_name == 'docker_image_facts':
return {'images': [], 'failed': available_locally}
return {'changed': False}
- result = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ result = DockerImageAvailability(execute_module, task_vars=dict(
openshift=dict(
common=dict(
service_type='origin',
@@ -81,13 +81,13 @@ def test_all_images_available_remotely(available_locally):
openshift_deployment_type='origin',
openshift_image_tag='v3.4',
group_names=['nodes', 'masters'],
- ))
+ )).run()
assert not result.get('failed', False)
def test_all_images_unavailable():
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, *_):
if module_name == "command":
return {
'failed': True,
@@ -97,8 +97,7 @@ def test_all_images_unavailable():
'changed': False,
}
- check = DockerImageAvailability(execute_module=execute_module)
- actual = check.run(tmp=None, task_vars=dict(
+ actual = DockerImageAvailability(execute_module, task_vars=dict(
openshift=dict(
common=dict(
service_type='origin',
@@ -110,7 +109,7 @@ def test_all_images_unavailable():
openshift_deployment_type="openshift-enterprise",
openshift_image_tag='latest',
group_names=['nodes', 'masters'],
- ))
+ )).run()
assert actual['failed']
assert "required Docker images are not available" in actual['msg']
@@ -127,7 +126,7 @@ def test_all_images_unavailable():
),
])
def test_skopeo_update_failure(message, extra_words):
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, *_):
if module_name == "yum":
return {
"failed": True,
@@ -137,7 +136,7 @@ def test_skopeo_update_failure(message, extra_words):
return {'changed': False}
- actual = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ actual = DockerImageAvailability(execute_module, task_vars=dict(
openshift=dict(
common=dict(
service_type='origin',
@@ -149,7 +148,7 @@ def test_skopeo_update_failure(message, extra_words):
openshift_deployment_type="openshift-enterprise",
openshift_image_tag='',
group_names=['nodes', 'masters'],
- ))
+ )).run()
assert actual["failed"]
for word in extra_words:
@@ -162,12 +161,12 @@ def test_skopeo_update_failure(message, extra_words):
("openshift-enterprise", []),
])
def test_registry_availability(deployment_type, registries):
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, *_):
return {
'changed': False,
}
- actual = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ actual = DockerImageAvailability(execute_module, task_vars=dict(
openshift=dict(
common=dict(
service_type='origin',
@@ -179,7 +178,7 @@ def test_registry_availability(deployment_type, registries):
openshift_deployment_type=deployment_type,
openshift_image_tag='',
group_names=['nodes', 'masters'],
- ))
+ )).run()
assert not actual.get("failed", False)
@@ -258,7 +257,7 @@ def test_required_images(deployment_type, is_containerized, groups, oreg_url, ex
openshift_image_tag='vtest',
)
- assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
+ assert expected == DockerImageAvailability("DUMMY", task_vars).required_images()
def test_containerized_etcd():
@@ -272,4 +271,4 @@ def test_containerized_etcd():
group_names=['etcd'],
)
expected = set(['registry.access.redhat.com/rhel7/etcd'])
- assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
+ assert expected == DockerImageAvailability("DUMMY", task_vars).required_images()
diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py
index bb25e3f66..e0dccc062 100644
--- a/roles/openshift_health_checker/test/docker_storage_test.py
+++ b/roles/openshift_health_checker/test/docker_storage_test.py
@@ -4,12 +4,6 @@ from openshift_checks import OpenShiftCheckException
from openshift_checks.docker_storage import DockerStorage
-def dummy_check(execute_module=None):
- def dummy_exec(self, status, task_vars):
- raise Exception("dummy executor called")
- return DockerStorage(execute_module=execute_module or dummy_exec)
-
-
@pytest.mark.parametrize('is_containerized, group_names, is_active', [
(False, ["masters", "etcd"], False),
(False, ["masters", "nodes"], True),
@@ -20,10 +14,11 @@ def test_is_active(is_containerized, group_names, is_active):
openshift=dict(common=dict(is_containerized=is_containerized)),
group_names=group_names,
)
- assert DockerStorage.is_active(task_vars=task_vars) == is_active
+ assert DockerStorage(None, task_vars).is_active() == is_active
-non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
+def non_atomic_task_vars():
+ return {"openshift": {"common": {"is_atomic": False}}}
@pytest.mark.parametrize('docker_info, failed, expect_msg', [
@@ -56,7 +51,7 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
(
dict(info={
"Driver": "overlay2",
- "DriverStatus": []
+ "DriverStatus": [("Backing Filesystem", "xfs")],
}),
False,
[],
@@ -64,6 +59,27 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
(
dict(info={
"Driver": "overlay",
+ "DriverStatus": [("Backing Filesystem", "btrfs")],
+ }),
+ True,
+ ["storage is type 'btrfs'", "only supported with\n'xfs'"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay2",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)",
+ "KernelVersion": "3.10.0-327.22.2.el7.x86_64",
+ }),
+ True,
+ ["Docker reports kernel version 3.10.0-327"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ "OperatingSystem": "CentOS",
+ "KernelVersion": "3.10.0-514",
}),
False,
[],
@@ -77,16 +93,17 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
),
])
def test_check_storage_driver(docker_info, failed, expect_msg):
- def execute_module(module_name, module_args, tmp=None, task_vars=None):
+ def execute_module(module_name, *_):
if module_name == "yum":
return {}
if module_name != "docker_info":
raise ValueError("not expecting module " + module_name)
return docker_info
- check = dummy_check(execute_module=execute_module)
- check._check_dm_usage = lambda status, task_vars: dict() # stub out for this test
- result = check.run(tmp=None, task_vars=non_atomic_task_vars)
+ check = DockerStorage(execute_module, non_atomic_task_vars())
+ check.check_dm_usage = lambda status: dict() # stub out for this test
+ check.check_overlay_usage = lambda info: dict() # stub out for this test
+ result = check.run()
if failed:
assert result["failed"]
@@ -145,9 +162,9 @@ not_enough_space = {
),
])
def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
- check = dummy_check()
- check._get_vg_free = lambda pool, task_vars: vg_free
- result = check._check_dm_usage(driver_status, task_vars)
+ check = DockerStorage(None, task_vars)
+ check.get_vg_free = lambda pool: vg_free
+ result = check.check_dm_usage(driver_status)
result_success = not result.get("failed")
assert result_success is success
@@ -187,18 +204,18 @@ def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
)
])
def test_vg_free(pool, command_returns, raises, returns):
- def execute_module(module_name, module_args, tmp=None, task_vars=None):
+ def execute_module(module_name, *_):
if module_name != "command":
raise ValueError("not expecting module " + module_name)
return command_returns
- check = dummy_check(execute_module=execute_module)
+ check = DockerStorage(execute_module)
if raises:
with pytest.raises(OpenShiftCheckException) as err:
- check._get_vg_free(pool, {})
+ check.get_vg_free(pool)
assert raises in str(err.value)
else:
- ret = check._get_vg_free(pool, {})
+ ret = check.get_vg_free(pool)
assert ret == returns
@@ -209,7 +226,7 @@ def test_vg_free(pool, command_returns, raises, returns):
("12g", 12.0 * 1024**3),
])
def test_convert_to_bytes(string, expect_bytes):
- got = DockerStorage._convert_to_bytes(string)
+ got = DockerStorage.convert_to_bytes(string)
assert got == expect_bytes
@@ -219,6 +236,70 @@ def test_convert_to_bytes(string, expect_bytes):
])
def test_convert_to_bytes_error(string):
with pytest.raises(ValueError) as err:
- DockerStorage._convert_to_bytes(string)
+ DockerStorage.convert_to_bytes(string)
assert "Cannot convert" in str(err.value)
assert string in str(err.value)
+
+
+ansible_mounts_enough = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 50 * 10**9,
+ 'size_total': 50 * 10**9,
+}]
+ansible_mounts_not_enough = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 0,
+ 'size_total': 50 * 10**9,
+}]
+ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')]
+ansible_mounts_zero_size = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 0,
+ 'size_total': 0,
+}]
+
+
+@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [
+ (
+ ansible_mounts_enough,
+ None,
+ False,
+ [],
+ ),
+ (
+ ansible_mounts_not_enough,
+ None,
+ True,
+ ["usage percentage", "higher than threshold"],
+ ),
+ (
+ ansible_mounts_not_enough,
+ "bogus percent",
+ True,
+ ["is not a percentage"],
+ ),
+ (
+ ansible_mounts_missing_fields,
+ None,
+ True,
+ ["Ansible bug"],
+ ),
+ (
+ ansible_mounts_zero_size,
+ None,
+ True,
+ ["Ansible bug"],
+ ),
+])
+def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg):
+ task_vars = non_atomic_task_vars()
+ task_vars["ansible_mounts"] = ansible_mounts
+ if threshold is not None:
+ task_vars["max_overlay_usage_percent"] = threshold
+ check = DockerStorage(None, task_vars)
+ docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay")
+ result = check.check_overlay_usage(docker_info)
+
+ assert expect_fail == bool(result.get("failed"))
+ for msg in expect_msg:
+ assert msg in result["msg"]
diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py
index b9d375d8c..09bacd9ac 100644
--- a/roles/openshift_health_checker/test/elasticsearch_test.py
+++ b/roles/openshift_health_checker/test/elasticsearch_test.py
@@ -1,25 +1,26 @@
import pytest
import json
-from openshift_checks.logging.elasticsearch import Elasticsearch
+from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList
+
task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
-def canned_elasticsearch(exec_oc=None):
- """Create an Elasticsearch check object with canned exec_oc method"""
- check = Elasticsearch("dummy") # fails if a module is actually invoked
+def canned_elasticsearch(task_vars=None, exec_oc=None):
+ """Create an Elasticsearch check object with stubbed exec_oc method"""
+ check = Elasticsearch(None, task_vars or {})
if exec_oc:
- check._exec_oc = exec_oc
+ check.exec_oc = exec_oc
return check
-def assert_error(error, expect_error):
- if expect_error:
- assert error
- assert expect_error in error
- else:
- assert not error
+def assert_error_in_list(expect_err, errorlist):
+ assert any(err.name == expect_err for err in errorlist), "expected {} in {}".format(expect_err, str(errorlist))
+
+
+def pods_by_name(pods):
+ return {pod['metadata']['name']: pod for pod in pods}
plain_es_pod = {
@@ -27,6 +28,7 @@ plain_es_pod = {
"labels": {"component": "es", "deploymentconfig": "logging-es"},
"name": "logging-es",
},
+ "spec": {},
"status": {
"conditions": [{"status": "True", "type": "Ready"}],
"containerStatuses": [{"ready": True}],
@@ -40,6 +42,7 @@ split_es_pod = {
"labels": {"component": "es", "deploymentconfig": "logging-es-2"},
"name": "logging-es-2",
},
+ "spec": {},
"status": {
"conditions": [{"status": "True", "type": "Ready"}],
"containerStatuses": [{"ready": True}],
@@ -48,12 +51,28 @@ split_es_pod = {
"_test_master_name_str": "name logging-es-2",
}
+unready_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es-3"},
+ "name": "logging-es-3",
+ },
+ "spec": {},
+ "status": {
+ "conditions": [{"status": "False", "type": "Ready"}],
+ "containerStatuses": [{"ready": False}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "BAD_NAME_RESPONSE",
+}
+
def test_check_elasticsearch():
- assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([], {})
+ with pytest.raises(OpenShiftCheckExceptionList) as excinfo:
+ canned_elasticsearch().check_elasticsearch([])
+ assert_error_in_list('NoRunningPods', excinfo.value)
# canned oc responses to match so all the checks pass
- def _exec_oc(cmd, args, task_vars):
+ def exec_oc(cmd, args):
if '_cat/master' in cmd:
return 'name logging-es'
elif '/_nodes' in cmd:
@@ -65,33 +84,41 @@ def test_check_elasticsearch():
else:
raise Exception(cmd)
- assert not canned_elasticsearch(_exec_oc).check_elasticsearch([plain_es_pod], {})
+ check = canned_elasticsearch({}, exec_oc)
+ check.get_pods_for_component = lambda *_: [plain_es_pod]
+ assert {} == check.run()
-def pods_by_name(pods):
- return {pod['metadata']['name']: pod for pod in pods}
+def test_check_running_es_pods():
+ pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod])
+ assert plain_es_pod in pods
+ assert_error_in_list('PodNotRunning', errors)
+
+
+def test_check_elasticsearch_masters():
+ pods = [plain_es_pod]
+ check = canned_elasticsearch(task_vars_config_base, lambda *_: plain_es_pod['_test_master_name_str'])
+ assert not check.check_elasticsearch_masters(pods_by_name(pods))
@pytest.mark.parametrize('pods, expect_error', [
(
[],
- 'No logging Elasticsearch masters',
+ 'NoMasterFound',
),
(
- [plain_es_pod],
- None,
+ [unready_es_pod],
+ 'NoMasterName',
),
(
[plain_es_pod, split_es_pod],
- 'Found multiple Elasticsearch masters',
+ 'SplitBrainMasters',
),
])
-def test_check_elasticsearch_masters(pods, expect_error):
+def test_check_elasticsearch_masters_error(pods, expect_error):
test_pods = list(pods)
- check = canned_elasticsearch(lambda cmd, args, task_vars: test_pods.pop(0)['_test_master_name_str'])
-
- errors = check._check_elasticsearch_masters(pods_by_name(pods), task_vars_config_base)
- assert_error(''.join(errors), expect_error)
+ check = canned_elasticsearch(task_vars_config_base, lambda *_: test_pods.pop(0)['_test_master_name_str'])
+ assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods)))
es_node_list = {
@@ -101,80 +128,76 @@ es_node_list = {
}}}
+def test_check_elasticsearch_node_list():
+ check = canned_elasticsearch(task_vars_config_base, lambda *_: json.dumps(es_node_list))
+ assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod]))
+
+
@pytest.mark.parametrize('pods, node_list, expect_error', [
(
[],
{},
- 'No logging Elasticsearch masters',
- ),
- (
- [plain_es_pod],
- es_node_list,
- None,
+ 'MissingComponentPods',
),
(
[plain_es_pod],
{}, # empty list of nodes triggers KeyError
- "Failed to query",
+ 'MissingNodeList',
),
(
[split_es_pod],
es_node_list,
- 'does not correspond to any known ES pod',
+ 'EsPodNodeMismatch',
),
])
-def test_check_elasticsearch_node_list(pods, node_list, expect_error):
- check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(node_list))
+def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error):
+ check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: json.dumps(node_list))
+ assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods)))
- errors = check._check_elasticsearch_node_list(pods_by_name(pods), task_vars_config_base)
- assert_error(''.join(errors), expect_error)
+
+def test_check_elasticsearch_cluster_health():
+ test_health_data = [{"status": "green"}]
+ check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0)))
+ assert not check.check_es_cluster_health(pods_by_name([plain_es_pod]))
@pytest.mark.parametrize('pods, health_data, expect_error', [
(
[plain_es_pod],
- [{"status": "green"}],
- None,
- ),
- (
- [plain_es_pod],
[{"no-status": "should bomb"}],
- 'Could not retrieve cluster health status',
+ 'BadEsResponse',
),
(
[plain_es_pod, split_es_pod],
[{"status": "green"}, {"status": "red"}],
- 'Elasticsearch cluster health status is RED',
+ 'EsClusterHealthRed',
),
])
-def test_check_elasticsearch_cluster_health(pods, health_data, expect_error):
+def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error):
test_health_data = list(health_data)
- check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(test_health_data.pop(0)))
+ check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0)))
+ assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods)))
- errors = check._check_es_cluster_health(pods_by_name(pods), task_vars_config_base)
- assert_error(''.join(errors), expect_error)
+
+def test_check_elasticsearch_diskspace():
+ check = canned_elasticsearch(exec_oc=lambda *_: 'IUse% Use%\n 3% 4%\n')
+ assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))
@pytest.mark.parametrize('disk_data, expect_error', [
(
'df: /elasticsearch/persistent: No such file or directory\n',
- 'Could not retrieve storage usage',
- ),
- (
- 'IUse% Use%\n 3% 4%\n',
- None,
+ 'BadDfResponse',
),
(
'IUse% Use%\n 95% 40%\n',
- 'Inode percent usage on the storage volume',
+ 'InodeUsageTooHigh',
),
(
'IUse% Use%\n 3% 94%\n',
- 'Disk percent usage on the storage volume',
+ 'DiskUsageTooHigh',
),
])
-def test_check_elasticsearch_diskspace(disk_data, expect_error):
- check = canned_elasticsearch(lambda cmd, args, task_vars: disk_data)
-
- errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod]), task_vars_config_base)
- assert_error(''.join(errors), expect_error)
+def test_check_elasticsearch_diskspace_errors(disk_data, expect_error):
+ check = canned_elasticsearch(exec_oc=lambda *_: disk_data)
+ assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod])))
diff --git a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
index df9d52d41..d3aae98f2 100644
--- a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
+++ b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
@@ -1,7 +1,8 @@
import pytest
from collections import namedtuple
-from openshift_checks.etcd_imagedata_size import EtcdImageDataSize, OpenShiftCheckException
+from openshift_checks.etcd_imagedata_size import EtcdImageDataSize
+from openshift_checks import OpenShiftCheckException
from etcdkeysize import check_etcd_key_size
@@ -51,12 +52,12 @@ def test_cannot_determine_available_mountpath(ansible_mounts, extra_words):
task_vars = dict(
ansible_mounts=ansible_mounts,
)
- check = EtcdImageDataSize(execute_module=fake_execute_module)
+ check = EtcdImageDataSize(fake_execute_module, task_vars)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ check.run()
- for word in 'determine valid etcd mountpath'.split() + extra_words:
+ for word in ['Unable to determine mount point'] + extra_words:
assert word in str(excinfo.value)
@@ -111,14 +112,14 @@ def test_cannot_determine_available_mountpath(ansible_mounts, extra_words):
)
])
def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size_limit, should_fail, extra_words):
- def execute_module(module_name, args, tmp=None, task_vars=None):
+ def execute_module(module_name, module_args, *_):
if module_name != "etcdkeysize":
return {
"changed": False,
}
client = fake_etcd_client(tree)
- s, limit_exceeded = check_etcd_key_size(client, tree["key"], args["size_limit_bytes"])
+ s, limit_exceeded = check_etcd_key_size(client, tree["key"], module_args["size_limit_bytes"])
return {"size_limit_exceeded": limit_exceeded}
@@ -133,7 +134,7 @@ def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size
if size_limit is None:
task_vars.pop("etcd_max_image_data_size_bytes")
- check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+ check = EtcdImageDataSize(execute_module, task_vars).run()
if should_fail:
assert check["failed"]
@@ -267,14 +268,14 @@ def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size
),
])
def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_path, expected_size, extra_words):
- def execute_module(module_name, args, tmp=None, task_vars=None):
+ def execute_module(module_name, module_args, *_):
if module_name != "etcdkeysize":
return {
"changed": False,
}
client = fake_etcd_client(tree)
- size, limit_exceeded = check_etcd_key_size(client, root_path, args["size_limit_bytes"])
+ size, limit_exceeded = check_etcd_key_size(client, root_path, module_args["size_limit_bytes"])
assert size == expected_size
return {
@@ -289,12 +290,12 @@ def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_
)
)
- check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+ check = EtcdImageDataSize(execute_module, task_vars).run()
assert not check.get("failed", False)
def test_etcdkeysize_module_failure():
- def execute_module(module_name, tmp=None, task_vars=None):
+ def execute_module(module_name, *_):
if module_name != "etcdkeysize":
return {
"changed": False,
@@ -317,7 +318,7 @@ def test_etcdkeysize_module_failure():
)
)
- check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+ check = EtcdImageDataSize(execute_module, task_vars).run()
assert check["failed"]
for word in "Failed to retrieve stats":
diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py
new file mode 100644
index 000000000..fae3e578d
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_traffic_test.py
@@ -0,0 +1,72 @@
+import pytest
+
+from openshift_checks.etcd_traffic import EtcdTraffic
+
+
+@pytest.mark.parametrize('group_names,version,is_active', [
+ (['masters'], "3.5", False),
+ (['masters'], "3.6", False),
+ (['nodes'], "3.4", False),
+ (['etcd'], "3.4", True),
+ (['etcd'], "1.5", True),
+ (['etcd'], "3.1", False),
+ (['masters', 'nodes'], "3.5", False),
+ (['masters', 'etcd'], "3.5", True),
+ ([], "3.4", False),
+])
+def test_is_active(group_names, version, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_image_tag=version,
+ )
+ assert EtcdTraffic(task_vars=task_vars).is_active() == is_active
+
+
+@pytest.mark.parametrize('group_names,matched,failed,extra_words', [
+ (["masters"], True, True, ["Higher than normal", "traffic"]),
+ (["masters", "etcd"], False, False, []),
+ (["etcd"], False, False, []),
+])
+def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words):
+ def execute_module(module_name, *_):
+ return {
+ "matched": matched,
+ "failed": failed,
+ }
+
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(
+ common=dict(service_type="origin", is_containerized=False),
+ )
+ )
+
+ result = EtcdTraffic(execute_module, task_vars).run()
+
+ for word in extra_words:
+ assert word in result.get("msg", "")
+
+ assert result.get("failed", False) == failed
+
+
+@pytest.mark.parametrize('is_containerized,expected_unit_value', [
+ (False, "etcd"),
+ (True, "etcd_container"),
+])
+def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value):
+ task_vars = dict(
+ openshift=dict(
+ common=dict(is_containerized=is_containerized),
+ )
+ )
+
+ def execute_module(module_name, args, *_):
+ assert module_name == "search_journalctl"
+ matchers = args["log_matchers"]
+
+ for matcher in matchers:
+ assert matcher["unit"] == expected_unit_value
+
+ return {"failed": False}
+
+ EtcdTraffic(execute_module, task_vars).run()
diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py
index 917045526..077cea3ea 100644
--- a/roles/openshift_health_checker/test/etcd_volume_test.py
+++ b/roles/openshift_health_checker/test/etcd_volume_test.py
@@ -1,6 +1,7 @@
import pytest
-from openshift_checks.etcd_volume import EtcdVolume, OpenShiftCheckException
+from openshift_checks.etcd_volume import EtcdVolume
+from openshift_checks import OpenShiftCheckException
@pytest.mark.parametrize('ansible_mounts,extra_words', [
@@ -11,12 +12,11 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
task_vars = dict(
ansible_mounts=ansible_mounts,
)
- check = EtcdVolume(execute_module=fake_execute_module)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ EtcdVolume(fake_execute_module, task_vars).run()
- for word in 'Unable to find etcd storage mount point'.split() + extra_words:
+ for word in ['Unable to determine mount point'] + extra_words:
assert word in str(excinfo.value)
@@ -76,8 +76,7 @@ def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts):
if task_vars["etcd_device_usage_threshold_percent"] is None:
task_vars.pop("etcd_device_usage_threshold_percent")
- check = EtcdVolume(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = EtcdVolume(fake_execute_module, task_vars).run()
assert not result.get('failed', False)
@@ -137,8 +136,7 @@ def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts,
if task_vars["etcd_device_usage_threshold_percent"] is None:
task_vars.pop("etcd_device_usage_threshold_percent")
- check = EtcdVolume(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = EtcdVolume(fake_execute_module, task_vars).run()
assert result['failed']
for word in extra_words:
diff --git a/roles/openshift_health_checker/test/fluentd_config_test.py b/roles/openshift_health_checker/test/fluentd_config_test.py
new file mode 100644
index 000000000..10db253bc
--- /dev/null
+++ b/roles/openshift_health_checker/test/fluentd_config_test.py
@@ -0,0 +1,348 @@
+import pytest
+
+from openshift_checks.logging.fluentd_config import FluentdConfig, OpenShiftCheckException
+
+
+def canned_fluentd_pod(containers):
+ return {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {
+ "host": "node1",
+ "nodeName": "node1",
+ "containers": containers,
+ },
+ "status": {
+ "phase": "Running",
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+ }
+
+
+fluentd_pod = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {
+ "host": "node1",
+ "nodeName": "node1",
+ "containers": [
+ {
+ "name": "container1",
+ "env": [
+ {
+ "name": "USE_JOURNAL",
+ "value": "true",
+ }
+ ],
+ }
+ ],
+ },
+ "status": {
+ "phase": "Running",
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+not_running_fluentd_pod = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-2",
+ },
+ "status": {
+ "phase": "Unknown",
+ "containerStatuses": [{"ready": True}, {"ready": False}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+
+@pytest.mark.parametrize('name, use_journald, logging_driver, extra_words', [
+ (
+ 'test success with use_journald=false, and docker config set to use "json-file"',
+ False,
+ "json-file",
+ [],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_non_master(name, use_journald, logging_driver, extra_words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["nodes", "etcd"],
+ openshift_logging_fluentd_use_journal=use_journald,
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ error = check.check_logging_config()
+
+ assert error is None
+
+
+@pytest.mark.parametrize('name, use_journald, logging_driver, words', [
+ (
+ 'test failure with use_journald=false, but docker config set to use "journald"',
+ False,
+ "journald",
+ ['json log files', 'has been set to use "journald"'],
+ ),
+ (
+ 'test failure with use_journald=false, but docker config set to use an "unsupported" driver',
+ False,
+ "unsupported",
+ ["json log files", 'has been set to use "unsupported"'],
+ ),
+ (
+ 'test failure with use_journald=true, but docker config set to use "json-file"',
+ True,
+ "json-file",
+ ['logs from "journald"', 'has been set to use "json-file"'],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_non_master_failed(name, use_journald, logging_driver, words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["nodes", "etcd"],
+ openshift_logging_fluentd_use_journal=use_journald,
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ error = check.check_logging_config()
+
+ assert error is not None
+ for word in words:
+ assert word in error
+
+
+@pytest.mark.parametrize('name, pods, logging_driver, extra_words', [
+ # use_journald returns false (not using journald), but check succeeds
+ # since docker is set to use json-file
+ (
+ 'test success with use_journald=false, and docker config set to use default driver "json-file"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "USE_JOURNAL",
+ "value": "false",
+ }],
+ },
+ ]
+ )],
+ "json-file",
+ [],
+ ),
+ (
+ 'test success with USE_JOURNAL env var missing and docker config set to use default driver "json-file"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "RANDOM",
+ "value": "value",
+ }],
+ },
+ ]
+ )],
+ "json-file",
+ [],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_master(name, pods, logging_driver, extra_words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["masters"],
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ check.get_pods_for_component = lambda _: pods
+ error = check.check_logging_config()
+
+ assert error is None
+
+
+@pytest.mark.parametrize('name, pods, logging_driver, words', [
+ (
+ 'test failure with use_journald=false, but docker config set to use "journald"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "USE_JOURNAL",
+ "value": "false",
+ }],
+ },
+ ]
+ )],
+ "journald",
+ ['json log files', 'has been set to use "journald"'],
+ ),
+ (
+ 'test failure with use_journald=true, but docker config set to use "json-file"',
+ [fluentd_pod],
+ "json-file",
+ ['logs from "journald"', 'has been set to use "json-file"'],
+ ),
+ (
+ 'test failure with use_journald=false, but docker set to use an "unsupported" driver',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "USE_JOURNAL",
+ "value": "false",
+ }],
+ },
+ ]
+ )],
+ "unsupported",
+ ["json log files", 'has been set to use "unsupported"'],
+ ),
+ (
+ 'test failure with USE_JOURNAL env var missing and docker config set to use "journald"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "RANDOM",
+ "value": "value",
+ }],
+ },
+ ]
+ )],
+ "journald",
+ ["configuration is set to", "json log files"],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_master_failed(name, pods, logging_driver, words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["masters"],
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ check.get_pods_for_component = lambda _: pods
+ error = check.check_logging_config()
+
+ assert error is not None
+ for word in words:
+ assert word in error
+
+
+@pytest.mark.parametrize('name, pods, response, logging_driver, extra_words', [
+ (
+ 'test OpenShiftCheckException with no running containers',
+ [canned_fluentd_pod([])],
+ {
+ "failed": True,
+ "result": "unexpected",
+ },
+ "json-file",
+ ['no running containers'],
+ ),
+ (
+ 'test OpenShiftCheckException with one container and no env vars set',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [],
+ },
+ ]
+ )],
+ {
+ "failed": True,
+ "result": "unexpected",
+ },
+ "json-file",
+ ['no environment variables'],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_master_fails_on_unscheduled_deployment(name, pods, response, logging_driver, extra_words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["masters"],
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.get_pods_for_component = lambda _: pods
+
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.check_logging_config()
+
+ assert error is not None
+ for word in extra_words:
+ assert word in str(error)
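Several FluentdConfig tests above both pass the two-argument stub to the constructor and assign it over the instance's execute_module attribute. The assignment shadows the bound method, so the stub is invoked directly with (module_name, args) regardless of how the base class forwards tmp and task_vars. A small stand-alone illustration (DummyCheck stands in for FluentdConfig):

    # Shadowing a bound method with a plain function on one instance.
    class DummyCheck(object):
        def execute_module(self, module_name, args, tmp=None, task_vars=None):
            raise RuntimeError("real module execution is unwanted in tests")

    check = DummyCheck()
    check.execute_module = lambda module_name, args: {"info": {"LoggingDriver": "json-file"}}
    assert check.execute_module("docker_info", {}) == {"info": {"LoggingDriver": "json-file"}}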
diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py
index d151c0b19..e7bf9818b 100644
--- a/roles/openshift_health_checker/test/fluentd_test.py
+++ b/roles/openshift_health_checker/test/fluentd_test.py
@@ -1,23 +1,11 @@
import pytest
import json
-from openshift_checks.logging.fluentd import Fluentd
+from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException
-def canned_fluentd(exec_oc=None):
- """Create a Fluentd check object with canned exec_oc method"""
- check = Fluentd("dummy") # fails if a module is actually invoked
- if exec_oc:
- check._exec_oc = exec_oc
- return check
-
-
-def assert_error(error, expect_error):
- if expect_error:
- assert error
- assert expect_error in error
- else:
- assert not error
+def assert_error_in_list(expect_err, errorlist):
+ assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist))
fluentd_pod_node1 = {
@@ -65,45 +53,60 @@ fluentd_node3_unlabeled = {
}
+def test_get_fluentd_pods():
+ check = Fluentd()
+ check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1]))
+ check.get_pods_for_component = lambda *_: [fluentd_pod_node1]
+ assert not check.run()
+
+
@pytest.mark.parametrize('pods, nodes, expect_error', [
(
[],
[],
- 'No nodes appear to be defined',
+ 'NoNodesDefined',
),
(
[],
[fluentd_node3_unlabeled],
- 'There are no nodes with the fluentd label',
+ 'NoNodesLabeled',
),
(
[],
[fluentd_node1, fluentd_node3_unlabeled],
- 'Fluentd will not aggregate logs from these nodes.',
+ 'NodesUnlabeled',
),
(
[],
[fluentd_node2],
- "nodes are supposed to have a Fluentd pod but do not",
+ 'MissingFluentdPod',
),
(
[fluentd_pod_node1, fluentd_pod_node1],
[fluentd_node1],
- 'more Fluentd pods running than nodes labeled',
+ 'TooManyFluentdPods',
),
(
[fluentd_pod_node2_down],
[fluentd_node2],
- "Fluentd pods are supposed to be running",
- ),
- (
- [fluentd_pod_node1],
- [fluentd_node1],
- None,
+ 'FluentdNotRunning',
),
])
-def test_get_fluentd_pods(pods, nodes, expect_error):
- check = canned_fluentd(lambda cmd, args, task_vars: json.dumps(dict(items=nodes)))
+def test_get_fluentd_pods_errors(pods, nodes, expect_error):
+ check = Fluentd()
+ check.exec_oc = lambda *_: json.dumps(dict(items=nodes))
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.check_fluentd(pods)
+ if isinstance(excinfo.value, OpenShiftCheckExceptionList):
+ assert_error_in_list(expect_error, excinfo.value)
+ else:
+ assert expect_error == excinfo.value.name
+
- error = check.check_fluentd(pods, {})
- assert_error(error, expect_error)
+def test_bad_oc_node_list():
+ check = Fluentd()
+ check.exec_oc = lambda *_: "this isn't even json"
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.get_nodes_by_name()
+ assert 'BadOcNodeList' == excinfo.value.name
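The rewritten fluentd tests match on an exception's name attribute instead of message substrings, and assert_error_in_list iterates an OpenShiftCheckExceptionList. A hedged sketch of the exception shape this implies; the class names and constructors below are assumptions, not the actual openshift_checks definitions:

    # Assumed shape of the named-exception pattern matched by .name above.
    class SketchCheckException(Exception):
        def __init__(self, name, msg=""):
            super(SketchCheckException, self).__init__(msg or name)
            self.name = name

    class SketchCheckExceptionList(SketchCheckException):
        def __init__(self, errors):
            super(SketchCheckExceptionList, self).__init__("ExceptionList", str(errors))
            self.errors = errors

        def __iter__(self):  # lets helpers like assert_error_in_list scan it
            return iter(self.errors)

    errs = SketchCheckExceptionList([SketchCheckException("NoNodesDefined")])
    assert any(err.name == "NoNodesDefined" for err in errs)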
diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py
index 40a5d19d8..04a5e89c4 100644
--- a/roles/openshift_health_checker/test/kibana_test.py
+++ b/roles/openshift_health_checker/test/kibana_test.py
@@ -8,23 +8,7 @@ except ImportError:
from urllib.error import HTTPError, URLError
import urllib.request as urllib2
-from openshift_checks.logging.kibana import Kibana
-
-
-def canned_kibana(exec_oc=None):
- """Create a Kibana check object with canned exec_oc method"""
- check = Kibana("dummy") # fails if a module is actually invoked
- if exec_oc:
- check._exec_oc = exec_oc
- return check
-
-
-def assert_error(error, expect_error):
- if expect_error:
- assert error
- assert expect_error in error
- else:
- assert not error
+from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException
plain_kibana_pod = {
@@ -49,39 +33,45 @@ not_running_kibana_pod = {
}
+def test_check_kibana():
+ # should run without exception:
+ Kibana().check_kibana([plain_kibana_pod])
+
+
@pytest.mark.parametrize('pods, expect_error', [
(
[],
- "There are no Kibana pods deployed",
- ),
- (
- [plain_kibana_pod],
- None,
+ "MissingComponentPods",
),
(
[not_running_kibana_pod],
- "No Kibana pod is in a running state",
+ "NoRunningPods",
),
(
[plain_kibana_pod, not_running_kibana_pod],
- "The following Kibana pods are not currently in a running state",
+ "PodNotRunning",
),
])
-def test_check_kibana(pods, expect_error):
- check = canned_kibana()
- error = check.check_kibana(pods)
- assert_error(error, expect_error)
+def test_check_kibana_error(pods, expect_error):
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ Kibana().check_kibana(pods)
+ assert expect_error == excinfo.value.name
-@pytest.mark.parametrize('route, expect_url, expect_error', [
+@pytest.mark.parametrize('comment, route, expect_error', [
(
+ "No route returned",
None,
- None,
- 'no_route_exists',
+ "no_route_exists",
),
- # test route with no ingress
(
+ "broken route response",
+ {"status": {}},
+ "get_route_failed",
+ ),
+ (
+ "route with no ingress",
{
"metadata": {
"labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
@@ -94,12 +84,11 @@ def test_check_kibana(pods, expect_error):
"host": "hostname",
}
},
- None,
- 'route_not_accepted',
+ "route_not_accepted",
),
- # test route with no host
(
+ "route with no host",
{
"metadata": {
"labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
@@ -112,12 +101,21 @@ def test_check_kibana(pods, expect_error):
},
"spec": {},
},
- None,
- 'route_missing_host',
+ "route_missing_host",
),
+])
+def test_get_kibana_url_error(comment, route, expect_error):
+ check = Kibana()
+ check.exec_oc = lambda *_: json.dumps(route) if route else ""
- # test route that looks fine
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check._get_kibana_url()
+ assert excinfo.value.name == expect_error
+
+
+@pytest.mark.parametrize('comment, route, expect_url', [
(
+ "test route that looks fine",
{
"metadata": {
"labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
@@ -133,61 +131,57 @@ def test_check_kibana(pods, expect_error):
},
},
"https://hostname/",
- None,
),
])
-def test_get_kibana_url(route, expect_url, expect_error):
- check = canned_kibana(lambda cmd, args, task_vars: json.dumps(route) if route else "")
-
- url, error = check._get_kibana_url({})
- if expect_url:
- assert url == expect_url
- else:
- assert not url
- if expect_error:
- assert error == expect_error
- else:
- assert not error
+def test_get_kibana_url(comment, route, expect_url):
+ check = Kibana()
+ check.exec_oc = lambda *_: json.dumps(route)
+ assert expect_url == check._get_kibana_url()
@pytest.mark.parametrize('exec_result, expect', [
(
'urlopen error [Errno 111] Connection refused',
- 'at least one router routing to it?',
+ 'FailedToConnectInternal',
),
(
'urlopen error [Errno -2] Name or service not known',
- 'DNS configured for the Kibana hostname?',
+ 'FailedToResolveInternal',
),
(
'Status code was not [302]: HTTP Error 500: Server error',
- 'did not return the correct status code',
+ 'WrongReturnCodeInternal',
),
(
'bork bork bork',
- 'bork bork bork', # should pass through
+ 'MiscRouteErrorInternal',
),
])
def test_verify_url_internal_failure(exec_result, expect):
- check = Kibana(execute_module=lambda module_name, args, tmp, task_vars: dict(failed=True, msg=exec_result))
- check._get_kibana_url = lambda task_vars: ('url', None)
+ check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result))
+ check._get_kibana_url = lambda: 'url'
- error = check._check_kibana_route({})
- assert_error(error, expect)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.check_kibana_route()
+ assert expect == excinfo.value.name
@pytest.mark.parametrize('lib_result, expect', [
(
- HTTPError('url', 500, "it broke", hdrs=None, fp=None),
- 'it broke',
+ HTTPError('url', 500, 'it broke', hdrs=None, fp=None),
+ 'MiscRouteError',
),
(
- URLError('it broke'),
- 'it broke',
+ URLError('urlopen error [Errno 111] Connection refused'),
+ 'FailedToConnect',
+ ),
+ (
+ URLError('urlopen error [Errno -2] Name or service not known'),
+ 'FailedToResolve',
),
(
302,
- 'returned the wrong error code',
+ 'WrongReturnCode',
),
(
200,
@@ -210,9 +204,41 @@ def test_verify_url_external_failure(lib_result, expect, monkeypatch):
raise lib_result
monkeypatch.setattr(urllib2, 'urlopen', urlopen)
- check = canned_kibana()
- check._get_kibana_url = lambda task_vars: ('url', None)
- check._verify_url_internal = lambda url, task_vars: None
+ check = Kibana()
+ check._get_kibana_url = lambda: 'url'
+ check._verify_url_internal = lambda url: None
+
+ if not expect:
+ check.check_kibana_route()
+ return
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.check_kibana_route()
+ assert expect == excinfo.value.name
+
+
+def test_verify_url_external_skip():
+ check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false"))
+ check._get_kibana_url = lambda: 'url'
+ check.check_kibana_route()
+
+
+# this is kind of silly but it adds coverage for the run() method...
+def test_run():
+ pods = ["foo"]
+ ran = dict(check_kibana=False, check_route=False)
+
+ def check_kibana(pod_list):
+ ran["check_kibana"] = True
+ assert pod_list == pods
+
+ def check_kibana_route():
+ ran["check_route"] = True
+
+ check = Kibana()
+ check.get_pods_for_component = lambda *_: pods
+ check.check_kibana = check_kibana
+ check.check_kibana_route = check_kibana_route
- error = check._check_kibana_route({})
- assert_error(error, expect)
+ check.run()
+ assert ran["check_kibana"] and ran["check_route"]
diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py
index 128b76b12..1a1c190f6 100644
--- a/roles/openshift_health_checker/test/logging_check_test.py
+++ b/roles/openshift_health_checker/test/logging_check_test.py
@@ -1,18 +1,14 @@
import pytest
import json
-from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc
task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
-logging_namespace = "logging"
-
-
-def canned_loggingcheck(exec_oc=None):
+def canned_loggingcheck(exec_oc=None, execute_module=None):
"""Create a LoggingCheck object with canned exec_oc method"""
- check = LoggingCheck("dummy") # fails if a module is actually invoked
- check.logging_namespace = 'logging'
+ check = LoggingCheck(execute_module)
if exec_oc:
check.exec_oc = exec_oc
return check
@@ -90,15 +86,15 @@ plain_curator_pod = {
("Permission denied", "Unexpected error using `oc`"),
])
def test_oc_failure(problem, expect):
- def execute_module(module_name, args, tmp, task_vars):
+ def execute_module(module_name, *_):
if module_name == "ocutil":
return dict(failed=True, result=problem)
return dict(changed=False)
- check = LoggingCheck({})
+ check = LoggingCheck(execute_module, task_vars_config_base)
- with pytest.raises(OpenShiftCheckException) as excinfo:
- check.exec_oc(execute_module, logging_namespace, 'get foo', [], task_vars=task_vars_config_base)
+ with pytest.raises(CouldNotUseOc) as excinfo:
+ check.exec_oc('get foo', [])
assert expect in str(excinfo)
@@ -121,30 +117,35 @@ def test_is_active(groups, logging_deployed, is_active):
openshift_hosted_logging_deploy=logging_deployed,
)
- assert LoggingCheck.is_active(task_vars=task_vars) == is_active
+ assert LoggingCheck(None, task_vars).is_active() == is_active
+
+
+@pytest.mark.parametrize('pod_output, expect_pods', [
+ (
+ json.dumps({'items': [plain_es_pod]}),
+ [plain_es_pod],
+ ),
+])
+def test_get_pods_for_component(pod_output, expect_pods):
+ check = canned_loggingcheck(lambda *_: pod_output)
+ pods = check.get_pods_for_component("es")
+ assert pods == expect_pods
-@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [
+@pytest.mark.parametrize('exec_oc_output, expect_error', [
(
'No resources found.',
- None,
- 'There are no pods in the logging namespace',
+ MissingComponentPods,
),
(
- json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}),
- [plain_es_pod],
- None,
+ '{"items": null}',
+ MissingComponentPods,
),
])
-def test_get_pods_for_component(pod_output, expect_pods, expect_error):
- check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: pod_output)
- pods, error = check.get_pods_for_component(
- lambda name, args, task_vars: {},
- logging_namespace,
- "es",
- {}
- )
- assert_error(error, expect_error)
+def test_get_pods_for_component_fail(exec_oc_output, expect_error):
+ check = canned_loggingcheck(lambda *_: exec_oc_output)
+ with pytest.raises(expect_error):
+ check.get_pods_for_component("es")
@pytest.mark.parametrize('name, pods, expected_pods', [
@@ -161,7 +162,7 @@ def test_get_pods_for_component(pod_output, expect_pods, expect_error):
], ids=lambda argvals: argvals[0])
def test_get_not_running_pods_no_container_status(name, pods, expected_pods):
- check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: '')
+ check = canned_loggingcheck(lambda *_: '')
result = check.not_running_pods(pods)
assert result == expected_pods
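This file's tests now raise dedicated exception subclasses (MissingComponentPods, CouldNotUseOc) instead of returning error strings, which lets pytest.raises select failures by type; since these are presumably subclasses of OpenShiftCheckException, raises would also match on the ancestor class. A quick illustration with placeholder classes:

    import pytest

    class BaseCheckError(Exception):        # placeholder for OpenShiftCheckException
        pass

    class CouldNotUseTool(BaseCheckError):  # placeholder for CouldNotUseOc
        pass

    def boom():
        raise CouldNotUseTool("Unexpected error using `oc`")

    with pytest.raises(CouldNotUseTool):    # matches the exact subclass
        boom()
    with pytest.raises(BaseCheckError):     # and any ancestor class
        boom()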
diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py
new file mode 100644
index 000000000..22566b295
--- /dev/null
+++ b/roles/openshift_health_checker/test/logging_index_time_test.py
@@ -0,0 +1,170 @@
+import json
+
+import pytest
+
+from openshift_checks.logging.logging_index_time import LoggingIndexTime, OpenShiftCheckException
+
+
+SAMPLE_UUID = "unique-test-uuid"
+
+
+def canned_loggingindextime(exec_oc=None):
+ """Create a check object with a canned exec_oc method"""
+ check = LoggingIndexTime() # fails if a module is actually invoked
+ if exec_oc:
+ check.exec_oc = exec_oc
+ return check
+
+
+plain_running_elasticsearch_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es-data-master"},
+ "name": "logging-es-data-master-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "phase": "Running",
+ }
+}
+plain_running_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "phase": "Running",
+ }
+}
+not_running_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-2",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": False}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "phase": "pending",
+ }
+}
+
+
+@pytest.mark.parametrize('pods, expect_pods', [
+ (
+ [not_running_kibana_pod],
+ [],
+ ),
+ (
+ [plain_running_kibana_pod],
+ [plain_running_kibana_pod],
+ ),
+ (
+ [],
+ [],
+ )
+])
+def test_check_running_pods(pods, expect_pods):
+ check = canned_loggingindextime()
+ pods = check.running_pods(pods)
+ assert pods == expect_pods
+
+
+def test_bad_config_param():
+ with pytest.raises(OpenShiftCheckException) as error:
+ LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run()
+ assert 'InvalidTimeout' == error.value.name
+
+
+def test_no_running_pods():
+ check = LoggingIndexTime()
+ check.get_pods_for_component = lambda *_: [not_running_kibana_pod]
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.run()
+ assert 'kibanaNoRunningPods' == error.value.name
+
+
+def test_with_running_pods():
+ check = LoggingIndexTime()
+ check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod]
+ check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID
+ check.wait_until_cmd_or_err = lambda *_: None
+ assert not check.run().get("failed")
+
+
+@pytest.mark.parametrize('name, json_response, uuid, timeout', [
+ (
+ 'valid count in response',
+ {
+ "count": 1,
+ },
+ SAMPLE_UUID,
+ 0.001,
+ ),
+], ids=lambda argval: argval[0])
+def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout):
+ check = canned_loggingindextime(lambda *_: json.dumps(json_response))
+ check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout)
+
+
+@pytest.mark.parametrize('name, json_response, timeout, expect_error', [
+ (
+ 'invalid json response',
+ {
+ "invalid_field": 1,
+ },
+ 0.001,
+ 'esInvalidResponse',
+ ),
+ (
+ 'empty response',
+ {},
+ 0.001,
+ 'esInvalidResponse',
+ ),
+ (
+ 'valid response but invalid match count',
+ {
+ "count": 0,
+ },
+ 0.005,
+ 'NoMatchFound',
+ )
+], ids=lambda argval: argval[0])
+def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error):
+ check = canned_loggingindextime(lambda *_: json.dumps(json_response))
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout)
+
+ assert expect_error == error.value.name
+
+
+def test_curl_kibana_with_uuid():
+ check = canned_loggingindextime(lambda *_: json.dumps({"statusCode": 404}))
+ check.generate_uuid = lambda: SAMPLE_UUID
+ assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod)
+
+
+@pytest.mark.parametrize('name, json_response, expect_error', [
+ (
+ 'invalid json response',
+ {
+ "invalid_field": "invalid",
+ },
+ 'kibanaInvalidResponse',
+ ),
+ (
+ 'wrong error code in response',
+ {
+ "statusCode": 500,
+ },
+ 'kibanaInvalidReturnCode',
+ ),
+], ids=lambda argval: argval[0])
+def test_failed_curl_kibana_with_uuid(name, json_response, expect_error):
+ check = canned_loggingindextime(lambda *_: json.dumps(json_response))
+ check.generate_uuid = lambda: SAMPLE_UUID
+
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.curl_kibana_with_uuid(plain_running_kibana_pod)
+
+ assert expect_error == error.value.name
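The tiny timeouts in these tests (0.001 and 0.005 seconds) only make sense if wait_until_cmd_or_err polls Elasticsearch until the document count is nonzero or the deadline passes. A sketch of that presumed loop; the function and exception names here are placeholders:

    import time

    # Presumed polling shape behind wait_until_cmd_or_err; placeholder names.
    def wait_for_uuid(query_count, uuid, timeout_secs):
        deadline = time.time() + timeout_secs
        while True:
            response = query_count(uuid)             # canned as a dict in tests
            if "count" not in response:
                raise ValueError("esInvalidResponse")  # stands in for the named exception
            if response["count"] > 0:
                return
            if time.time() >= deadline:
                raise ValueError("NoMatchFound")

    wait_for_uuid(lambda _: {"count": 1}, "unique-test-uuid", 0.001)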
diff --git a/roles/openshift_health_checker/test/memory_availability_test.py b/roles/openshift_health_checker/test/memory_availability_test.py
index 4fbaea0a9..aee2f0416 100644
--- a/roles/openshift_health_checker/test/memory_availability_test.py
+++ b/roles/openshift_health_checker/test/memory_availability_test.py
@@ -17,7 +17,7 @@ def test_is_active(group_names, is_active):
task_vars = dict(
group_names=group_names,
)
- assert MemoryAvailability.is_active(task_vars=task_vars) == is_active
+ assert MemoryAvailability(None, task_vars).is_active() == is_active
@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb', [
@@ -59,8 +59,7 @@ def test_succeeds_with_recommended_memory(group_names, configured_min, ansible_m
ansible_memtotal_mb=ansible_memtotal_mb,
)
- check = MemoryAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = MemoryAvailability(fake_execute_module, task_vars).run()
assert not result.get('failed', False)
@@ -117,8 +116,7 @@ def test_fails_with_insufficient_memory(group_names, configured_min, ansible_mem
ansible_memtotal_mb=ansible_memtotal_mb,
)
- check = MemoryAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = MemoryAvailability(fake_execute_module, task_vars).run()
assert result.get('failed', False)
for word in 'below recommended'.split() + extra_words:
diff --git a/roles/openshift_health_checker/test/mixins_test.py b/roles/openshift_health_checker/test/mixins_test.py
index 2d83e207d..b1a41ca3c 100644
--- a/roles/openshift_health_checker/test/mixins_test.py
+++ b/roles/openshift_health_checker/test/mixins_test.py
@@ -14,10 +14,10 @@ class NotContainerizedCheck(NotContainerizedMixin, OpenShiftCheck):
(dict(openshift=dict(common=dict(is_containerized=True))), False),
])
def test_is_active(task_vars, expected):
- assert NotContainerizedCheck.is_active(task_vars) == expected
+ assert NotContainerizedCheck(None, task_vars).is_active() == expected
def test_is_active_missing_task_vars():
with pytest.raises(OpenShiftCheckException) as excinfo:
- NotContainerizedCheck.is_active(task_vars={})
+ NotContainerizedCheck().is_active()
assert 'is_containerized' in str(excinfo.value)
diff --git a/roles/openshift_health_checker/test/openshift_check_test.py b/roles/openshift_health_checker/test/openshift_check_test.py
index e3153979c..789784c77 100644
--- a/roles/openshift_health_checker/test/openshift_check_test.py
+++ b/roles/openshift_health_checker/test/openshift_check_test.py
@@ -1,7 +1,7 @@
import pytest
from openshift_checks import OpenShiftCheck, OpenShiftCheckException
-from openshift_checks import load_checks, get_var
+from openshift_checks import load_checks
# Fixtures
@@ -28,34 +28,23 @@ def test_OpenShiftCheck_init():
name = "test_check"
run = NotImplemented
- # initialization requires at least one argument (apart from self)
- with pytest.raises(TypeError) as excinfo:
- TestCheck()
+ # execute_module required at init if it will be used
+ with pytest.raises(RuntimeError) as excinfo:
+ TestCheck().execute_module("foo")
assert 'execute_module' in str(excinfo.value)
- assert 'module_executor' in str(excinfo.value)
execute_module = object()
# initialize with positional argument
check = TestCheck(execute_module)
- # new recommended name
- assert check.execute_module == execute_module
- # deprecated attribute name
- assert check.module_executor == execute_module
+ assert check._execute_module == execute_module
- # initialize with keyword argument, recommended name
+ # initialize with keyword argument
check = TestCheck(execute_module=execute_module)
- # new recommended name
- assert check.execute_module == execute_module
- # deprecated attribute name
- assert check.module_executor == execute_module
+ assert check._execute_module == execute_module
- # initialize with keyword argument, deprecated name
- check = TestCheck(module_executor=execute_module)
- # new recommended name
- assert check.execute_module == execute_module
- # deprecated attribute name
- assert check.module_executor == execute_module
+ assert check.task_vars == {}
+ assert check.tmp is None
def test_subclasses():
@@ -81,19 +70,49 @@ def test_load_checks():
assert modules
+def dummy_check(task_vars):
+ class TestCheck(OpenShiftCheck):
+ name = "dummy"
+ run = NotImplemented
+
+ return TestCheck(task_vars=task_vars)
+
+
@pytest.mark.parametrize("keys,expected", [
(("foo",), 42),
(("bar", "baz"), "openshift"),
+ (("bar.baz",), "openshift"),
])
def test_get_var_ok(task_vars, keys, expected):
- assert get_var(task_vars, *keys) == expected
+ assert dummy_check(task_vars).get_var(*keys) == expected
def test_get_var_error(task_vars, missing_keys):
with pytest.raises(OpenShiftCheckException):
- get_var(task_vars, *missing_keys)
+ dummy_check(task_vars).get_var(*missing_keys)
def test_get_var_default(task_vars, missing_keys):
default = object()
- assert get_var(task_vars, *missing_keys, default=default) == default
+ assert dummy_check(task_vars).get_var(*missing_keys, default=default) == default
+
+
+@pytest.mark.parametrize("keys, convert, expected", [
+ (("foo",), str, "42"),
+ (("foo",), float, 42.0),
+ (("bar", "baz"), bool, False),
+])
+def test_get_var_convert(task_vars, keys, convert, expected):
+ assert dummy_check(task_vars).get_var(*keys, convert=convert) == expected
+
+
+@pytest.mark.parametrize("keys, convert", [
+ (("bar", "baz"), int),
+ (("bar.baz"), float),
+ (("foo"), "bogus"),
+ (("foo"), lambda a, b: 1),
+ (("foo"), lambda a: 1 / 0),
+])
+def test_get_var_convert_error(task_vars, keys, convert):
+ with pytest.raises(OpenShiftCheckException):
+ dummy_check(task_vars).get_var(*keys, convert=convert)
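The new parametrized cases show get_var accepting dotted keys ("bar.baz" resolves like ("bar", "baz")) and an optional convert callable whose failures surface as OpenShiftCheckException. A sketch of those semantics under that assumption; get_var_sketch and LookupError stand in for the real implementation:

    # Assumed lookup semantics: nested keys, dotted-key splitting, optional
    # convert, with any failure wrapped in a check exception.
    def get_var_sketch(task_vars, *keys, **kwargs):
        value = task_vars
        try:
            for key in keys:
                for part in key.split("."):
                    value = value[part]
            convert = kwargs.get("convert")
            return convert(value) if convert is not None else value
        except Exception as exc:
            raise LookupError(str(exc))  # stands in for OpenShiftCheckException

    task_vars = dict(foo=42, bar=dict(baz="openshift"))
    assert get_var_sketch(task_vars, "bar.baz") == "openshift"
    assert get_var_sketch(task_vars, "foo", convert=str) == "42"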
diff --git a/roles/openshift_health_checker/test/ovs_version_test.py b/roles/openshift_health_checker/test/ovs_version_test.py
index 6494e1c06..e1bf29d2a 100644
--- a/roles/openshift_health_checker/test/ovs_version_test.py
+++ b/roles/openshift_health_checker/test/ovs_version_test.py
@@ -4,7 +4,7 @@ from openshift_checks.ovs_version import OvsVersion, OpenShiftCheckException
def test_openshift_version_not_supported():
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(*_):
return {}
openshift_release = '111.7.0'
@@ -16,15 +16,14 @@ def test_openshift_version_not_supported():
openshift_deployment_type='origin',
)
- check = OvsVersion(execute_module=execute_module)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ OvsVersion(execute_module, task_vars).run()
assert "no recommended version of Open vSwitch" in str(excinfo.value)
def test_invalid_openshift_release_format():
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(*_):
return {}
task_vars = dict(
@@ -33,15 +32,14 @@ def test_invalid_openshift_release_format():
openshift_deployment_type='origin',
)
- check = OvsVersion(execute_module=execute_module)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ OvsVersion(execute_module, task_vars).run()
assert "invalid version" in str(excinfo.value)
@pytest.mark.parametrize('openshift_release,expected_ovs_version', [
- ("3.5", "2.6"),
- ("3.6", "2.6"),
+ ("3.5", ["2.6", "2.7"]),
+ ("3.6", ["2.6", "2.7"]),
("3.4", "2.4"),
("3.3", "2.4"),
("1.0", "2.4"),
@@ -54,7 +52,7 @@ def test_ovs_package_version(openshift_release, expected_ovs_version):
)
return_value = object()
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, *_):
assert module_name == 'rpm_version'
assert "package_list" in module_args
@@ -64,8 +62,7 @@ def test_ovs_package_version(openshift_release, expected_ovs_version):
return return_value
- check = OvsVersion(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = OvsVersion(execute_module, task_vars).run()
assert result is return_value
@@ -86,4 +83,4 @@ def test_ovs_version_skip_when_not_master_nor_node(group_names, is_containerized
group_names=group_names,
openshift=dict(common=dict(is_containerized=is_containerized)),
)
- assert OvsVersion.is_active(task_vars=task_vars) == is_active
+ assert OvsVersion(None, task_vars).is_active() == is_active
diff --git a/roles/openshift_health_checker/test/package_availability_test.py b/roles/openshift_health_checker/test/package_availability_test.py
index f7e916a46..1fe648b75 100644
--- a/roles/openshift_health_checker/test/package_availability_test.py
+++ b/roles/openshift_health_checker/test/package_availability_test.py
@@ -14,7 +14,7 @@ def test_is_active(pkg_mgr, is_containerized, is_active):
ansible_pkg_mgr=pkg_mgr,
openshift=dict(common=dict(is_containerized=is_containerized)),
)
- assert PackageAvailability.is_active(task_vars=task_vars) == is_active
+ assert PackageAvailability(None, task_vars).is_active() == is_active
@pytest.mark.parametrize('task_vars,must_have_packages,must_not_have_packages', [
@@ -51,13 +51,12 @@ def test_is_active(pkg_mgr, is_containerized, is_active):
def test_package_availability(task_vars, must_have_packages, must_not_have_packages):
return_value = object()
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, *_):
assert module_name == 'check_yum_update'
assert 'packages' in module_args
assert set(module_args['packages']).issuperset(must_have_packages)
assert not set(module_args['packages']).intersection(must_not_have_packages)
return return_value
- check = PackageAvailability(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = PackageAvailability(execute_module, task_vars).run()
assert result is return_value
diff --git a/roles/openshift_health_checker/test/package_update_test.py b/roles/openshift_health_checker/test/package_update_test.py
index 5e000cff5..06489b0d7 100644
--- a/roles/openshift_health_checker/test/package_update_test.py
+++ b/roles/openshift_health_checker/test/package_update_test.py
@@ -4,13 +4,12 @@ from openshift_checks.package_update import PackageUpdate
def test_package_update():
return_value = object()
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, *_):
assert module_name == 'check_yum_update'
assert 'packages' in module_args
# empty list of packages means "generic check if 'yum update' will work"
assert module_args['packages'] == []
return return_value
- check = PackageUpdate(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=None)
+ result = PackageUpdate(execute_module).run()
assert result is return_value
diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py
index 1bb6371ae..6054d3f3e 100644
--- a/roles/openshift_health_checker/test/package_version_test.py
+++ b/roles/openshift_health_checker/test/package_version_test.py
@@ -3,61 +3,56 @@ import pytest
from openshift_checks.package_version import PackageVersion, OpenShiftCheckException
-@pytest.mark.parametrize('openshift_release, extra_words', [
- ('111.7.0', ["no recommended version of Open vSwitch"]),
- ('0.0.0', ["no recommended version of Docker"]),
-])
-def test_openshift_version_not_supported(openshift_release, extra_words):
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
- return {}
-
- task_vars = dict(
- openshift=dict(common=dict(service_type='origin')),
+def task_vars_for(openshift_release, deployment_type):
+ return dict(
+ openshift=dict(common=dict(service_type=deployment_type)),
openshift_release=openshift_release,
openshift_image_tag='v' + openshift_release,
- openshift_deployment_type='origin',
+ openshift_deployment_type=deployment_type,
)
- check = PackageVersion(execute_module=execute_module)
+
+def test_openshift_version_not_supported():
+ check = PackageVersion(None, task_vars_for("1.2.3", 'origin'))
+ check.get_openshift_version_tuple = lambda: (3, 4, 1) # won't be in the dict
+
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ check.get_required_ovs_version()
+ assert "no recommended version of Open vSwitch" in str(excinfo.value)
- for word in extra_words:
- assert word in str(excinfo.value)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.get_required_docker_version()
+ assert "no recommended version of Docker" in str(excinfo.value)
def test_invalid_openshift_release_format():
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
- return {}
-
task_vars = dict(
openshift=dict(common=dict(service_type='origin')),
openshift_image_tag='v0',
openshift_deployment_type='origin',
)
- check = PackageVersion(execute_module=execute_module)
+ check = PackageVersion(lambda *_: {}, task_vars)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ check.run()
assert "invalid version" in str(excinfo.value)
@pytest.mark.parametrize('openshift_release', [
- "3.5",
+ "111.7.0",
+ "3.7",
"3.6",
+ "3.5.1.2.3",
+ "3.5",
"3.4",
"3.3",
+ "2.1.0",
])
def test_package_version(openshift_release):
- task_vars = dict(
- openshift=dict(common=dict(service_type='origin')),
- openshift_release=openshift_release,
- openshift_image_tag='v' + openshift_release,
- openshift_deployment_type='origin',
- )
+
return_value = object()
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *_):
assert module_name == 'aos_version'
assert "package_list" in module_args
@@ -67,29 +62,24 @@ def test_package_version(openshift_release):
return return_value
- check = PackageVersion(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ check = PackageVersion(execute_module, task_vars_for(openshift_release, 'origin'))
+ result = check.run()
assert result is return_value
@pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [
("origin", "3.5", "1.12"),
+ ("origin", "1.3", "1.10"),
+ ("origin", "1.1", "1.8"),
("openshift-enterprise", "3.4", "1.12"),
- ("origin", "3.3", "1.10"),
("openshift-enterprise", "3.2", "1.10"),
- ("origin", "3.1", "1.8"),
("openshift-enterprise", "3.1", "1.8"),
])
def test_docker_package_version(deployment_type, openshift_release, expected_docker_version):
- task_vars = dict(
- openshift=dict(common=dict(service_type='origin')),
- openshift_release=openshift_release,
- openshift_image_tag='v' + openshift_release,
- openshift_deployment_type=deployment_type,
- )
+
return_value = object()
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, *_):
assert module_name == 'aos_version'
assert "package_list" in module_args
@@ -99,8 +89,8 @@ def test_docker_package_version(deployment_type, openshift_release, expected_doc
return return_value
- check = PackageVersion(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ check = PackageVersion(execute_module, task_vars_for(openshift_release, deployment_type))
+ result = check.run()
assert result is return_value
@@ -121,4 +111,4 @@ def test_package_version_skip_when_not_master_nor_node(group_names, is_container
group_names=group_names,
openshift=dict(common=dict(is_containerized=is_containerized)),
)
- assert PackageVersion.is_active(task_vars=task_vars) == is_active
+ assert PackageVersion(None, task_vars).is_active() == is_active
diff --git a/roles/openshift_health_checker/test/rpm_version_test.py b/roles/openshift_health_checker/test/rpm_version_test.py
index 2f09ef965..2c1bcf876 100644
--- a/roles/openshift_health_checker/test/rpm_version_test.py
+++ b/roles/openshift_health_checker/test/rpm_version_test.py
@@ -49,7 +49,7 @@ def test_check_pkg_found(pkgs, expect_not_found):
},
{
"eggs": {
- "required_version": "3.2",
+ "required_versions": ["3.2"],
"found_versions": ["3.3"],
}
}, # not the right version
@@ -61,11 +61,11 @@ def test_check_pkg_found(pkgs, expect_not_found):
},
{
"eggs": {
- "required_version": "3.2",
+ "required_versions": ["3.2"],
"found_versions": ["3.3", "1.2"],
},
"spam": {
- "required_version": "3.2",
+ "required_versions": ["3.2"],
"found_versions": ["3.1", "3.3"],
}
}, # not the right version
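The fixture rename from required_version to required_versions (now a list) parallels the ovs_version change above, where one OpenShift release can accept several Open vSwitch versions. The matching rule sketched below is an assumption for illustration: a requirement is satisfied if any required version equals, or is a major.minor prefix of, a found version:

    # Assumed version-matching rule behind the required_versions fixtures.
    def version_satisfied(required_versions, found_versions):
        return any(
            found == req or found.startswith(req + ".")
            for req in required_versions
            for found in found_versions
        )

    assert version_satisfied(["3.2"], ["3.2.1"])
    assert not version_satisfied(["3.2"], ["3.3", "1.2"])  # "not the right version"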
diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py
new file mode 100644
index 000000000..724928aa1
--- /dev/null
+++ b/roles/openshift_health_checker/test/search_journalctl_test.py
@@ -0,0 +1,157 @@
+import pytest
+import search_journalctl
+
+
+def canned_search_journalctl(get_log_output=None):
+ """Create a search_journalctl object with canned get_log_output method"""
+ module = search_journalctl
+ if get_log_output:
+ module.get_log_output = get_log_output
+ return module
+
+
+DEFAULT_TIMESTAMP = 1496341364
+
+
+def get_timestamp(modifier=0):
+ return DEFAULT_TIMESTAMP + modifier
+
+
+def get_timestamp_microseconds(modifier=0):
+ return get_timestamp(modifier) * 1000000
+
+
+def create_test_log_object(stamp, msg):
+ return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg)
+
+
+@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [
+ (
+ 'test with valid params',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test log message",
+ "unit": "test",
+ },
+ ],
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ ["test log message"],
+ [],
+ ),
+ (
+ 'test with invalid json in log input',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test log message",
+ "unit": "test-unit",
+ },
+ ],
+ [
+ '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}',
+ ],
+ [],
+ [
+ ["invalid json", "test-unit", "test log message"],
+ ],
+ ),
+ (
+ 'test with invalid regexp',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test [ log message",
+ "unit": "test",
+ },
+ ],
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ [],
+ [
+ ["invalid regular expression"],
+ ],
+ ),
+], ids=lambda argval: argval[0])
+def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors):
+ def get_log_output(matcher):
+ return log_input
+
+ module = canned_search_journalctl(get_log_output)
+ matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60)
+
+ assert set(matched_regexp) == set(expected_matches)
+ assert len(expected_errors) == len(errors)
+
+ for idx, partial_err_set in enumerate(expected_errors):
+ for partial_err_msg in partial_err_set:
+ assert partial_err_msg in errors[idx]
+
+
+@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [
+ (
+ 'test with matching log message, but out of bounds of log_count_lim',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 3,
+ get_timestamp(-100 * 60 * 60),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ None,
+ ),
+ (
+ 'test with matching log message, but with timestamp too old',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 100,
+ get_timestamp(-10),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+ ],
+ None,
+ ),
+ (
+ 'test with matching log message, and timestamp within time limit',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 100,
+ get_timestamp(-1010),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+ ],
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ ),
+], ids=lambda argval: argval[0])
+def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match):
+ match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds)
+ assert match == expected_match
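The helpers in this new test file encode __REALTIME_TIMESTAMP in microseconds (get_timestamp_microseconds multiplies an epoch value by 1,000,000), matching journalctl's JSON export format, while the limits passed to get_log_matches and find_matches are a log count and a span in seconds. A quick check of the unit handling:

    import json

    DEFAULT_TIMESTAMP = 1496341364  # epoch seconds, as in the tests above

    entry = json.loads(
        '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "test log message"}}'.format(
            DEFAULT_TIMESTAMP * 1000000
        )
    )
    # journald exports microseconds; integer division recovers epoch seconds
    assert int(entry["__REALTIME_TIMESTAMP"]) // 10**6 == DEFAULT_TIMESTAMP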