diff options
Diffstat (limited to 'roles/openshift_health_checker')
52 files changed, 3575 insertions, 1325 deletions
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index a62e4331e..05e53333d 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -13,6 +13,7 @@ except ImportError: display = Display() from ansible.plugins.action import ActionBase +from ansible.module_utils.six import string_types # Augment sys.path so that we can import checks from a directory relative to # this callback plugin. @@ -37,49 +38,48 @@ class ActionModule(ActionBase): return result try: - known_checks = self.load_known_checks() + known_checks = self.load_known_checks(tmp, task_vars) + args = self._task.args + requested_checks = normalize(args.get('checks', [])) + resolved_checks = resolve_checks(requested_checks, known_checks.values()) except OpenShiftCheckException as e: result["failed"] = True result["msg"] = str(e) return result - args = self._task.args - resolved_checks = resolve_checks(args.get("checks", []), known_checks.values()) - result["checks"] = check_results = {} - user_disabled_checks = [ - check.strip() - for check in task_vars.get("openshift_disable_check", "").split(",") - ] + user_disabled_checks = normalize(task_vars.get('openshift_disable_check', [])) for check_name in resolved_checks: display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"])) check = known_checks[check_name] - if not check.is_active(task_vars): + if not check.is_active(): r = dict(skipped=True, skipped_reason="Not active for this host") elif check_name in user_disabled_checks: r = dict(skipped=True, skipped_reason="Disabled by user request") else: try: - r = check.run(tmp, task_vars) + r = check.run() except OpenShiftCheckException as e: r = dict( failed=True, msg=str(e), ) + if check.changed: + r["changed"] = True check_results[check_name] = r - if r.get("failed", False): - result["failed"] = True - result["msg"] = "One or more checks failed" + result["changed"] = any(r.get("changed") for r in check_results.values()) + if any(r.get("failed") for r in check_results.values()): + result["failed"] = True + result["msg"] = "One or more checks failed" - result["changed"] = any(r.get("changed", False) for r in check_results.values()) return result - def load_known_checks(self): + def load_known_checks(self, tmp, task_vars): load_checks() known_checks = {} @@ -92,7 +92,7 @@ class ActionModule(ActionBase): check_name, cls.__module__, cls.__name__, other_cls.__module__, other_cls.__name__)) - known_checks[check_name] = cls(execute_module=self._execute_module) + known_checks[check_name] = cls(execute_module=self._execute_module, tmp=tmp, task_vars=task_vars) return known_checks @@ -135,3 +135,14 @@ def resolve_checks(names, all_checks): resolved.update(tag_to_checks[tag]) return resolved + + +def normalize(checks): + """Return a clean list of check names. + + The input may be a comma-separated string or a sequence. Leading and + trailing whitespace characters are removed. Empty items are discarded. + """ + if isinstance(checks, string_types): + checks = checks.split(',') + return [name.strip() for name in checks if name.strip()] diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py index 64c29a8d9..d10200719 100644 --- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py +++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py @@ -1,6 +1,6 @@ -''' -Ansible callback plugin. -''' +""" +Ansible callback plugin to give a nicely formatted summary of failures. +""" # Reason: In several locations below we disable pylint protected-access # for Ansible objects that do not give us any public way @@ -16,11 +16,11 @@ from ansible.utils.color import stringc class CallbackModule(CallbackBase): - ''' + """ This callback plugin stores task results and summarizes failures. The file name is prefixed with `zz_` to make this plugin be loaded last by Ansible, thus making its output the last thing that users see. - ''' + """ CALLBACK_VERSION = 2.0 CALLBACK_TYPE = 'aggregate' @@ -39,7 +39,8 @@ class CallbackModule(CallbackBase): def v2_runner_on_failed(self, result, ignore_errors=False): super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors) - self.__failures.append(dict(result=result, ignore_errors=ignore_errors)) + if not ignore_errors: + self.__failures.append(dict(result=result, ignore_errors=ignore_errors)) def v2_playbook_on_stats(self, stats): super(CallbackModule, self).v2_playbook_on_stats(stats) @@ -47,7 +48,7 @@ class CallbackModule(CallbackBase): self._print_failure_details(self.__failures) def _print_failure_details(self, failures): - '''Print a summary of failed tasks or checks.''' + """Print a summary of failed tasks or checks.""" self._display.display(u'\nFailure summary:\n') width = len(str(len(failures))) @@ -68,7 +69,9 @@ class CallbackModule(CallbackBase): playbook_context = None # re: result attrs see top comment # pylint: disable=protected-access for failure in failures: - # get context from check task result since callback plugins cannot access task vars + # Get context from check task result since callback plugins cannot access task vars. + # NOTE: thus context is not known unless checks run. Failures prior to checks running + # don't have playbook_context in the results. But we only use it now when checks fail. playbook_context = playbook_context or failure['result']._result.get('playbook_context') failed_checks.update( name @@ -80,8 +83,11 @@ class CallbackModule(CallbackBase): def _print_check_failure_summary(self, failed_checks, context): checks = ','.join(sorted(failed_checks)) - # NOTE: context is not set if all failures occurred prior to checks task - summary = ( + # The purpose of specifying context is to vary the output depending on what the user was + # expecting to happen (based on which playbook they ran). The only use currently is to + # vary the message depending on whether the user was deliberately running checks or was + # trying to install/upgrade and checks are just included. Other use cases may arise. + summary = ( # default to explaining what checks are in the first place '\n' 'The execution of "{playbook}"\n' 'includes checks designed to fail early if the requirements\n' @@ -93,27 +99,26 @@ class CallbackModule(CallbackBase): 'Some checks may be configurable by variables if your requirements\n' 'are different from the defaults; consult check documentation.\n' 'Variables can be set in the inventory or passed on the\n' - 'command line using the -e flag to ansible-playbook.\n' + 'command line using the -e flag to ansible-playbook.\n\n' ).format(playbook=self._playbook_file, checks=checks) if context in ['pre-install', 'health']: - summary = ( + summary = ( # user was expecting to run checks, less explanation needed '\n' 'You may choose to configure or disable failing checks by\n' 'setting Ansible variables. To disable those above:\n\n' ' openshift_disable_check={checks}\n\n' 'Consult check documentation for configurable variables.\n' 'Variables can be set in the inventory or passed on the\n' - 'command line using the -e flag to ansible-playbook.\n' + 'command line using the -e flag to ansible-playbook.\n\n' ).format(checks=checks) - # other expected contexts: install, upgrade self._display.display(summary) # re: result attrs see top comment # pylint: disable=protected-access def _format_failure(failure): - '''Return a list of pretty-formatted text entries describing a failure, including + """Return a list of pretty-formatted text entries describing a failure, including relevant information about it. Expect that the list of text entries will be joined - by a newline separator when output to the user.''' + by a newline separator when output to the user.""" result = failure['result'] host = result._host.get_name() play = _get_play(result._task) @@ -134,7 +139,7 @@ def _format_failure(failure): def _format_failed_checks(checks): - '''Return pretty-formatted text describing checks that failed.''' + """Return pretty-formatted text describing checks that failed.""" failed_check_msgs = [] for check, body in checks.items(): if body.get('failed', False): # only show the failed checks @@ -149,7 +154,7 @@ def _format_failed_checks(checks): # This is inspired by ansible.playbook.base.Base.dump_me. # re: play/task/block attrs see top comment # pylint: disable=protected-access def _get_play(obj): - '''Given a task or block, recursively tries to find its parent play.''' + """Given a task or block, recursively try to find its parent play.""" if hasattr(obj, '_play'): return obj._play if getattr(obj, '_parent'): diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py index 4c205e48c..f9babebb9 100755..100644 --- a/roles/openshift_health_checker/library/aos_version.py +++ b/roles/openshift_health_checker/library/aos_version.py @@ -1,5 +1,5 @@ #!/usr/bin/python -''' +""" Ansible module for yum-based systems determining if multiple releases of an OpenShift package are available, and if the release requested (if any) is available down to the given precision. @@ -16,9 +16,13 @@ of release availability already. Without duplicating all that, we would like the user to have a helpful error message if we detect things will not work out right. Note that if openshift_release is not specified in the inventory, the version comparison checks just pass. -''' +""" from ansible.module_utils.basic import AnsibleModule +# NOTE: because of the dependency on yum (Python 2-only), this module does not +# work under Python 3. But since we run unit tests against both Python 2 and +# Python 3, we use six for cross compatibility in this module alone: +from ansible.module_utils.six import string_types IMPORT_EXCEPTION = None try: @@ -28,7 +32,7 @@ except ImportError as err: class AosVersionException(Exception): - '''Base exception class for package version problems''' + """Base exception class for package version problems""" def __init__(self, message, problem_pkgs=None): Exception.__init__(self, message) self.problem_pkgs = problem_pkgs @@ -122,12 +126,15 @@ def _check_precise_version_found(pkgs, expected_pkgs_dict): for pkg in pkgs: if pkg.name not in expected_pkgs_dict: continue - # does the version match, to the precision requested? - # and, is it strictly greater, at the precision requested? - expected_pkg_version = expected_pkgs_dict[pkg.name]["version"] - match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1]) - if match_version == expected_pkg_version: - pkgs_precise_version_found.add(pkg.name) + expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"] + if isinstance(expected_pkg_versions, string_types): + expected_pkg_versions = [expected_pkg_versions] + for expected_pkg_version in expected_pkg_versions: + # does the version match, to the precision requested? + # and, is it strictly greater, at the precision requested? + match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1]) + if match_version == expected_pkg_version: + pkgs_precise_version_found.add(pkg.name) not_found = [] for name, pkg in expected_pkgs_dict.items(): @@ -157,8 +164,13 @@ def _check_higher_version_found(pkgs, expected_pkgs_dict): for pkg in pkgs: if pkg.name not in expected_pkg_names: continue - expected_pkg_version = expected_pkgs_dict[pkg.name]["version"] - req_release_arr = [int(segment) for segment in expected_pkg_version.split(".")] + expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"] + if isinstance(expected_pkg_versions, string_types): + expected_pkg_versions = [expected_pkg_versions] + # NOTE: the list of versions is assumed to be sorted so that the highest + # desirable version is the last. + highest_desirable_version = expected_pkg_versions[-1] + req_release_arr = [int(segment) for segment in highest_desirable_version.split(".")] version = [int(segment) for segment in pkg.version.split(".")] too_high = version[:len(req_release_arr)] > req_release_arr higher_than_seen = version > higher_version_for_pkg.get(pkg.name, []) diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py index 433795b67..433795b67 100755..100644 --- a/roles/openshift_health_checker/library/check_yum_update.py +++ b/roles/openshift_health_checker/library/check_yum_update.py diff --git a/roles/openshift_health_checker/library/docker_info.py b/roles/openshift_health_checker/library/docker_info.py index 7f712bcff..0d0ddae8b 100644 --- a/roles/openshift_health_checker/library/docker_info.py +++ b/roles/openshift_health_checker/library/docker_info.py @@ -1,4 +1,3 @@ -# pylint: disable=missing-docstring """ Ansible module for determining information about the docker host. @@ -13,6 +12,7 @@ from ansible.module_utils.docker_common import AnsibleDockerClient def main(): + """Entrypoint for running an Ansible module.""" client = AnsibleDockerClient() client.module.exit_json( diff --git a/roles/openshift_health_checker/library/rpm_version.py b/roles/openshift_health_checker/library/rpm_version.py index 8ea223055..c24fbba3b 100644 --- a/roles/openshift_health_checker/library/rpm_version.py +++ b/roles/openshift_health_checker/library/rpm_version.py @@ -4,6 +4,7 @@ Ansible module for rpm-based systems determining existing package version inform """ from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.six import string_types IMPORT_EXCEPTION = None try: @@ -82,11 +83,16 @@ def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict): continue found_versions = [_parse_version(version) for version in found_pkgs_dict[pkg_name]] - expected_version = _parse_version(pkg["version"]) - if expected_version not in found_versions: + + if isinstance(pkg["version"], string_types): + expected_versions = [_parse_version(pkg["version"])] + else: + expected_versions = [_parse_version(version) for version in pkg["version"]] + + if not set(expected_versions) & set(found_versions): invalid_pkg_versions[pkg_name] = { "found_versions": found_versions, - "required_version": expected_version, + "required_versions": expected_versions, } if not_found_pkgs: @@ -106,7 +112,7 @@ def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict): "The following packages were found to be installed with an incorrect version: {}".format('\n'.join([ " \n{}\n Required version: {}\n Found versions: {}".format( pkg_name, - pkg["required_version"], + ', '.join(pkg["required_versions"]), ', '.join([version for version in pkg["found_versions"]])) for pkg_name, pkg in invalid_pkg_versions.items() ])) diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py new file mode 100644 index 000000000..3631f71c8 --- /dev/null +++ b/roles/openshift_health_checker/library/search_journalctl.py @@ -0,0 +1,150 @@ +#!/usr/bin/python +"""Interface to journalctl.""" + +from time import time +import json +import re +import subprocess + +from ansible.module_utils.basic import AnsibleModule + + +class InvalidMatcherRegexp(Exception): + """Exception class for invalid matcher regexp.""" + pass + + +class InvalidLogEntry(Exception): + """Exception class for invalid / non-json log entries.""" + pass + + +class LogInputSubprocessError(Exception): + """Exception class for errors that occur while executing a subprocess.""" + pass + + +def main(): + """Scan a given list of "log_matchers" for journalctl messages containing given patterns. + "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching: + 'start_regexp', 'regexp', and 'unit'. + + Sample "log_matchers" list: + + [ + { + 'start_regexp': r'Beginning of systemd unit', + 'regexp': r'the specific log message to find', + 'unit': 'etcd', + } + ] + """ + module = AnsibleModule( + argument_spec=dict( + log_count_limit=dict(type="int", default=500), + log_matchers=dict(type="list", required=True), + ), + ) + + timestamp_limit_seconds = time() - 60 * 60 # 1 hour + + log_count_limit = module.params["log_count_limit"] + log_matchers = module.params["log_matchers"] + + matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds) + + module.exit_json( + changed=False, + failed=bool(errors), + errors=errors, + matched=matched_regexp, + ) + + +def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds): + """Return a list of up to log_count_limit matches for each matcher. + + Log entries are only considered if newer than timestamp_limit_seconds. + """ + matched_regexp = [] + errors = [] + + for matcher in matchers: + try: + log_output = get_log_output(matcher) + except LogInputSubprocessError as err: + errors.append(str(err)) + continue + + try: + matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds) + if matched: + matched_regexp.append(matcher.get("regexp", "")) + except InvalidMatcherRegexp as err: + errors.append(str(err)) + except InvalidLogEntry as err: + errors.append(str(err)) + + return matched_regexp, errors + + +def get_log_output(matcher): + """Return an iterator on the logs of a given matcher.""" + try: + cmd_output = subprocess.Popen(list([ + '/bin/journalctl', + '-ru', matcher.get("unit", ""), + '--output', 'json', + ]), stdout=subprocess.PIPE) + + return iter(cmd_output.stdout.readline, '') + + except subprocess.CalledProcessError as exc: + msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}" + raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc))) + except OSError as exc: + raise LogInputSubprocessError(str(exc)) + + +def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds): + """Return log messages matched in iterable log_output by a given matcher. + + Ignore any log_output items older than timestamp_limit_seconds. + """ + try: + regexp = re.compile(matcher.get("regexp", "")) + start_regexp = re.compile(matcher.get("start_regexp", "")) + except re.error as err: + msg = "A log matcher object was provided with an invalid regular expression: {}" + raise InvalidMatcherRegexp(msg.format(str(err))) + + matched = None + + for log_count, line in enumerate(log_output): + if log_count >= log_count_limit: + break + + try: + obj = json.loads(line) + + # don't need to look past the most recent service restart + if start_regexp.match(obj["MESSAGE"]): + break + + log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000 + if log_timestamp_seconds < timestamp_limit_seconds: + break + + if regexp.match(obj["MESSAGE"]): + matched = line + break + + except ValueError: + msg = "Log entry for systemd unit {} contained invalid json syntax: {}" + raise InvalidLogEntry(msg.format(matcher.get("unit"), line)) + + return matched + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml index 4d141974c..bc8e7bdcf 100644 --- a/roles/openshift_health_checker/meta/main.yml +++ b/roles/openshift_health_checker/meta/main.yml @@ -1,5 +1,3 @@ --- dependencies: - - role: openshift_facts - - role: openshift_repos - - role: openshift_version +- role: openshift_facts diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index 5c9949ced..07ec6f7ef 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -10,24 +10,56 @@ from importlib import import_module from ansible.module_utils import six from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin +from ansible.plugins.filter.core import to_bool as ansible_to_bool class OpenShiftCheckException(Exception): - """Raised when a check cannot proceed.""" - pass + """Raised when a check encounters a failure condition.""" + + def __init__(self, name, msg=None): + # msg is for the message the user will see when this is raised. + # name is for test code to identify the error without looking at msg text. + if msg is None: # for parameter backward compatibility + msg = name + name = self.__class__.__name__ + self.name = name + super(OpenShiftCheckException, self).__init__(msg) + + +class OpenShiftCheckExceptionList(OpenShiftCheckException): + """A container for multiple logging errors that may be detected in one check.""" + def __init__(self, errors): + self.errors = errors + super(OpenShiftCheckExceptionList, self).__init__( + 'OpenShiftCheckExceptionList', + '\n'.join(str(msg) for msg in errors) + ) + + # make iterable + def __getitem__(self, index): + return self.errors[index] @six.add_metaclass(ABCMeta) class OpenShiftCheck(object): - """A base class for defining checks for an OpenShift cluster environment.""" + """ + A base class for defining checks for an OpenShift cluster environment. + + Expect optional params: method execute_module, dict task_vars, and string tmp. + execute_module is expected to have a signature compatible with _execute_module + from ansible plugins/action/__init__.py, e.g.: + def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args): + This is stored so that it can be invoked in subclasses via check.execute_module("name", args) + which provides the check's stored task_vars and tmp. + """ + + def __init__(self, execute_module=None, task_vars=None, tmp=None): + self._execute_module = execute_module + self.task_vars = task_vars or {} + self.tmp = tmp - def __init__(self, execute_module=None, module_executor=None): - if execute_module is module_executor is None: - raise TypeError( - "__init__() takes either execute_module (recommended) " - "or module_executor (deprecated), none given") - self.execute_module = execute_module or module_executor - self.module_executor = self.execute_module + # set to True when the check changes the host, for accurate total "changed" count + self.changed = False @abstractproperty def name(self): @@ -43,13 +75,13 @@ class OpenShiftCheck(object): """ return [] - @classmethod - def is_active(cls, task_vars): # pylint: disable=unused-argument + @staticmethod + def is_active(): """Returns true if this check applies to the ansible-playbook run.""" return True @abstractmethod - def run(self, tmp, task_vars): + def run(self): """Executes a check, normally implemented as a module.""" return {} @@ -62,6 +94,134 @@ class OpenShiftCheck(object): for subclass in subclass.subclasses(): yield subclass + def execute_module(self, module_name=None, module_args=None): + """Invoke an Ansible module from a check. + + Invoke stored _execute_module, normally copied from the action + plugin, with its params and the task_vars and tmp given at + check initialization. No positional parameters beyond these + are specified. If it's necessary to specify any of the other + parameters to _execute_module then that should just be invoked + directly (with awareness of changes in method signature per + Ansible version). + + So e.g. check.execute_module("foo", dict(arg1=...)) + Return: result hash from module execution. + """ + if self._execute_module is None: + raise NotImplementedError( + self.__class__.__name__ + + " invoked execute_module without providing the method at initialization." + ) + return self._execute_module(module_name, module_args, self.tmp, self.task_vars) + + def get_var(self, *keys, **kwargs): + """Get deeply nested values from task_vars. + + Ansible task_vars structures are Python dicts, often mapping strings to + other dicts. This helper makes it easier to get a nested value, raising + OpenShiftCheckException when a key is not found. + + Keyword args: + default: + On missing key, return this as default value instead of raising exception. + convert: + Supply a function to apply to normalize the value before returning it. + None is the default (return as-is). + This function should raise ValueError if the user has provided a value + that cannot be converted, or OpenShiftCheckException if some other + problem needs to be described to the user. + """ + if len(keys) == 1: + keys = keys[0].split(".") + + try: + value = reduce(operator.getitem, keys, self.task_vars) + except (KeyError, TypeError): + if "default" not in kwargs: + raise OpenShiftCheckException( + "This check expects the '{}' inventory variable to be defined\n" + "in order to proceed, but it is undefined. There may be a bug\n" + "in Ansible, the checks, or their dependencies." + "".format(".".join(map(str, keys))) + ) + value = kwargs["default"] + + convert = kwargs.get("convert", None) + try: + if convert is None: + return value + elif convert is bool: # interpret bool as Ansible does, instead of python truthiness + return ansible_to_bool(value) + else: + return convert(value) + + except ValueError as error: # user error in specifying value + raise OpenShiftCheckException( + 'Cannot convert inventory variable to expected type:\n' + ' "{var}={value}"\n' + '{error}'.format(var=".".join(keys), value=value, error=error) + ) + + except OpenShiftCheckException: # some other check-specific problem + raise + + except Exception as error: # probably a bug in the function + raise OpenShiftCheckException( + 'There is a bug in this check. While trying to convert variable \n' + ' "{var}={value}"\n' + 'the given converter cannot be used or failed unexpectedly:\n' + '{error}'.format(var=".".join(keys), value=value, error=error) + ) + + @staticmethod + def get_major_minor_version(openshift_image_tag): + """Parse and return the deployed version of OpenShift as a tuple.""" + if openshift_image_tag and openshift_image_tag[0] == 'v': + openshift_image_tag = openshift_image_tag[1:] + + # map major release versions across releases + # to a common major version + openshift_major_release_version = { + "1": "3", + } + + components = openshift_image_tag.split(".") + if not components or len(components) < 2: + msg = "An invalid version of OpenShift was found for this host: {}" + raise OpenShiftCheckException(msg.format(openshift_image_tag)) + + if components[0] in openshift_major_release_version: + components[0] = openshift_major_release_version[components[0]] + + components = tuple(int(x) for x in components[:2]) + return components + + def find_ansible_mount(self, path): + """Return the mount point for path from ansible_mounts.""" + + # reorganize list of mounts into dict by path + mount_for_path = { + mount['mount']: mount + for mount + in self.get_var('ansible_mounts') + } + + # NOTE: including base cases '/' and '' to ensure the loop ends + mount_targets = set(mount_for_path.keys()) | {'/', ''} + mount_point = path + while mount_point not in mount_targets: + mount_point = os.path.dirname(mount_point) + + try: + return mount_for_path[mount_point] + except KeyError: + known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) + raise OpenShiftCheckException( + 'Unable to determine mount point for path "{}".\n' + 'Known mount points: {}.'.format(path, known_mounts or 'none') + ) + LOADER_EXCLUDES = ( "__init__.py", @@ -86,20 +246,3 @@ def load_checks(path=None, subpkg=""): modules.append(import_module(__package__ + subpkg + "." + name[:-3])) return modules - - -def get_var(task_vars, *keys, **kwargs): - """Helper function to get deeply nested values from task_vars. - - Ansible task_vars structures are Python dicts, often mapping strings to - other dicts. This helper makes it easier to get a nested value, raising - OpenShiftCheckException when a key is not found or returning a default value - provided as a keyword argument. - """ - try: - value = reduce(operator.getitem, keys, task_vars) - except (KeyError, TypeError): - if "default" in kwargs: - return kwargs["default"] - raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys)))) - return value diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py index 962148cb8..6d1dea9ce 100644 --- a/roles/openshift_health_checker/openshift_checks/disk_availability.py +++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py @@ -1,9 +1,11 @@ -# pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var -from openshift_checks.mixins import NotContainerizedMixin +"""Check that there is enough disk space in predefined paths.""" +import tempfile -class DiskAvailability(NotContainerizedMixin, OpenShiftCheck): +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +class DiskAvailability(OpenShiftCheck): """Check that recommended disk space is available before a first-time install.""" name = "disk_availability" @@ -12,56 +14,126 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck): # Values taken from the official installation documentation: # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements recommended_disk_space_bytes = { - "masters": 40 * 10**9, - "nodes": 15 * 10**9, - "etcd": 20 * 10**9, + '/var': { + 'masters': 40 * 10**9, + 'nodes': 15 * 10**9, + 'etcd': 20 * 10**9, + }, + # Used to copy client binaries into, + # see roles/openshift_cli/library/openshift_container_binary_sync.py. + '/usr/local/bin': { + 'masters': 1 * 10**9, + 'nodes': 1 * 10**9, + 'etcd': 1 * 10**9, + }, + # Used as temporary storage in several cases. + tempfile.gettempdir(): { + 'masters': 1 * 10**9, + 'nodes': 1 * 10**9, + 'etcd': 1 * 10**9, + }, + } + + # recommended disk space for each location under an upgrade context + recommended_disk_upgrade_bytes = { + '/var': { + 'masters': 10 * 10**9, + 'nodes': 5 * 10 ** 9, + 'etcd': 5 * 10 ** 9, + }, } - @classmethod - def is_active(cls, task_vars): + def is_active(self): """Skip hosts that do not have recommended disk space requirements.""" - group_names = get_var(task_vars, "group_names", default=[]) - has_disk_space_recommendation = bool(set(group_names).intersection(cls.recommended_disk_space_bytes)) - return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation - - def run(self, tmp, task_vars): - group_names = get_var(task_vars, "group_names") - ansible_mounts = get_var(task_vars, "ansible_mounts") - free_bytes = self.openshift_available_disk(ansible_mounts) - - recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names) - configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9 - min_free_bytes = configured_min or recommended_min - - if free_bytes < min_free_bytes: - return { - 'failed': True, - 'msg': ( - 'Available disk space ({:.1f} GB) for the volume containing ' - '"/var" is below minimum recommended space ({:.1f} GB)' - ).format(float(free_bytes) / 10**9, float(min_free_bytes) / 10**9) + group_names = self.get_var("group_names", default=[]) + active_groups = set() + for recommendation in self.recommended_disk_space_bytes.values(): + active_groups.update(recommendation.keys()) + has_disk_space_recommendation = bool(active_groups.intersection(group_names)) + return super(DiskAvailability, self).is_active() and has_disk_space_recommendation + + def run(self): + group_names = self.get_var("group_names") + user_config = self.get_var("openshift_check_min_host_disk_gb", default={}) + try: + # For backwards-compatibility, if openshift_check_min_host_disk_gb + # is a number, then it overrides the required config for '/var'. + number = float(user_config) + user_config = { + '/var': { + 'masters': number, + 'nodes': number, + 'etcd': number, + }, } + except TypeError: + # If it is not a number, then it should be a nested dict. + pass - return {} + # TODO: as suggested in + # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021, + # maybe we could support checking disk availability in paths that are + # not part of the official recommendation but present in the user + # configuration. + for path, recommendation in self.recommended_disk_space_bytes.items(): + free_bytes = self.free_bytes(path) + recommended_bytes = max(recommendation.get(name, 0) for name in group_names) - @staticmethod - def openshift_available_disk(ansible_mounts): - """Determine the available disk space for an OpenShift installation. + config = user_config.get(path, {}) + # NOTE: the user config is in GB, but we compare bytes, thus the + # conversion. + config_bytes = max(config.get(name, 0) for name in group_names) * 10**9 + recommended_bytes = config_bytes or recommended_bytes - ansible_mounts should be a list of dicts like the 'setup' Ansible module - returns. - """ - # priority list in descending order - supported_mnt_paths = ["/var", "/"] - available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts} + # if an "upgrade" context is set, update the minimum disk requirement + # as this signifies an in-place upgrade - the node might have the + # required total disk space, but some of that space may already be + # in use by the existing OpenShift deployment. + context = self.get_var("r_openshift_health_checker_playbook_context", default="") + if context == "upgrade": + recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {}) + if recommended_upgrade_paths: + recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0) + for name in group_names) + if free_bytes < recommended_bytes: + free_gb = float(free_bytes) / 10**9 + recommended_gb = float(recommended_bytes) / 10**9 + msg = ( + 'Available disk space in "{}" ({:.1f} GB) ' + 'is below minimum recommended ({:.1f} GB)' + ).format(path, free_gb, recommended_gb) + + # warn if check failed under an "upgrade" context + # due to limits imposed by the user config + if config_bytes and context == "upgrade": + msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n' + 'due to an existing OpenShift deployment. Please check the value of\n' + ' openshift_check_min_host_disk_gb={}\n' + 'in your Ansible inventory, and lower the recommended disk space availability\n' + 'if necessary for this upgrade.').format(config_bytes) + + return { + 'failed': True, + 'msg': ( + 'Available disk space in "{}" ({:.1f} GB) ' + 'is below minimum recommended ({:.1f} GB)' + ).format(path, free_gb, recommended_gb) + } + + return {} + + def free_bytes(self, path): + """Return the size available in path based on ansible_mounts.""" + mount = self.find_ansible_mount(path) try: - for path in supported_mnt_paths: - if path in available_mnts: - return available_mnts[path]["size_available"] + return mount['size_available'] except KeyError: - pass - - paths = ''.join(sorted(available_mnts)) or 'none' - msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths) - raise OpenShiftCheckException(msg) + raise OpenShiftCheckException( + 'Unable to retrieve disk availability for "{path}".\n' + 'Ansible facts included a matching mount point for this path:\n' + ' {mount}\n' + 'however it is missing the size_available field.\n' + 'To investigate, you can inspect the output of `ansible -m setup <host>`' + ''.format(path=path, mount=mount) + ) diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 4588ed634..85a922f86 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -1,68 +1,65 @@ -# pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, get_var +"""Check that required Docker images are available.""" +from openshift_checks import OpenShiftCheck +from openshift_checks.mixins import DockerHostMixin -class DockerImageAvailability(OpenShiftCheck): + +NODE_IMAGE_SUFFIXES = ["haproxy-router", "docker-registry", "deployer", "pod"] +DEPLOYMENT_IMAGE_INFO = { + "origin": { + "namespace": "openshift", + "name": "origin", + "registry_console_image": "cockpit/kubernetes", + }, + "openshift-enterprise": { + "namespace": "openshift3", + "name": "ose", + "registry_console_image": "registry.access.redhat.com/openshift3/registry-console", + }, +} + + +class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): """Check that required Docker images are available. - This check attempts to ensure that required docker images are - either present locally, or able to be pulled down from available - registries defined in a host machine. + Determine docker images that an install would require and check that they + are either present in the host's docker index, or available for the host to pull + with known registries as defined in our inventory file (or defaults). """ name = "docker_image_availability" tags = ["preflight"] + # we use python-docker-py to check local docker for images, and skopeo + # to look for images available remotely without waiting to pull them. + dependencies = ["python-docker-py", "skopeo"] - dependencies = ["skopeo", "python-docker-py"] - - deployment_image_info = { - "origin": { - "namespace": "openshift", - "name": "origin", - }, - "openshift-enterprise": { - "namespace": "openshift3", - "name": "ose", - }, - } - - @classmethod - def is_active(cls, task_vars): + def is_active(self): """Skip hosts with unsupported deployment types.""" - deployment_type = get_var(task_vars, "openshift_deployment_type") - has_valid_deployment_type = deployment_type in cls.deployment_image_info + deployment_type = self.get_var("openshift_deployment_type") + has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO - return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type + return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type - def run(self, tmp, task_vars): - msg, failed, changed = self.ensure_dependencies(task_vars) - - # exit early if Skopeo update fails + def run(self): + msg, failed = self.ensure_dependencies() if failed: - if "No package matching" in msg: - msg = "Ensure that all required dependencies can be installed via `yum`.\n" return { "failed": True, - "changed": changed, - "msg": ( - "Unable to update or install required dependency packages on this host;\n" - "These are required in order to check Docker image availability:" - "\n {deps}\n{msg}" - ).format(deps=',\n '.join(self.dependencies), msg=msg), + "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg } - required_images = self.required_images(task_vars) - missing_images = set(required_images) - set(self.local_images(required_images, task_vars)) + required_images = self.required_images() + missing_images = set(required_images) - set(self.local_images(required_images)) # exit early if all images were found locally if not missing_images: - return {"changed": changed} + return {} - registries = self.known_docker_registries(task_vars) + registries = self.known_docker_registries() if not registries: - return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed} + return {"failed": True, "msg": "Unable to retrieve any docker registries."} - available_images = self.available_images(missing_images, registries, task_vars) + available_images = self.available_images(missing_images, registries) unavailable_images = set(missing_images) - set(available_images) if unavailable_images: @@ -72,77 +69,81 @@ class DockerImageAvailability(OpenShiftCheck): "One or more required Docker images are not available:\n {}\n" "Configured registries: {}" ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)), - "changed": changed, } - return {"changed": changed} - - def required_images(self, task_vars): - deployment_type = get_var(task_vars, "openshift_deployment_type") - image_info = self.deployment_image_info[deployment_type] - - openshift_release = get_var(task_vars, "openshift_release", default="latest") - openshift_image_tag = get_var(task_vars, "openshift_image_tag") - is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") - - images = set(self.required_docker_images( - image_info["namespace"], - image_info["name"], - ["registry-console"] if "enterprise" in deployment_type else [], # include enterprise-only image names - openshift_release, - is_containerized, - )) - - # append images with qualified image tags to our list of required images. - # these are images with a (v0.0.0.0) tag, rather than a standard release - # format tag (v0.0). We want to check this set in both containerized and - # non-containerized installations. - images.update( - self.required_qualified_docker_images( - image_info["namespace"], - image_info["name"], - openshift_image_tag, - ), - ) - - return images - - @staticmethod - def required_docker_images(namespace, name, additional_image_names, version, is_containerized): - if is_containerized: - return ["{}/{}:{}".format(namespace, name, version)] if name else [] - - # include additional non-containerized images specific to the current deployment type - return ["{}/{}:{}".format(namespace, img_name, version) for img_name in additional_image_names] - - @staticmethod - def required_qualified_docker_images(namespace, name, version): - # pylint: disable=invalid-name - return [ - "{}/{}-{}:{}".format(namespace, name, suffix, version) - for suffix in ["haproxy-router", "docker-registry", "deployer", "pod"] - ] - - def local_images(self, images, task_vars): + return {} + + def required_images(self): + """ + Determine which images we expect to need for this host. + Returns: a set of required images like 'openshift/origin:v3.6' + + The thorny issue of determining the image names from the variables is under consideration + via https://github.com/openshift/openshift-ansible/issues/4415 + + For now we operate as follows: + * For containerized components (master, node, ...) we look at the deployment type and + use openshift/origin or openshift3/ose as the base for those component images. The + version is openshift_image_tag as determined by the openshift_version role. + * For OpenShift-managed infrastructure (router, registry...) we use oreg_url if + it is defined; otherwise we again use the base that depends on the deployment type. + Registry is not included in constructed images. It may be in oreg_url or etcd image. + """ + required = set() + deployment_type = self.get_var("openshift_deployment_type") + host_groups = self.get_var("group_names") + # containerized etcd may not have openshift_image_tag, see bz 1466622 + image_tag = self.get_var("openshift_image_tag", default="latest") + image_info = DEPLOYMENT_IMAGE_INFO[deployment_type] + if not image_info: + return required + + # template for images that run on top of OpenShift + image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}") + image_url = self.get_var("oreg_url", default="") or image_url + if 'nodes' in host_groups: + for suffix in NODE_IMAGE_SUFFIXES: + required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag)) + # The registry-console is for some reason not prefixed with ose- like the other components. + # Nor is it versioned the same, so just look for latest. + # Also a completely different name is used for Origin. + required.add(image_info["registry_console_image"]) + + # images for containerized components + if self.get_var("openshift", "common", "is_containerized"): + components = set() + if 'nodes' in host_groups: + components.update(["node", "openvswitch"]) + if 'masters' in host_groups: # name is "origin" or "ose" + components.add(image_info["name"]) + for component in components: + required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag)) + if 'etcd' in host_groups: # special case, note it is the same for origin/enterprise + required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag + + return required + + def local_images(self, images): """Filter a list of images and return those available locally.""" return [ image for image in images - if self.is_image_local(image, task_vars) + if self.is_image_local(image) ] - def is_image_local(self, image, task_vars): - result = self.module_executor("docker_image_facts", {"name": image}, task_vars) + def is_image_local(self, image): + """Check if image is already in local docker index.""" + result = self.execute_module("docker_image_facts", {"name": image}) if result.get("failed", False): return False return bool(result.get("images", [])) - @staticmethod - def known_docker_registries(task_vars): - docker_facts = get_var(task_vars, "openshift", "docker") + def known_docker_registries(self): + """Build a list of docker registries available according to inventory vars.""" + docker_facts = self.get_var("openshift", "docker") regs = set(docker_facts["additional_registries"]) - deployment_type = get_var(task_vars, "openshift_deployment_type") + deployment_type = self.get_var("openshift_deployment_type") if deployment_type == "origin": regs.update(["docker.io"]) elif "enterprise" in deployment_type: @@ -150,30 +151,26 @@ class DockerImageAvailability(OpenShiftCheck): return list(regs) - def available_images(self, images, registries, task_vars): - """Inspect existing images using Skopeo and return all images successfully inspected.""" + def available_images(self, images, default_registries): + """Search remotely for images. Returns: list of images found.""" return [ image for image in images - if any(self.is_available_skopeo_image(image, registry, task_vars) for registry in registries) + if self.is_available_skopeo_image(image, default_registries) ] - def is_available_skopeo_image(self, image, registry, task_vars): - """Uses Skopeo to determine if required image exists in a given registry.""" - - cmd_str = "skopeo inspect docker://{registry}/{image}".format( - registry=registry, - image=image, - ) + def is_available_skopeo_image(self, image, default_registries): + """Use Skopeo to determine if required image exists in known registry(s).""" + registries = default_registries - args = {"_raw_params": cmd_str} - result = self.module_executor("command", args, task_vars) - return not result.get("failed", False) and result.get("rc", 0) == 0 + # if image already includes a registry, only use that + if image.count("/") > 1: + registry, image = image.split("/", 1) + registries = [registry] - # ensures that the skopeo and python-docker-py packages exist - # check is skipped on atomic installations - def ensure_dependencies(self, task_vars): - if get_var(task_vars, "openshift", "common", "is_atomic"): - return "", False, False + for registry in registries: + args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)} + result = self.execute_module("command", args) + if result.get("rc", 0) == 0 and not result.get("failed"): + return True - result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars) - return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed") + return False diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py new file mode 100644 index 000000000..0558ddf14 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -0,0 +1,276 @@ +"""Check Docker storage driver and usage.""" +import json +import re +from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks.mixins import DockerHostMixin + + +class DockerStorage(DockerHostMixin, OpenShiftCheck): + """Check Docker storage driver compatibility. + + This check ensures that Docker is using a supported storage driver, + and that loopback is not being used (if using devicemapper). + Also that storage usage is not above threshold. + """ + + name = "docker_storage" + tags = ["pre-install", "health", "preflight"] + + dependencies = ["python-docker-py"] + storage_drivers = ["devicemapper", "overlay", "overlay2"] + max_thinpool_data_usage_percent = 90.0 + max_thinpool_meta_usage_percent = 90.0 + max_overlay_usage_percent = 90.0 + + # TODO(lmeyer): mention these in the output when check fails + configuration_variables = [ + ( + "max_thinpool_data_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for data. " + "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent), + ), + ( + "max_thinpool_meta_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for metadata. " + "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent), + ), + ( + "max_overlay_usage_percent", + "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. " + "Format: float. Default: {:.1f}".format(max_overlay_usage_percent), + ), + ] + + def run(self): + msg, failed = self.ensure_dependencies() + if failed: + return { + "failed": True, + "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg + } + + # attempt to get the docker info hash from the API + docker_info = self.execute_module("docker_info", {}) + if docker_info.get("failed"): + return {"failed": True, + "msg": "Failed to query Docker API. Is docker running on this host?"} + if not docker_info.get("info"): # this would be very strange + return {"failed": True, + "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))} + docker_info = docker_info["info"] + + # check if the storage driver we saw is valid + driver = docker_info.get("Driver", "[NONE]") + if driver not in self.storage_drivers: + msg = ( + "Detected unsupported Docker storage driver '{driver}'.\n" + "Supported storage drivers are: {drivers}" + ).format(driver=driver, drivers=', '.join(self.storage_drivers)) + return {"failed": True, "msg": msg} + + # driver status info is a list of tuples; convert to dict and validate based on driver + driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} + + result = {} + + if driver == "devicemapper": + result = self.check_devicemapper_support(driver_status) + + if driver in ['overlay', 'overlay2']: + result = self.check_overlay_support(docker_info, driver_status) + + return result + + def check_devicemapper_support(self, driver_status): + """Check if dm storage driver is supported as configured. Return: result dict.""" + if driver_status.get("Data loop file"): + msg = ( + "Use of loopback devices with the Docker devicemapper storage driver\n" + "(the default storage configuration) is unsupported in production.\n" + "Please use docker-storage-setup to configure a backing storage volume.\n" + "See http://red.ht/2rNperO for further information." + ) + return {"failed": True, "msg": msg} + result = self.check_dm_usage(driver_status) + return result + + def check_dm_usage(self, driver_status): + """Check usage thresholds for Docker dm storage driver. Return: result dict. + Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool + implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures + devicemapper storage. The LV is "thin" because it does not use all available storage + from its VG, instead expanding as needed; so to determine available space, we gather + current usage as the Docker API reports for the driver as well as space available for + expansion in the pool's VG. + Usage within the LV is divided into pools allocated to data and metadata, either of which + could run out of space first; so we check both. + """ + vals = dict( + vg_free=self.get_vg_free(driver_status.get("Pool Name")), + data_used=driver_status.get("Data Space Used"), + data_total=driver_status.get("Data Space Total"), + metadata_used=driver_status.get("Metadata Space Used"), + metadata_total=driver_status.get("Metadata Space Total"), + ) + + # convert all human-readable strings to bytes + for key, value in vals.copy().items(): + try: + vals[key + "_bytes"] = self.convert_to_bytes(value) + except ValueError as err: # unlikely to hit this from API info, but just to be safe + return { + "failed": True, + "values": vals, + "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err)) + } + + # determine the threshold percentages which usage should not exceed + for name, default in [("data", self.max_thinpool_data_usage_percent), + ("metadata", self.max_thinpool_meta_usage_percent)]: + percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default) + try: + vals[name + "_threshold"] = float(percent) + except ValueError: + return { + "failed": True, + "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent) + } + + # test whether the thresholds are exceeded + messages = [] + for name in ["data", "metadata"]: + vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / ( + vals[name + "_total_bytes"] + vals["vg_free_bytes"]) + if vals[name + "_pct_used"] > vals[name + "_threshold"]: + messages.append( + "Docker thinpool {name} usage percentage {pct:.1f} " + "is higher than threshold {thresh:.1f}.".format( + name=name, + pct=vals[name + "_pct_used"], + thresh=vals[name + "_threshold"], + )) + vals["failed"] = True + + vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."]) + return vals + + def get_vg_free(self, pool): + """Determine which VG to examine according to the pool name. Return: size vgs reports. + Pool name is the only indicator currently available from the Docker API driver info. + We assume a name that looks like "vg--name-docker--pool"; + vg and lv names with inner hyphens doubled, joined by a hyphen. + """ + match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen + if not match: # unlikely, but... be clear if we assumed wrong + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{}'.\n" + "However this name does not have the expected format of 'vgname-lvname'\n" + "so the available storage in the VG cannot be determined.".format(pool) + ) + vg_name = match.groups()[0].replace("--", "-") + vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name + # should return free space like " 12.00g" if the VG exists; empty if it does not + + ret = self.execute_module("command", {"_raw_params": vgs_cmd}) + if ret.get("failed") or ret.get("rc", 0) != 0: + raise OpenShiftCheckException( + "Is LVM installed? Failed to run /sbin/vgs " + "to determine docker storage usage:\n" + ret.get("msg", "") + ) + size = ret.get("stdout", "").strip() + if not size: + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{pool}'.\n" + "which we expect to come from local VG '{vg}'.\n" + "However, /sbin/vgs did not find this VG. Is Docker for this host" + "running and using the storage on the host?".format(pool=pool, vg=vg_name) + ) + return size + + @staticmethod + def convert_to_bytes(string): + """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes.""" + units = dict( + b=1, + k=1024, + m=1024**2, + g=1024**3, + t=1024**4, + p=1024**5, + ) + string = string or "" + match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit + if not match: + raise ValueError("Cannot convert to a byte size: " + string) + + number, unit = match.groups() + multiplier = 1 if not unit else units.get(unit.lower()) + if not multiplier: + raise ValueError("Cannot convert to a byte size: " + string) + + return float(number) * multiplier + + def check_overlay_support(self, docker_info, driver_status): + """Check if overlay storage driver is supported for this host. Return: result dict.""" + # check for xfs as backing store + backing_fs = driver_status.get("Backing Filesystem", "[NONE]") + if backing_fs != "xfs": + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n" + "'xfs' as the backing storage, but this host's storage is type '{fs}'." + ).format(fs=backing_fs) + return {"failed": True, "msg": msg} + + # check support for OS and kernel version + o_s = docker_info.get("OperatingSystem", "[NONE]") + if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s: + # keep it simple, only check enterprise kernel versions; assume everyone else is good + kernel = docker_info.get("KernelVersion", "[NONE]") + kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)] + if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3 + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n" + "kernel version 3.10.0-514; but Docker reports kernel version {version}." + ).format(version=kernel) + return {"failed": True, "msg": msg} + # NOTE: we could check for --selinux-enabled here but docker won't even start with + # that option until it's supported in the kernel so we don't need to. + + return self.check_overlay_usage(docker_info) + + def check_overlay_usage(self, docker_info): + """Check disk usage on OverlayFS backing store volume. Return: result dict.""" + path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"] + + threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent) + try: + threshold = float(threshold) + except ValueError: + return { + "failed": True, + "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold), + } + + mount = self.find_ansible_mount(path) + try: + free_bytes = mount['size_available'] + total_bytes = mount['size_total'] + usage = 100.0 * (total_bytes - free_bytes) / total_bytes + except (KeyError, ZeroDivisionError): + return { + "failed": True, + "msg": "The ansible_mount found for path {} is invalid.\n" + "This is likely to be an Ansible bug. The record was:\n" + "{}".format(path, json.dumps(mount, indent=2)), + } + + if usage > threshold: + return { + "failed": True, + "msg": ( + "For Docker OverlayFS mount point {path},\n" + "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}." + ).format(path=mount["mount"], pct=usage, thresh=threshold) + } + + return {} diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py index c04a69765..f4296753a 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py @@ -2,7 +2,7 @@ Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster. """ -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks import OpenShiftCheck class EtcdImageDataSize(OpenShiftCheck): @@ -11,24 +11,25 @@ class EtcdImageDataSize(OpenShiftCheck): name = "etcd_imagedata_size" tags = ["etcd"] - def run(self, tmp, task_vars): - etcd_mountpath = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts")) + def run(self): + etcd_mountpath = self.find_ansible_mount("/var/lib/etcd") etcd_avail_diskspace = etcd_mountpath["size_available"] etcd_total_diskspace = etcd_mountpath["size_total"] - etcd_imagedata_size_limit = get_var(task_vars, - "etcd_max_image_data_size_bytes", - default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace))) + etcd_imagedata_size_limit = self.get_var( + "etcd_max_image_data_size_bytes", + default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)) + ) - etcd_is_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False) - etcd_port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379) - etcd_hosts = get_var(task_vars, "openshift", "master", "etcd_hosts") + etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False) + etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379) + etcd_hosts = self.get_var("openshift", "master", "etcd_hosts") - config_base = get_var(task_vars, "openshift", "common", "config_base") + config_base = self.get_var("openshift", "common", "config_base") - cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt") - key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key") - ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt") + cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt") + key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key") + ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt") for etcd_host in list(etcd_hosts): args = { @@ -46,7 +47,7 @@ class EtcdImageDataSize(OpenShiftCheck): }, } - etcdkeysize = self.module_executor("etcdkeysize", args, task_vars) + etcdkeysize = self.execute_module("etcdkeysize", args) if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"): msg = 'Failed to retrieve stats for etcd host "{host}": {reason}' @@ -55,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck): reason = etcdkeysize["module_stderr"] msg = msg.format(host=etcd_host, reason=reason) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} if etcdkeysize["size_limit_exceeded"]: limit = self._to_gigabytes(etcd_imagedata_size_limit) @@ -64,20 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck): "Use the `oadm prune images` command to cleanup unused Docker images.") return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)} - return {"changed": False} - - @staticmethod - def _get_etcd_mountpath(ansible_mounts): - valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] - - mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts} - for path in valid_etcd_mount_paths: - if path in mount_for_path: - return mount_for_path[path] - - paths = ', '.join(sorted(mount_for_path)) or 'none' - msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths) - raise OpenShiftCheckException(msg) + return {} @staticmethod def _to_gigabytes(byte_size): diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py new file mode 100644 index 000000000..b4c8957e9 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py @@ -0,0 +1,44 @@ +"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic.""" + +from openshift_checks import OpenShiftCheck + + +class EtcdTraffic(OpenShiftCheck): + """Check if host is being affected by an increase in etcd traffic.""" + + name = "etcd_traffic" + tags = ["health", "etcd"] + + def is_active(self): + """Skip hosts that do not have etcd in their group names.""" + group_names = self.get_var("group_names", default=[]) + valid_group_names = "etcd" in group_names + + version = self.get_major_minor_version(self.get_var("openshift_image_tag")) + valid_version = version in ((3, 4), (3, 5)) + + return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version + + def run(self): + is_containerized = self.get_var("openshift", "common", "is_containerized") + unit = "etcd_container" if is_containerized else "etcd" + + log_matchers = [{ + "start_regexp": r"Starting Etcd Server", + "regexp": r"etcd: sync duration of [^,]+, expected less than 1s", + "unit": unit + }] + + match = self.execute_module("search_journalctl", {"log_matchers": log_matchers}) + + if match.get("matched"): + msg = ("Higher than normal etcd traffic detected.\n" + "OpenShift 3.4 introduced an increase in etcd traffic.\n" + "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n" + "Please refer to https://access.redhat.com/solutions/2916381 for more information.") + return {"failed": True, "msg": msg} + + if match.get("failed"): + return {"failed": True, "msg": "\n".join(match.get("errors"))} + + return {} diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index 7452c9cc1..e5d93ff3f 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -1,6 +1,6 @@ """A health check for OpenShift clusters.""" -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks import OpenShiftCheck class EtcdVolume(OpenShiftCheck): @@ -11,24 +11,21 @@ class EtcdVolume(OpenShiftCheck): # Default device usage threshold. Value should be in the range [0, 100]. default_threshold_percent = 90 - # Where to find ectd data, higher priority first. - supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] - - @classmethod - def is_active(cls, task_vars): - etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters", - default=[]) or [] - is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts - return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host - - def run(self, tmp, task_vars): - mount_info = self._etcd_mount_info(task_vars) + # Where to find etcd data + etcd_mount_path = "/var/lib/etcd" + + def is_active(self): + etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or [] + is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts + return super(EtcdVolume, self).is_active() and is_etcd_host + + def run(self): + mount_info = self.find_ansible_mount(self.etcd_mount_path) available = mount_info["size_available"] total = mount_info["size_total"] used = total - available - threshold = get_var( - task_vars, + threshold = self.get_var( "etcd_device_usage_threshold_percent", default=self.default_threshold_percent ) @@ -43,16 +40,4 @@ class EtcdVolume(OpenShiftCheck): ) return {"failed": True, "msg": msg} - return {"changed": False} - - def _etcd_mount_info(self, task_vars): - ansible_mounts = get_var(task_vars, "ansible_mounts") - mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts} - - for path in self.supported_mount_paths: - if path in mounts: - return mounts[path] - - paths = ', '.join(sorted(mounts)) or 'none' - msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths) - raise OpenShiftCheckException(msg) + return {} diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index c9fc59896..b27f97172 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -1,61 +1,43 @@ -""" -Module for performing checks on an Curator logging deployment -""" +"""Check for an aggregated logging Curator deployment""" -from openshift_checks import get_var -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck class Curator(LoggingCheck): - """Module that checks an integrated logging Curator deployment""" + """Check for an aggregated logging Curator deployment""" name = "curator" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - curator_pods, error = super(Curator, self).get_pods_for_component( - self.module_executor, - self.logging_namespace, - "curator", - task_vars - ) - if error: - return {"failed": True, "changed": False, "msg": error} - check_error = self.check_curator(curator_pods) - - if check_error: - msg = ("The following Curator deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} - + curator_pods = self.get_pods_for_component("curator") + self.check_curator(curator_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'} + + return {} def check_curator(self, pods): """Check to see if curator is up and working. Returns: error string""" if not pods: - return ( + raise OpenShiftCheckException( + "MissingComponentPods", "There are no Curator pods for the logging stack,\n" "so nothing will prune Elasticsearch indexes.\n" "Is Curator correctly deployed?" ) - not_running = super(Curator, self).not_running_pods(pods) + not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return ( + raise OpenShiftCheckException( + "CuratorNotRunning", "The Curator pod is not currently in a running state,\n" "so Elasticsearch indexes may increase without bound." ) if len(pods) - len(not_running) > 1: - return ( + raise OpenShiftCheckException( + "TooManyCurators", "There is more than one Curator pod running. This should not normally happen.\n" "Although this doesn't cause any problems, you may want to investigate." ) - - return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index 01cb35b81..7fc843fd7 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -1,190 +1,193 @@ -""" -Module for performing checks on an Elasticsearch logging deployment -""" +"""Check for an aggregated logging Elasticsearch deployment""" import json import re -from openshift_checks import get_var +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck class Elasticsearch(LoggingCheck): - """Module that checks an integrated logging Elasticsearch deployment""" + """Check for an aggregated logging Elasticsearch deployment""" name = "elasticsearch" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - es_pods, error = super(Elasticsearch, self).get_pods_for_component( - self.execute_module, - self.logging_namespace, - "es", - task_vars, - ) - if error: - return {"failed": True, "changed": False, "msg": error} - check_error = self.check_elasticsearch(es_pods, task_vars) - - if check_error: - msg = ("The following Elasticsearch deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} - + es_pods = self.get_pods_for_component("es") + self.check_elasticsearch(es_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'} - def _not_running_elasticsearch_pods(self, es_pods): - """Returns: list of running pods, list of errors about non-running pods""" - not_running = super(Elasticsearch, self).not_running_pods(es_pods) - if not_running: - return not_running, [( - 'The following Elasticsearch pods are not running:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - ))] - return not_running, [] - - def check_elasticsearch(self, es_pods, task_vars): - """Various checks for elasticsearch. Returns: error string""" - not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) - running_pods = [pod for pod in es_pods if pod not in not_running_pods] + return {} + + def check_elasticsearch(self, es_pods): + """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors.""" + running_pods, errors = self.running_elasticsearch_pods(es_pods) pods_by_name = { pod['metadata']['name']: pod for pod in running_pods # Filter out pods that are not members of a DC if pod['metadata'].get('labels', {}).get('deploymentconfig') } if not pods_by_name: - return 'No logging Elasticsearch pods were found. Is logging deployed?' - error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars) - error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars) - error_msgs += self._check_es_cluster_health(pods_by_name, task_vars) - error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars) - return '\n'.join(error_msgs) + # nothing running, cannot run the rest of the check + errors.append(OpenShiftCheckException( + 'NoRunningPods', + 'No logging Elasticsearch pods were found running, so no logs are being aggregated.' + )) + raise OpenShiftCheckExceptionList(errors) + + errors += self.check_elasticsearch_masters(pods_by_name) + errors += self.check_elasticsearch_node_list(pods_by_name) + errors += self.check_es_cluster_health(pods_by_name) + errors += self.check_elasticsearch_diskspace(pods_by_name) + if errors: + raise OpenShiftCheckExceptionList(errors) + + def running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = self.not_running_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running] + if not_running: + return running_pods, [OpenShiftCheckException( + 'PodNotRunning', + 'The following Elasticsearch pods are defined but not running:\n' + '{pods}'.format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + )] + return running_pods, [] @staticmethod def _build_es_curl_cmd(pod_name, url): base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) - def _check_elasticsearch_masters(self, pods_by_name, task_vars): - """Check that Elasticsearch masters are sane. Returns: list of error strings""" + def check_elasticsearch_masters(self, pods_by_name): + """Check that Elasticsearch masters are sane. Returns: list of errors""" es_master_names = set() - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): # Compare what each ES node reports as master and compare for split brain get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") - master_name_str = self._exec_oc(get_master_cmd, [], task_vars) + master_name_str = self.exec_oc(get_master_cmd, []) master_names = (master_name_str or '').split(' ') if len(master_names) > 1: es_master_names.add(master_names[1]) else: - error_msgs.append( - 'No master? Elasticsearch {pod} returned bad string when asked master name:\n' + errors.append(OpenShiftCheckException( + 'NoMasterName', + 'Elasticsearch {pod} gave unexpected response when asked master name:\n' ' {response}'.format(pod=pod_name, response=master_name_str) - ) + )) if not es_master_names: - error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?') - return '\n'.join(error_msgs) + errors.append(OpenShiftCheckException( + 'NoMasterFound', + 'No logging Elasticsearch masters were found.' + )) + return errors if len(es_master_names) > 1: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'SplitBrainMasters', 'Found multiple Elasticsearch masters according to the pods:\n' '{master_list}\n' 'This implies that the masters have "split brain" and are not correctly\n' 'replicating data for the logging cluster. Log loss is likely to occur.' .format(master_list='\n'.join(' ' + master for master in es_master_names)) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_node_list(self, pods_by_name, task_vars): - """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" + def check_elasticsearch_node_list(self, pods_by_name): + """Check that reported ES masters are accounted for by pods. Returns: list of errors""" if not pods_by_name: - return ['No logging Elasticsearch masters were found. Is logging deployed?'] + return [OpenShiftCheckException( + 'MissingComponentPods', + 'No logging Elasticsearch pods were found.' + )] # get ES cluster nodes node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') - cluster_node_data = self._exec_oc(node_cmd, [], task_vars) + cluster_node_data = self.exec_oc(node_cmd, []) try: cluster_nodes = json.loads(cluster_node_data)['nodes'] except (ValueError, KeyError): - return [ + return [OpenShiftCheckException( + 'MissingNodeList', 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + cluster_node_data - ] + )] # Try to match all ES-reported node hosts to known pods. - error_msgs = [] + errors = [] for node in cluster_nodes.values(): # Note that with 1.4/3.4 the pod IP may be used as the master name if not any(node['host'] in (pod_name, pod['status'].get('podIP')) for pod_name, pod in pods_by_name.items()): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsPodNodeMismatch', 'The Elasticsearch cluster reports a member node "{node}"\n' 'that does not correspond to any known ES pod.'.format(node=node['host']) - ) + )) - return error_msgs + return errors - def _check_es_cluster_health(self, pods_by_name, task_vars): + def check_es_cluster_health(self, pods_by_name): """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') - cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars) + cluster_health_data = self.exec_oc(cluster_health_cmd, []) try: health_res = json.loads(cluster_health_data) if not health_res or not health_res.get('status'): raise ValueError() except ValueError: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadEsResponse', 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) - ) + )) continue if health_res['status'] not in ['green', 'yellow']: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsClusterHealthRed', 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_diskspace(self, pods_by_name, task_vars): + def check_elasticsearch_diskspace(self, pods_by_name): """ Exec into an ES pod and query the diskspace on the persistent volume. Returns: list of errors """ - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) - disk_output = self._exec_oc(df_cmd, [], task_vars) + disk_output = self.exec_oc(df_cmd, []) lines = disk_output.splitlines() # expecting one header looking like 'IUse% Use%' and one body line body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadDfResponse', 'Could not retrieve storage usage from logging ES pod "{pod}".\n' 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) - ) + )) continue inode_pct, disk_pct = re.match(body_re, lines[1]).groups() - inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90') + inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90') if int(inode_pct) >= int(inode_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'InodeUsageTooHigh', 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -192,10 +195,11 @@ class Elasticsearch(LoggingCheck): pct=str(inode_pct), limit=str(inode_pct_thresh), param='openshift_check_efk_es_inode_pct', - )) - disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80') + ))) + disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80') if int(disk_pct) >= int(disk_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'DiskUsageTooHigh', 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -203,15 +207,6 @@ class Elasticsearch(LoggingCheck): pct=str(disk_pct), limit=str(disk_pct_thresh), param='openshift_check_efk_es_storage_pct', - )) - - return error_msgs - - def _exec_oc(self, cmd_str, extra_args, task_vars): - return super(Elasticsearch, self).exec_oc( - self.execute_module, - self.logging_namespace, - cmd_str, - extra_args, - task_vars, - ) + ))) + + return errors diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index 627567293..3b192a281 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -1,76 +1,109 @@ -""" -Module for performing checks on an Fluentd logging deployment -""" +"""Check for an aggregated logging Fluentd deployment""" import json -from openshift_checks import get_var + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck class Fluentd(LoggingCheck): - """Module that checks an integrated logging Fluentd deployment""" + """Check for an aggregated logging Fluentd deployment""" + name = "fluentd" tags = ["health", "logging"] - logging_namespace = None + def run(self): + """Check the Fluentd deployment and raise an error if any problems are found.""" + + fluentd_pods = self.get_pods_for_component("fluentd") + self.check_fluentd(fluentd_pods) + return {} - def run(self, tmp, task_vars): - """Check various things and gather errors. Returns: result as hash""" + def check_fluentd(self, pods): + """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found.""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - fluentd_pods, error = super(Fluentd, self).get_pods_for_component( - self.execute_module, - self.logging_namespace, - "fluentd", - task_vars, + node_selector = self.get_var( + 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true' ) - if error: - return {"failed": True, "changed": False, "msg": error} - check_error = self.check_fluentd(fluentd_pods, task_vars) - if check_error: - msg = ("The following Fluentd deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} + nodes_by_name = self.get_nodes_by_name() + fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector) + + errors = [] + errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) + errors += self.check_nodes_have_fluentd(pods, fluentd_nodes) + errors += self.check_fluentd_pods_running(pods) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + errors.append(OpenShiftCheckException( + 'TooManyFluentdPods', + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + )) + + if errors: + raise OpenShiftCheckExceptionList(errors) - # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'} + def get_nodes_by_name(self): + """Retrieve all the node definitions. Returns: dict(name: node)""" + nodes_json = self.exec_oc("get nodes -o json", []) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + raise OpenShiftCheckException( + "BadOcNodeList", + "Could not obtain a list of nodes to validate fluentd.\n" + "Output from oc get:\n" + nodes_json + ) + if not nodes or not nodes.get('items'): # also should not happen + raise OpenShiftCheckException( + "NoNodesDefined", + "No nodes appear to be defined according to the API." + ) + return { + node['metadata']['name']: node + for node in nodes['items'] + } @staticmethod - def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): - """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" + def filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node)""" label, value = node_selector.split('=', 1) fluentd_nodes = { name: node for name, node in nodes_by_name.items() if node['metadata']['labels'].get(label) == value } if not fluentd_nodes: - return None, ( + raise OpenShiftCheckException( + 'NoNodesLabeled', 'There are no nodes with the fluentd label {label}.\n' - 'This means no logs will be aggregated from the nodes.' - ).format(label=node_selector) - return fluentd_nodes, None + 'This means no logs will be aggregated from the nodes.'.format(label=node_selector) + ) + return fluentd_nodes - @staticmethod - def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars): - """Note if nodes are not labeled as expected. Returns: error string""" - intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all']) + def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): + """Note if nodes are not labeled as expected. Returns: error list""" + intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all']) if not intended_nodes or '--all' in intended_nodes: intended_nodes = nodes_by_name.keys() nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) if nodes_missing_labels: - return ( + return [OpenShiftCheckException( + 'NodesUnlabeled', 'The following nodes are supposed to be labeled with {label} but are not:\n' ' {nodes}\n' - 'Fluentd will not aggregate logs from these nodes.' - ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) - return None + 'Fluentd will not aggregate logs from these nodes.'.format( + label=node_selector, nodes=', '.join(nodes_missing_labels) + ))] + + return [] @staticmethod - def _check_nodes_have_fluentd(pods, fluentd_nodes): - """Make sure fluentd is on all the labeled nodes. Returns: error string""" + def check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error list""" unmatched_nodes = fluentd_nodes.copy() node_names_by_label = { node['metadata']['labels']['kubernetes.io/hostname']: name @@ -90,81 +123,32 @@ class Fluentd(LoggingCheck): ]: unmatched_nodes.pop(name, None) if unmatched_nodes: - return ( + return [OpenShiftCheckException( + 'MissingFluentdPod', 'The following nodes are supposed to have a Fluentd pod but do not:\n' - '{nodes}' - 'These nodes will not have their logs aggregated.' - ).format(nodes=''.join( - " {}\n".format(name) - for name in unmatched_nodes.keys() - )) - return None + ' {nodes}\n' + 'These nodes will not have their logs aggregated.'.format( + nodes='\n '.join(unmatched_nodes.keys()) + ))] + + return [] - def _check_fluentd_pods_running(self, pods): + def check_fluentd_pods_running(self, pods): """Make sure all fluentd pods are running. Returns: error string""" not_running = super(Fluentd, self).not_running_pods(pods) if not_running: - return ( + return [OpenShiftCheckException( + 'FluentdNotRunning', 'The following Fluentd pods are supposed to be running but are not:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - )) - return None - - def check_fluentd(self, pods, task_vars): - """Verify fluentd is running everywhere. Returns: error string""" - - node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector', - default='logging-infra-fluentd=true') - - nodes_by_name, error = self.get_nodes_by_name(task_vars) - - if error: - return error - fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) - if error: - return error - - error_msgs = [] - error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars) - if error: - error_msgs.append(error) - error = self._check_nodes_have_fluentd(pods, fluentd_nodes) - if error: - error_msgs.append(error) - error = self._check_fluentd_pods_running(pods) - if error: - error_msgs.append(error) - - # Make sure there are no extra fluentd pods - if len(pods) > len(fluentd_nodes): - error_msgs.append( - 'There are more Fluentd pods running than nodes labeled.\n' - 'This may not cause problems with logging but it likely indicates something wrong.' - ) - - return '\n'.join(error_msgs) - - def get_nodes_by_name(self, task_vars): - """Retrieve all the node definitions. Returns: dict(name: node), error string""" - nodes_json = self._exec_oc("get nodes -o json", [], task_vars) - try: - nodes = json.loads(nodes_json) - except ValueError: # no valid json - should not happen - return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json - if not nodes or not nodes.get('items'): # also should not happen - return None, "No nodes appear to be defined according to the API." - return { - node['metadata']['name']: node - for node in nodes['items'] - }, None - - def _exec_oc(self, cmd_str, extra_args, task_vars): - return super(Fluentd, self).exec_oc(self.execute_module, - self.logging_namespace, - cmd_str, - extra_args, - task_vars) + ' {pods}\n' + 'These pods will not aggregate logs from their nodes.'.format( + pods='\n'.join( + " {name} ({host})".format( + name=pod['metadata']['name'], + host=pod['spec'].get('host', 'None') + ) + for pod in not_running + ) + ))] + + return [] diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py new file mode 100644 index 000000000..d783e6760 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -0,0 +1,131 @@ +""" +Module for performing checks on a Fluentd logging deployment configuration +""" + +from openshift_checks import OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck + + +class FluentdConfig(LoggingCheck): + """Module that checks logging configuration of an integrated logging Fluentd deployment""" + name = "fluentd_config" + tags = ["health"] + + def is_active(self): + logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) + + try: + version = self.get_major_minor_version(self.get_var("openshift_image_tag")) + except ValueError: + # if failed to parse OpenShift version, perform check anyway (if logging enabled) + return logging_deployed + + return logging_deployed and version < (3, 6) + + def run(self): + """Check that Fluentd has running pods, and that its logging config matches Docker's logging config.""" + config_error = self.check_logging_config() + if config_error: + msg = ("The following Fluentd logging configuration problem was found:" + "\n{}".format(config_error)) + return {"failed": True, "msg": msg} + + return {} + + def check_logging_config(self): + """Ensure that the configured Docker logging driver matches fluentd settings. + This means that, at least for now, if the following condition is met: + + openshift_logging_fluentd_use_journal == True + + then the value of the configured Docker logging driver should be "journald". + Otherwise, the value of the Docker logging driver should be "json-file". + Returns an error string if the above condition is not met, or None otherwise.""" + use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True) + + # if check is running on a master, retrieve all running pods + # and check any pod's container for the env var "USE_JOURNAL" + group_names = self.get_var("group_names") + if "masters" in group_names: + use_journald = self.check_fluentd_env_var() + + docker_info = self.execute_module("docker_info", {}) + try: + logging_driver = docker_info["info"]["LoggingDriver"] + except KeyError: + return "Unable to determine Docker logging driver." + + logging_driver = docker_info["info"]["LoggingDriver"] + recommended_logging_driver = "journald" + error = None + + # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver` + # option as an inventory file variable, or adding the log driver value as part of the + # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that + # can be passed to the Docker binary; the only other recommendation that can be made, would be + # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running + # individual containers. + if use_journald and logging_driver != "journald": + error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n' + 'This differs from your Docker configuration, which has been set to use "{driver}" ' + 'as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + elif not use_journald and logging_driver != "json-file": + recommended_logging_driver = "json-file" + error = ('Your Fluentd configuration is set to aggregate Docker container logs from ' + 'individual json log files per container.\n ' + 'This differs from your Docker configuration, which has been set to use ' + '"{driver}" as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + + if error: + error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n' + ' openshift_docker_options="--log-driver={driver}"\n\n' + 'Alternatively, you can add the following option to your Docker configuration, located in' + '"/etc/docker/daemon.json":\n\n' + '{{ "log-driver": "{driver}" }}\n\n' + 'See https://docs.docker.com/engine/admin/logging/json-file ' + 'for more information.').format(driver=recommended_logging_driver) + + return error + + def check_fluentd_env_var(self): + """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod.""" + running_pods = self.running_fluentd_pods() + + try: + pod_containers = running_pods[0]["spec"]["containers"] + except KeyError: + return "Unable to detect running containers on selected Fluentd pod." + + if not pod_containers: + msg = ('There are no running containers on selected Fluentd pod "{}".\n' + 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", "")) + raise OpenShiftCheckException(msg) + + pod_env = pod_containers[0].get("env") + if not pod_env: + msg = ('There are no environment variables set on the Fluentd container "{}".\n' + 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name")) + raise OpenShiftCheckException(msg) + + for env in pod_env: + if env["name"] == "USE_JOURNAL": + return env.get("value", "false") != "false" + + return False + + def running_fluentd_pods(self): + """Return a list of running fluentd pods.""" + fluentd_pods = self.get_pods_for_component("fluentd") + + running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running'] + if not running_fluentd_pods: + raise OpenShiftCheckException( + 'No Fluentd pods were found to be in the "Running" state. ' + 'At least one Fluentd pod is required in order to perform this check.' + ) + + return running_fluentd_pods diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index 442f407b1..3b1cf8baa 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -12,8 +12,7 @@ except ImportError: from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks import get_var -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException class Kibana(LoggingCheck): @@ -22,35 +21,17 @@ class Kibana(LoggingCheck): name = "kibana" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - kibana_pods, error = super(Kibana, self).get_pods_for_component( - self.execute_module, - self.logging_namespace, - "kibana", - task_vars, - ) - if error: - return {"failed": True, "changed": False, "msg": error} - check_error = self.check_kibana(kibana_pods) - - if not check_error: - check_error = self._check_kibana_route(task_vars) - - if check_error: - msg = ("The following Kibana deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "changed": False, "msg": msg} - + kibana_pods = self.get_pods_for_component("kibana") + self.check_kibana(kibana_pods) + self.check_kibana_route() # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'} - def _verify_url_internal(self, url, task_vars): + return {} + + def _verify_url_internal(self, url): """ Try to reach a URL from the host. Returns: success (bool), reason (for failure) @@ -62,7 +43,7 @@ class Kibana(LoggingCheck): # TODO(lmeyer): give users option to validate certs status_code=302, ) - result = self.execute_module('uri', args, task_vars) + result = self.execute_module('uri', args) if result.get('failed'): return result['msg'] return None @@ -71,7 +52,7 @@ class Kibana(LoggingCheck): def _verify_url_external(url): """ Try to reach a URL from ansible control host. - Returns: success (bool), reason (for failure) + Raise an OpenShiftCheckException if anything goes wrong. """ # This actually checks from the ansible control host, which may or may not # really be "external" to the cluster. @@ -97,133 +78,149 @@ class Kibana(LoggingCheck): return None def check_kibana(self, pods): - """Check to see if Kibana is up and working. Returns: error string.""" + """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not.""" if not pods: - return "There are no Kibana pods deployed, so no access to the logging UI." + raise OpenShiftCheckException( + "MissingComponentPods", + "There are no Kibana pods deployed, so no access to the logging UI." + ) not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return "No Kibana pod is in a running state, so there is no access to the logging UI." + raise OpenShiftCheckException( + "NoRunningPods", + "No Kibana pod is in a running state, so there is no access to the logging UI." + ) elif not_running: - return ( + raise OpenShiftCheckException( + "PodNotRunning", "The following Kibana pods are not currently in a running state:\n" - "{pods}" - "However at least one is, so service may not be impacted." - ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running)) + " {pods}\n" + "However at least one is, so service may not be impacted.".format( + pods="\n ".join(pod['metadata']['name'] for pod in not_running) + ) + ) - return None - - def _get_kibana_url(self, task_vars): + def _get_kibana_url(self): """ Get kibana route or report error. - Returns: url (or empty), reason for failure + Returns: url """ # Get logging url - get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars) + get_route = self.exec_oc("get route logging-kibana -o json", []) if not get_route: - return None, 'no_route_exists' + raise OpenShiftCheckException( + 'no_route_exists', + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ) - route = json.loads(get_route) + try: + route = json.loads(get_route) + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + except (ValueError, KeyError): + raise OpenShiftCheckException( + 'get_route_failed', + '"oc get route" returned an unexpected response:\n' + get_route + ) - # check that the route has been accepted by a router - ingress = route["status"]["ingress"] # ingress can be null if there is no router, or empty if not routed if not ingress or not ingress[0]: - return None, 'route_not_accepted' + raise OpenShiftCheckException( + 'route_not_accepted', + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ) host = route.get("spec", {}).get("host") if not host: - return None, 'route_missing_host' + raise OpenShiftCheckException( + 'route_missing_host', + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ) - return 'https://{}/'.format(host), None + return 'https://{}/'.format(host) - def _check_kibana_route(self, task_vars): + def check_kibana_route(self): """ Check to see if kibana route is up and working. - Returns: error string + Raises exception if not. """ - known_errors = dict( - no_route_exists=( - 'No route is defined for Kibana in the logging namespace,\n' - 'so the logging stack is not accessible. Is logging deployed?\n' - 'Did something remove the logging-kibana route?' - ), - route_not_accepted=( - 'The logging-kibana route is not being routed by any router.\n' - 'Is the router deployed and working?' - ), - route_missing_host=( - 'The logging-kibana route has no hostname defined,\n' - 'which should never happen. Did something alter its definition?' - ), - ) - kibana_url, error = self._get_kibana_url(task_vars) - if not kibana_url: - return known_errors.get(error, error) + kibana_url = self._get_kibana_url() # first, check that kibana is reachable from the master. - error = self._verify_url_internal(kibana_url, task_vars) + error = self._verify_url_internal(kibana_url) if error: if 'urlopen error [Errno 111] Connection refused' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToConnectInternal', 'Failed to connect from this master to Kibana URL {url}\n' - 'Is kibana running, and is at least one router routing to it?' - ).format(url=kibana_url) + 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url) + ) elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToResolveInternal', 'Failed to connect from this master to Kibana URL {url}\n' 'because the hostname does not resolve.\n' - 'Is DNS configured for the Kibana hostname?' - ).format(url=kibana_url) + 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url) + ) elif 'Status code was not' in error: - error = ( + raise OpenShiftCheckException( + 'WrongReturnCodeInternal', 'A request from this master to the Kibana URL {url}\n' 'did not return the correct status code (302).\n' 'This could mean that Kibana is malfunctioning, the hostname is\n' 'resolving incorrectly, or other network issues. The output was:\n' - ' {error}' - ).format(url=kibana_url, error=error) - return 'Error validating the logging Kibana route:\n' + error + ' {error}'.format(url=kibana_url, error=error) + ) + raise OpenShiftCheckException( + 'MiscRouteErrorInternal', + 'Error validating the logging Kibana route internally:\n' + error + ) # in production we would like the kibana route to work from outside the # cluster too; but that may not be the case, so allow disabling just this part. - if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True): - return None + if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true": + return error = self._verify_url_external(kibana_url) - if error: - if 'urlopen error [Errno 111] Connection refused' in error: - error = ( - 'Failed to connect from the Ansible control host to Kibana URL {url}\n' - 'Is the router for the Kibana hostname exposed externally?' - ).format(url=kibana_url) - elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( - 'Failed to resolve the Kibana hostname in {url}\n' - 'from the Ansible control host.\n' - 'Is DNS configured to resolve this Kibana hostname externally?' - ).format(url=kibana_url) - elif 'Expected success (200)' in error: - error = ( - 'A request to Kibana at {url}\n' - 'returned the wrong error code:\n' - ' {error}\n' - 'This could mean that Kibana is malfunctioning, the hostname is\n' - 'resolving incorrectly, or other network issues.' - ).format(url=kibana_url, error=error) - error = ( - 'Error validating the logging Kibana route:\n{error}\n' - 'To disable external Kibana route validation, set in your inventory:\n' - ' openshift_check_efk_kibana_external=False' - ).format(error=error) - return error - return None - def _exec_oc(self, cmd_str, extra_args, task_vars): - return super(Kibana, self).exec_oc(self.execute_module, - self.logging_namespace, - cmd_str, - extra_args, - task_vars) + if not error: + return + + error_fmt = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set the variable:\n' + ' openshift_check_efk_kibana_external=False' + ) + if 'urlopen error [Errno 111] Connection refused' in error: + msg = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg)) + elif 'urlopen error [Errno -2] Name or service not known' in error: + msg = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg)) + elif 'Expected success (200)' in error: + msg = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg)) + raise OpenShiftCheckException( + 'MiscRouteError', + 'Error validating the logging Kibana route externally:\n' + error + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 05b4d300c..ecd8adb64 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -5,92 +5,105 @@ Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana st import json import os -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +class MissingComponentPods(OpenShiftCheckException): + """Raised when a component has no pods in the namespace.""" + pass + + +class CouldNotUseOc(OpenShiftCheckException): + """Raised when ocutil has a failure running oc.""" + pass class LoggingCheck(OpenShiftCheck): - """Base class for logging component checks""" + """Base class for OpenShift aggregated logging component checks""" + + # FIXME: this should not be listed as a check, since it is not meant to be + # run by itself. name = "logging" - @classmethod - def is_active(cls, task_vars): - return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars) + def is_active(self): + logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False) + return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master() - @staticmethod - def is_first_master(task_vars): - """Run only on first master and only when logging is configured. Returns: bool""" - logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True) + def is_first_master(self): + """Determine if running on first master. Returns: bool""" # Note: It would be nice to use membership in oo_first_master group, however for now it # seems best to avoid requiring that setup and just check this is the first master. - hostname = get_var(task_vars, "ansible_ssh_host") or [None] - masters = get_var(task_vars, "groups", "masters", default=None) or [None] - return logging_deployed and masters[0] == hostname + hostname = self.get_var("ansible_ssh_host") or [None] + masters = self.get_var("groups", "masters", default=None) or [None] + return masters[0] == hostname - def run(self, tmp, task_vars): - pass + def run(self): + return {} - def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars): - """Get all pods for a given component. Returns: list of pods for component, error string""" + def get_pods_for_component(self, logging_component): + """Get all pods for a given component. Returns: list of pods.""" pod_output = self.exec_oc( - execute_module, - namespace, "get pods -l component={} -o json".format(logging_component), [], - task_vars ) try: - pods = json.loads(pod_output) - if not pods or not pods.get('items'): + pods = json.loads(pod_output) # raises ValueError if deserialize fails + if not pods or not pods.get('items'): # also a broken response, treat the same raise ValueError() except ValueError: - # successful run but non-parsing data generally means there were no pods in the namespace - return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace) + # successful run but non-parsing data generally means there were no pods to be found + raise MissingComponentPods( + 'There are no "{}" component pods in the "{}" namespace.\n' + 'Is logging deployed?'.format(logging_component, self.logging_namespace()) + ) - return pods['items'], None + return pods['items'] @staticmethod def not_running_pods(pods): """Returns: list of pods not in a ready and running state""" return [ pod for pod in pods - if any( + if not pod.get("status", {}).get("containerStatuses") or any( container['ready'] is False for container in pod['status']['containerStatuses'] ) or not any( condition['type'] == 'Ready' and condition['status'] == 'True' - for condition in pod['status']['conditions'] + for condition in pod['status'].get('conditions', []) ) ] - @staticmethod - def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None): + def logging_namespace(self): + """Returns the namespace in which logging is configured to deploy.""" + return self.get_var("openshift_logging_namespace", default="logging") + + def exec_oc(self, cmd_str="", extra_args=None): """ Execute an 'oc' command in the remote host. Returns: output of command and namespace, - or raises OpenShiftCheckException on error + or raises CouldNotUseOc on error """ - config_base = get_var(task_vars, "openshift", "common", "config_base") + config_base = self.get_var("openshift", "common", "config_base") args = { - "namespace": namespace, + "namespace": self.logging_namespace(), "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), "cmd": cmd_str, "extra_args": list(extra_args) if extra_args else [], } - result = execute_module("ocutil", args, task_vars) + result = self.execute_module("ocutil", args) if result.get("failed"): - msg = ( - 'Unexpected error using `oc` to validate the logging stack components.\n' - 'Error executing `oc {cmd}`:\n' - '{error}' - ).format(cmd=args['cmd'], error=result['result']) - if result['result'] == '[Errno 2] No such file or directory': - msg = ( + raise CouldNotUseOc( "This host is supposed to be a master but does not have the `oc` command where expected.\n" "Has an installation been run on this host yet?" ) - raise OpenShiftCheckException(msg) + + raise CouldNotUseOc( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}'.format(cmd=args['cmd'], error=result['result']) + ) return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py new file mode 100644 index 000000000..d781db649 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -0,0 +1,129 @@ +""" +Check for ensuring logs from pods can be queried in a reasonable amount of time. +""" + +import json +import time + +from uuid import uuid4 + +from openshift_checks import OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck + + +ES_CMD_TIMEOUT_SECONDS = 30 + + +class LoggingIndexTime(LoggingCheck): + """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time.""" + name = "logging_index_time" + tags = ["health", "logging"] + + def run(self): + """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.""" + try: + log_index_timeout = int( + self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS) + ) + except ValueError: + raise OpenShiftCheckException( + 'InvalidTimeout', + 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' + 'Value must be an integer representing an amount in seconds.' + ) + + running_component_pods = dict() + + # get all component pods + for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): + pods = self.get_pods_for_component(component) + running_pods = self.running_pods(pods) + + if not running_pods: + raise OpenShiftCheckException( + component + 'NoRunningPods', + 'No {} pods in the "Running" state were found.' + 'At least one pod is required in order to perform this check.'.format(name) + ) + + running_component_pods[component] = running_pods + + uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0]) + self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout) + return {} + + def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs): + """Retry an Elasticsearch query every second until query success, or a defined + length of time has passed.""" + deadline = time.time() + timeout_secs + interval = 1 + while not self.query_es_from_es(es_pod, uuid): + if time.time() + interval > deadline: + raise OpenShiftCheckException( + "NoMatchFound", + "expecting match in Elasticsearch for message with uuid {}, " + "but no matches were found after {}s.".format(uuid, timeout_secs) + ) + time.sleep(interval) + + def curl_kibana_with_uuid(self, kibana_pod): + """curl Kibana with a unique uuid.""" + uuid = self.generate_uuid() + pod_name = kibana_pod["metadata"]["name"] + exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" + exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) + + error_str = self.exec_oc(exec_cmd, []) + + try: + error_code = json.loads(error_str)["statusCode"] + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'kibanaInvalidResponse', + 'invalid response returned from Kibana request:\n' + 'Command: {}\nResponse: {}'.format(exec_cmd, error_str) + ) + + if error_code != 404: + raise OpenShiftCheckException( + 'kibanaInvalidReturnCode', + 'invalid error code returned from Kibana request.\n' + 'Expecting error code "404", but got "{}" instead.'.format(error_code) + ) + + return uuid + + def query_es_from_es(self, es_pod, uuid): + """curl the Elasticsearch pod and look for a unique uuid in its logs.""" + pod_name = es_pod["metadata"]["name"] + exec_cmd = ( + "exec {pod_name} -- curl --max-time 30 -s -f " + "--cacert /etc/elasticsearch/secret/admin-ca " + "--cert /etc/elasticsearch/secret/admin-cert " + "--key /etc/elasticsearch/secret/admin-key " + "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" + ) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) + result = self.exec_oc(exec_cmd, []) + + try: + count = json.loads(result)["count"] + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'esInvalidResponse', + 'Invalid response from Elasticsearch query:\n' + ' {}\n' + 'Response was:\n{}'.format(exec_cmd, result) + ) + + return count + + @staticmethod + def running_pods(pods): + """Filter pods that are running.""" + return [pod for pod in pods if pod['status']['phase'] == 'Running'] + + @staticmethod + def generate_uuid(): + """Wrap uuid generator. Allows for testing with expected values.""" + return str(uuid4()) diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py index f4e31065f..765ba072d 100644 --- a/roles/openshift_health_checker/openshift_checks/memory_availability.py +++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py @@ -1,5 +1,5 @@ -# pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, get_var +"""Check that recommended memory is available.""" +from openshift_checks import OpenShiftCheck MIB = 2**20 GIB = 2**30 @@ -21,19 +21,18 @@ class MemoryAvailability(OpenShiftCheck): # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal memtotal_adjustment = 1 * GIB - @classmethod - def is_active(cls, task_vars): + def is_active(self): """Skip hosts that do not have recommended memory requirements.""" - group_names = get_var(task_vars, "group_names", default=[]) - has_memory_recommendation = bool(set(group_names).intersection(cls.recommended_memory_bytes)) - return super(MemoryAvailability, cls).is_active(task_vars) and has_memory_recommendation + group_names = self.get_var("group_names", default=[]) + has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes)) + return super(MemoryAvailability, self).is_active() and has_memory_recommendation - def run(self, tmp, task_vars): - group_names = get_var(task_vars, "group_names") - total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB + def run(self): + group_names = self.get_var("group_names") + total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names) - configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB + configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB min_memory_bytes = configured_min or recommended_min if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes: diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index 20d160eaf..e9bae60a3 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -1,15 +1,53 @@ -# pylint: disable=missing-docstring,too-few-public-methods """ Mixin classes meant to be used with subclasses of OpenShiftCheck. """ -from openshift_checks import get_var - class NotContainerizedMixin(object): """Mixin for checks that are only active when not in containerized mode.""" + # permanent # pylint: disable=too-few-public-methods + # Reason: The mixin is not intended to stand on its own as a class. + + def is_active(self): + """Only run on non-containerized hosts.""" + is_containerized = self.get_var("openshift", "common", "is_containerized") + return super(NotContainerizedMixin, self).is_active() and not is_containerized + + +class DockerHostMixin(object): + """Mixin for checks that are only active on hosts that require Docker.""" + + dependencies = [] + + def is_active(self): + """Only run on hosts that depend on Docker.""" + is_containerized = self.get_var("openshift", "common", "is_containerized") + is_node = "nodes" in self.get_var("group_names", default=[]) + return super(DockerHostMixin, self).is_active() and (is_containerized or is_node) + + def ensure_dependencies(self): + """ + Ensure that docker-related packages exist, but not on atomic hosts + (which would not be able to install but should already have them). + Returns: msg, failed + """ + if self.get_var("openshift", "common", "is_atomic"): + return "", False - @classmethod - def is_active(cls, task_vars): - is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") - return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized + # NOTE: we would use the "package" module but it's actually an action plugin + # and it's not clear how to invoke one of those. This is about the same anyway: + result = self.execute_module( + self.get_var("ansible_pkg_mgr", default="yum"), + {"name": self.dependencies, "state": "present"}, + ) + msg = result.get("msg", "") + if result.get("failed"): + if "No package matching" in msg: + msg = "Ensure that all required dependencies can be installed via `yum`.\n" + msg = ( + "Unable to install required packages on this host:\n" + " {deps}\n{msg}" + ).format(deps=',\n '.join(self.dependencies), msg=msg) + failed = result.get("failed", False) or result.get("rc", 0) != 0 + self.changed = result.get("changed", False) + return msg, failed diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py index 1e45ae3af..363c12def 100644 --- a/roles/openshift_health_checker/openshift_checks/ovs_version.py +++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py @@ -3,7 +3,7 @@ Ansible module for determining if an installed version of Open vSwitch is incomp currently installed version of OpenShift. """ -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks import OpenShiftCheck, OpenShiftCheckException from openshift_checks.mixins import NotContainerizedMixin @@ -16,63 +16,39 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck): tags = ["health"] openshift_to_ovs_version = { - "3.6": "2.6", - "3.5": "2.6", + "3.6": ["2.6", "2.7"], + "3.5": ["2.6", "2.7"], "3.4": "2.4", } - # map major release versions across releases - # to a common major version - openshift_major_release_version = { - "1": "3", - } - - @classmethod - def is_active(cls, task_vars): + def is_active(self): """Skip hosts that do not have package requirements.""" - group_names = get_var(task_vars, "group_names", default=[]) + group_names = self.get_var("group_names", default=[]) master_or_node = 'masters' in group_names or 'nodes' in group_names - return super(OvsVersion, cls).is_active(task_vars) and master_or_node + return super(OvsVersion, self).is_active() and master_or_node - def run(self, tmp, task_vars): + def run(self): args = { "package_list": [ { "name": "openvswitch", - "version": self.get_required_ovs_version(task_vars), + "version": self.get_required_ovs_version(), }, ], } - return self.execute_module("rpm_version", args, task_vars) + return self.execute_module("rpm_version", args) - def get_required_ovs_version(self, task_vars): + def get_required_ovs_version(self): """Return the correct Open vSwitch version for the current OpenShift version""" - openshift_version = self._get_openshift_version(task_vars) + openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag")) - if float(openshift_version) < 3.5: + if openshift_version_tuple < (3, 5): return self.openshift_to_ovs_version["3.4"] - ovs_version = self.openshift_to_ovs_version.get(str(openshift_version)) + openshift_version = ".".join(str(x) for x in openshift_version_tuple) + ovs_version = self.openshift_to_ovs_version.get(openshift_version) if ovs_version: - return self.openshift_to_ovs_version[str(openshift_version)] + return self.openshift_to_ovs_version[openshift_version] msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" raise OpenShiftCheckException(msg.format(openshift_version)) - - def _get_openshift_version(self, task_vars): - openshift_version = get_var(task_vars, "openshift_image_tag") - if openshift_version and openshift_version[0] == 'v': - openshift_version = openshift_version[1:] - - return self._parse_version(openshift_version) - - def _parse_version(self, version): - components = version.split(".") - if not components or len(components) < 2: - msg = "An invalid version of OpenShift was found for this host: {}" - raise OpenShiftCheckException(msg.format(version)) - - if components[0] in self.openshift_major_release_version: - components[0] = self.openshift_major_release_version[components[0]] - - return '.'.join(components[:2]) diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py index a7eb720fd..a86180b00 100644 --- a/roles/openshift_health_checker/openshift_checks/package_availability.py +++ b/roles/openshift_health_checker/openshift_checks/package_availability.py @@ -1,5 +1,6 @@ -# pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, get_var +"""Check that required RPM packages are available.""" + +from openshift_checks import OpenShiftCheck from openshift_checks.mixins import NotContainerizedMixin @@ -9,13 +10,13 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): name = "package_availability" tags = ["preflight"] - @classmethod - def is_active(cls, task_vars): - return super(PackageAvailability, cls).is_active(task_vars) and task_vars["ansible_pkg_mgr"] == "yum" + def is_active(self): + """Run only when yum is the package manager as the code is specific to it.""" + return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum" - def run(self, tmp, task_vars): - rpm_prefix = get_var(task_vars, "openshift", "common", "service_type") - group_names = get_var(task_vars, "group_names", default=[]) + def run(self): + rpm_prefix = self.get_var("openshift", "common", "service_type") + group_names = self.get_var("group_names", default=[]) packages = set() @@ -25,10 +26,11 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): packages.update(self.node_packages(rpm_prefix)) args = {"packages": sorted(set(packages))} - return self.execute_module("check_yum_update", args, tmp, task_vars) + return self.execute_module("check_yum_update", args) @staticmethod def master_packages(rpm_prefix): + """Return a list of RPMs that we expect a master install to have available.""" return [ "{rpm_prefix}".format(rpm_prefix=rpm_prefix), "{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix), @@ -36,8 +38,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): "bash-completion", "cockpit-bridge", "cockpit-docker", - "cockpit-kubernetes", - "cockpit-shell", + "cockpit-system", "cockpit-ws", "etcd", "httpd-tools", @@ -45,6 +46,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): @staticmethod def node_packages(rpm_prefix): + """Return a list of RPMs that we expect a node install to have available.""" return [ "{rpm_prefix}".format(rpm_prefix=rpm_prefix), "{rpm_prefix}-node".format(rpm_prefix=rpm_prefix), diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py index fd0c0a755..1e9aecbe0 100644 --- a/roles/openshift_health_checker/openshift_checks/package_update.py +++ b/roles/openshift_health_checker/openshift_checks/package_update.py @@ -1,14 +1,14 @@ -# pylint: disable=missing-docstring +"""Check that a yum update would not run into conflicts with available packages.""" from openshift_checks import OpenShiftCheck from openshift_checks.mixins import NotContainerizedMixin class PackageUpdate(NotContainerizedMixin, OpenShiftCheck): - """Check that there are no conflicts in RPM packages.""" + """Check that a yum update would not run into conflicts with available packages.""" name = "package_update" tags = ["preflight"] - def run(self, tmp, task_vars): + def run(self): args = {"packages": []} - return self.execute_module("check_yum_update", args, tmp, task_vars) + return self.execute_module("check_yum_update", args) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py index 2e737818b..8b780114f 100644 --- a/roles/openshift_health_checker/openshift_checks/package_version.py +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -1,5 +1,8 @@ -# pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +"""Check that available RPM packages match the required versions.""" + +import re + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException from openshift_checks.mixins import NotContainerizedMixin @@ -9,48 +12,49 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): name = "package_version" tags = ["preflight"] + # NOTE: versions outside those specified are mapped to least/greatest openshift_to_ovs_version = { - "3.6": "2.6", - "3.5": "2.6", - "3.4": "2.4", + (3, 4): "2.4", + (3, 5): ["2.6", "2.7"], + (3, 6): ["2.6", "2.7"], } openshift_to_docker_version = { - "3.1": "1.8", - "3.2": "1.10", - "3.3": "1.10", - "3.4": "1.12", + (3, 1): "1.8", + (3, 2): "1.10", + (3, 3): "1.10", + (3, 4): "1.12", + (3, 5): "1.12", + (3, 6): "1.12", } - # map major release versions across releases - # to a common major version - openshift_major_release_version = { - "1": "3", + # map major OpenShift release versions across releases to a common major version + map_major_release_version = { + 1: 3, } - @classmethod - def is_active(cls, task_vars): + def is_active(self): """Skip hosts that do not have package requirements.""" - group_names = get_var(task_vars, "group_names", default=[]) + group_names = self.get_var("group_names", default=[]) master_or_node = 'masters' in group_names or 'nodes' in group_names - return super(PackageVersion, cls).is_active(task_vars) and master_or_node + return super(PackageVersion, self).is_active() and master_or_node - def run(self, tmp, task_vars): - rpm_prefix = get_var(task_vars, "openshift", "common", "service_type") - openshift_release = get_var(task_vars, "openshift_release", default='') - deployment_type = get_var(task_vars, "openshift_deployment_type") + def run(self): + rpm_prefix = self.get_var("openshift", "common", "service_type") + openshift_release = self.get_var("openshift_release", default='') + deployment_type = self.get_var("openshift_deployment_type") check_multi_minor_release = deployment_type in ['openshift-enterprise'] args = { "package_list": [ { "name": "openvswitch", - "version": self.get_required_ovs_version(task_vars), + "version": self.get_required_ovs_version(), "check_multi": False, }, { "name": "docker", - "version": self.get_required_docker_version(task_vars), + "version": self.get_required_docker_version(), "check_multi": False, }, { @@ -71,55 +75,52 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): ], } - return self.execute_module("aos_version", args, tmp, task_vars) + return self.execute_module("aos_version", args) - def get_required_ovs_version(self, task_vars): - """Return the correct Open vSwitch version for the current OpenShift version. - If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6, - Else ensure Open vSwitch version 2.4""" - openshift_version = self.get_openshift_version(task_vars) + def get_required_ovs_version(self): + """Return the correct Open vSwitch version(s) for the current OpenShift version.""" + openshift_version = self.get_openshift_version_tuple() - if float(openshift_version) < 3.5: - return self.openshift_to_ovs_version["3.4"] + earliest = min(self.openshift_to_ovs_version) + latest = max(self.openshift_to_ovs_version) + if openshift_version < earliest: + return self.openshift_to_ovs_version[earliest] + if openshift_version > latest: + return self.openshift_to_ovs_version[latest] - ovs_version = self.openshift_to_ovs_version.get(str(openshift_version)) - if ovs_version: - return ovs_version + ovs_version = self.openshift_to_ovs_version.get(openshift_version) + if not ovs_version: + msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version))) - msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" - raise OpenShiftCheckException(msg.format(openshift_version)) + return ovs_version - def get_required_docker_version(self, task_vars): - """Return the correct Docker version for the current OpenShift version. - If the OpenShift version is 3.1, ensure Docker version 1.8. - If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10. - If the current OpenShift version is >= 3.4, ensure Docker version 1.12.""" - openshift_version = self.get_openshift_version(task_vars) + def get_required_docker_version(self): + """Return the correct Docker version(s) for the current OpenShift version.""" + openshift_version = self.get_openshift_version_tuple() - if float(openshift_version) >= 3.4: - return self.openshift_to_docker_version["3.4"] + earliest = min(self.openshift_to_docker_version) + latest = max(self.openshift_to_docker_version) + if openshift_version < earliest: + return self.openshift_to_docker_version[earliest] + if openshift_version > latest: + return self.openshift_to_docker_version[latest] - docker_version = self.openshift_to_docker_version.get(str(openshift_version)) - if docker_version: - return docker_version + docker_version = self.openshift_to_docker_version.get(openshift_version) + if not docker_version: + msg = "There is no recommended version of Docker for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version))) - msg = "There is no recommended version of Docker for the current version of OpenShift: {}" - raise OpenShiftCheckException(msg.format(openshift_version)) + return docker_version - def get_openshift_version(self, task_vars): - openshift_version = get_var(task_vars, "openshift_image_tag") - if openshift_version and openshift_version[0] == 'v': - openshift_version = openshift_version[1:] + def get_openshift_version_tuple(self): + """Return received image tag as a normalized (X, Y) minor version tuple.""" + version = self.get_var("openshift_image_tag") + comps = [int(component) for component in re.findall(r'\d+', version)] - return self.parse_version(openshift_version) - - def parse_version(self, version): - components = version.split(".") - if not components or len(components) < 2: + if len(comps) < 2: msg = "An invalid version of OpenShift was found for this host: {}" raise OpenShiftCheckException(msg.format(version)) - if components[0] in self.openshift_major_release_version: - components[0] = self.openshift_major_release_version[components[0]] - - return '.'.join(components[:2]) + comps[0] = self.map_major_release_version.get(comps[0], comps[0]) + return tuple(comps[0:2]) diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py index 6ebf0ebb2..f5161d6f5 100644 --- a/roles/openshift_health_checker/test/action_plugin_test.py +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -6,7 +6,7 @@ from openshift_health_check import ActionModule, resolve_checks from openshift_checks import OpenShiftCheckException -def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None): +def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None, changed=False): """Returns a new class that is compatible with OpenShiftCheck for testing.""" _name, _tags = name, tags @@ -14,15 +14,16 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru class FakeCheck(object): name = _name tags = _tags or [] + changed = False - def __init__(self, execute_module=None): + def __init__(self, execute_module=None, task_vars=None, tmp=None): pass - @classmethod - def is_active(cls, task_vars): + def is_active(self): return is_active - def run(self, tmp, task_vars): + def run(self): + self.changed = changed if run_exception is not None: raise run_exception return run_return @@ -59,7 +60,7 @@ def failed(result, msg_has=None): if msg_has is not None: assert 'msg' in result for term in msg_has: - assert term in result['msg'] + assert term.lower() in result['msg'].lower() return result.get('failed', False) @@ -124,7 +125,7 @@ def test_action_plugin_skip_disabled_checks(plugin, task_vars, monkeypatch): def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch): check_return_value = {'ok': 'test'} check_class = fake_check(run_return=check_return_value) - monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()}) + monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()}) monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) result = plugin.run(tmp=None, task_vars=task_vars) @@ -136,14 +137,15 @@ def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch): def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch): - check_return_value = {'ok': 'test', 'changed': True} - check_class = fake_check(run_return=check_return_value) - monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()}) + check_return_value = {'ok': 'test'} + check_class = fake_check(run_return=check_return_value, changed=True) + monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()}) monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) result = plugin.run(tmp=None, task_vars=task_vars) assert result['checks']['fake_check'] == check_return_value + assert changed(result['checks']['fake_check']) assert not failed(result) assert changed(result) assert not skipped(result) @@ -152,7 +154,7 @@ def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch): def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch): check_return_value = {'failed': True} check_class = fake_check(run_return=check_return_value) - monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()}) + monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()}) monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) result = plugin.run(tmp=None, task_vars=task_vars) @@ -166,14 +168,25 @@ def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch): def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch): exception_msg = 'fake check has an exception' run_exception = OpenShiftCheckException(exception_msg) - check_class = fake_check(run_exception=run_exception) - monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()}) + check_class = fake_check(run_exception=run_exception, changed=True) + monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()}) monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) result = plugin.run(tmp=None, task_vars=task_vars) assert failed(result['checks']['fake_check'], msg_has=exception_msg) assert failed(result, msg_has=['failed']) + assert changed(result['checks']['fake_check']) + assert changed(result) + assert not skipped(result) + + +def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch): + monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {}) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert failed(result, msg_has=['unknown', 'name']) assert not changed(result) assert not skipped(result) diff --git a/roles/openshift_health_checker/test/aos_version_test.py b/roles/openshift_health_checker/test/aos_version_test.py index 697805dd2..4100f6c70 100644 --- a/roles/openshift_health_checker/test/aos_version_test.py +++ b/roles/openshift_health_checker/test/aos_version_test.py @@ -18,7 +18,43 @@ expected_pkgs = { } -@pytest.mark.parametrize('pkgs, expect_not_found', [ +@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [ + ( + # all found + [Package('spam', '3.2.1'), Package('eggs', '3.2.1')], + expected_pkgs, + ), + ( + # found with more specific version + [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')], + expected_pkgs, + ), + ( + [Package('ovs', '2.6'), Package('ovs', '2.4')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ), + ( + [Package('ovs', '2.7')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ), +]) +def test_check_precise_version_found(pkgs, expected_pkgs_dict): + aos_version._check_precise_version_found(pkgs, expected_pkgs_dict) + + +@pytest.mark.parametrize('pkgs,expect_not_found', [ ( [], { @@ -55,14 +91,6 @@ expected_pkgs = { }, # not the right version ), ( - [Package('spam', '3.2.1'), Package('eggs', '3.2.1')], - {}, # all found - ), - ( - [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')], - {}, # found with more specific version - ), - ( [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5')], { "spam": { @@ -73,64 +101,86 @@ expected_pkgs = { }, # eggs found with multiple versions ), ]) -def test_check_pkgs_for_precise_version(pkgs, expect_not_found): - if expect_not_found: - with pytest.raises(aos_version.PreciseVersionNotFound) as e: - aos_version._check_precise_version_found(pkgs, expected_pkgs) - - assert list(expect_not_found.values()) == e.value.problem_pkgs - else: +def test_check_precise_version_found_fail(pkgs, expect_not_found): + with pytest.raises(aos_version.PreciseVersionNotFound) as e: aos_version._check_precise_version_found(pkgs, expected_pkgs) + assert list(expect_not_found.values()) == e.value.problem_pkgs -@pytest.mark.parametrize('pkgs, expect_higher', [ +@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [ ( [], - [], + expected_pkgs, ), ( + # more precise but not strictly higher [Package('spam', '3.2.1.9')], - [], # more precise but not strictly higher + expected_pkgs, ), ( + [Package('ovs', '2.7')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ), +]) +def test_check_higher_version_found(pkgs, expected_pkgs_dict): + aos_version._check_higher_version_found(pkgs, expected_pkgs_dict) + + +@pytest.mark.parametrize('pkgs,expected_pkgs_dict,expect_higher', [ + ( [Package('spam', '3.3')], + expected_pkgs, ['spam-3.3'], # lower precision, but higher ), ( [Package('spam', '3.2.1'), Package('eggs', '3.3.2')], + expected_pkgs, ['eggs-3.3.2'], # one too high ), ( [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')], + expected_pkgs, ['eggs-3.4'], # multiple versions, one is higher ), ( [Package('eggs', '3.2.1'), Package('eggs', '3.4'), Package('eggs', '3.3')], + expected_pkgs, ['eggs-3.4'], # multiple versions, two are higher ), + ( + [Package('ovs', '2.8')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ['ovs-2.8'], + ), ]) -def test_check_pkgs_for_greater_version(pkgs, expect_higher): - if expect_higher: - with pytest.raises(aos_version.FoundHigherVersion) as e: - aos_version._check_higher_version_found(pkgs, expected_pkgs) - assert set(expect_higher) == set(e.value.problem_pkgs) - else: - aos_version._check_higher_version_found(pkgs, expected_pkgs) +def test_check_higher_version_found_fail(pkgs, expected_pkgs_dict, expect_higher): + with pytest.raises(aos_version.FoundHigherVersion) as e: + aos_version._check_higher_version_found(pkgs, expected_pkgs_dict) + assert set(expect_higher) == set(e.value.problem_pkgs) -@pytest.mark.parametrize('pkgs, expect_to_flag_pkgs', [ - ( - [], - [], - ), - ( - [Package('spam', '3.2.1')], - [], - ), - ( - [Package('spam', '3.2.1'), Package('eggs', '3.2.2')], - [], - ), +@pytest.mark.parametrize('pkgs', [ + [], + [Package('spam', '3.2.1')], + [Package('spam', '3.2.1'), Package('eggs', '3.2.2')], +]) +def test_check_multi_minor_release(pkgs): + aos_version._check_multi_minor_release(pkgs, expected_pkgs) + + +@pytest.mark.parametrize('pkgs,expect_to_flag_pkgs', [ ( [Package('spam', '3.2.1'), Package('spam', '3.3.2')], ['spam'], @@ -140,10 +190,7 @@ def test_check_pkgs_for_greater_version(pkgs, expect_higher): ['eggs'], ), ]) -def test_check_pkgs_for_multi_release(pkgs, expect_to_flag_pkgs): - if expect_to_flag_pkgs: - with pytest.raises(aos_version.FoundMultiRelease) as e: - aos_version._check_multi_minor_release(pkgs, expected_pkgs) - assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs) - else: +def test_check_multi_minor_release_fail(pkgs, expect_to_flag_pkgs): + with pytest.raises(aos_version.FoundMultiRelease) as e: aos_version._check_multi_minor_release(pkgs, expected_pkgs) + assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs) diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py index ae108c96e..62c680b74 100644 --- a/roles/openshift_health_checker/test/curator_test.py +++ b/roles/openshift_health_checker/test/curator_test.py @@ -1,22 +1,6 @@ import pytest -from openshift_checks.logging.curator import Curator - - -def canned_curator(exec_oc=None): - """Create a Curator check object with canned exec_oc method""" - check = Curator("dummy") # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +from openshift_checks.logging.curator import Curator, OpenShiftCheckException plain_curator_pod = { @@ -44,25 +28,30 @@ not_running_curator_pod = { } +def test_get_curator_pods(): + check = Curator() + check.get_pods_for_component = lambda *_: [plain_curator_pod] + result = check.run() + assert "failed" not in result or not result["failed"] + + @pytest.mark.parametrize('pods, expect_error', [ ( [], - "no Curator pods", - ), - ( - [plain_curator_pod], - None, + 'MissingComponentPods', ), ( [not_running_curator_pod], - "not currently in a running state", + 'CuratorNotRunning', ), ( [plain_curator_pod, plain_curator_pod], - "more than one Curator pod", + 'TooManyCurators', ), ]) -def test_get_curator_pods(pods, expect_error): - check = canned_curator() - error = check.check_curator(pods) - assert_error(error, expect_error) +def test_get_curator_pods_fail(pods, expect_error): + check = Curator() + check.get_pods_for_component = lambda *_: pods + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run() + assert excinfo.value.name == expect_error diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py index b353fa610..f4fd2dfed 100644 --- a/roles/openshift_health_checker/test/disk_availability_test.py +++ b/roles/openshift_health_checker/test/disk_availability_test.py @@ -3,43 +3,51 @@ import pytest from openshift_checks.disk_availability import DiskAvailability, OpenShiftCheckException -@pytest.mark.parametrize('group_names,is_containerized,is_active', [ - (['masters'], False, True), - # ensure check is skipped on containerized installs - (['masters'], True, False), - (['nodes'], False, True), - (['etcd'], False, True), - (['masters', 'nodes'], False, True), - (['masters', 'etcd'], False, True), - ([], False, False), - (['lb'], False, False), - (['nfs'], False, False), +@pytest.mark.parametrize('group_names,is_active', [ + (['masters'], True), + (['nodes'], True), + (['etcd'], True), + (['masters', 'nodes'], True), + (['masters', 'etcd'], True), + ([], False), + (['lb'], False), + (['nfs'], False), ]) -def test_is_active(group_names, is_containerized, is_active): +def test_is_active(group_names, is_active): task_vars = dict( group_names=group_names, - openshift=dict(common=dict(is_containerized=is_containerized)), ) - assert DiskAvailability.is_active(task_vars=task_vars) == is_active + assert DiskAvailability(None, task_vars).is_active() == is_active -@pytest.mark.parametrize('ansible_mounts,extra_words', [ - ([], ['none']), # empty ansible_mounts - ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths - ([{'mount': '/var'}], ['/var']), # missing size_available +@pytest.mark.parametrize('desc, ansible_mounts, expect_chunks', [ + ( + 'empty ansible_mounts', + [], + ['determine mount point', 'none'], + ), + ( + 'missing relevant mount paths', + [{'mount': '/mnt'}], + ['determine mount point', '/mnt'], + ), + ( + 'missing size_available', + [{'mount': '/var'}, {'mount': '/usr'}, {'mount': '/tmp'}], + ['missing', 'size_available'], + ), ]) -def test_cannot_determine_available_disk(ansible_mounts, extra_words): +def test_cannot_determine_available_disk(desc, ansible_mounts, expect_chunks): task_vars = dict( group_names=['masters'], ansible_mounts=ansible_mounts, ) - check = DiskAvailability(execute_module=fake_execute_module) with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + DiskAvailability(fake_execute_module, task_vars).run() - for word in 'determine available disk'.split() + extra_words: - assert word in str(excinfo.value) + for chunk in expect_chunks: + assert chunk in str(excinfo.value) @pytest.mark.parametrize('group_names,configured_min,ansible_mounts', [ @@ -81,7 +89,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words): [{ # not enough space on / ... 'mount': '/', - 'size_available': 0, + 'size_available': 2 * 10**9, }, { # ... but enough on /var 'mount': '/var', @@ -96,14 +104,14 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib ansible_mounts=ansible_mounts, ) - check = DiskAvailability(execute_module=fake_execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = DiskAvailability(fake_execute_module, task_vars).run() assert not result.get('failed', False) -@pytest.mark.parametrize('group_names,configured_min,ansible_mounts,extra_words', [ +@pytest.mark.parametrize('name,group_names,configured_min,ansible_mounts,expect_chunks', [ ( + 'test with no space available', ['masters'], 0, [{ @@ -113,6 +121,7 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib ['0.0 GB'], ), ( + 'test with a higher configured required value', ['masters'], 100, # set a higher threshold [{ @@ -122,6 +131,7 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib ['100.0 GB'], ), ( + 'test with 1GB available, but "0" GB space requirement', ['nodes'], 0, [{ @@ -131,6 +141,7 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib ['1.0 GB'], ), ( + 'test with no space available, but "0" GB space requirement', ['etcd'], 0, [{ @@ -140,16 +151,17 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib ['0.0 GB'], ), ( + 'test with enough space for a node, but not for a master', ['nodes', 'masters'], 0, [{ 'mount': '/', - # enough space for a node, not enough for a master 'size_available': 15 * 10**9 + 1, }], ['15.0 GB'], ), ( + 'test failure with enough space on "/", but not enough on "/var"', ['etcd'], 0, [{ @@ -163,20 +175,73 @@ def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansib }], ['0.0 GB'], ), -]) -def test_fails_with_insufficient_disk_space(group_names, configured_min, ansible_mounts, extra_words): +], ids=lambda argval: argval[0]) +def test_fails_with_insufficient_disk_space(name, group_names, configured_min, ansible_mounts, expect_chunks): task_vars = dict( group_names=group_names, openshift_check_min_host_disk_gb=configured_min, ansible_mounts=ansible_mounts, ) - check = DiskAvailability(execute_module=fake_execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = DiskAvailability(fake_execute_module, task_vars).run() assert result['failed'] - for word in 'below recommended'.split() + extra_words: - assert word in result['msg'] + for chunk in 'below recommended'.split() + expect_chunks: + assert chunk in result.get('msg', '') + + +@pytest.mark.parametrize('name,group_names,context,ansible_mounts,failed,extra_words', [ + ( + 'test without enough space for master under "upgrade" context', + ['nodes', 'masters'], + "upgrade", + [{ + 'mount': '/', + 'size_available': 1 * 10**9 + 1, + 'size_total': 21 * 10**9 + 1, + }], + True, + ["1.0 GB"], + ), + ( + 'test with enough space for master under "upgrade" context', + ['nodes', 'masters'], + "upgrade", + [{ + 'mount': '/', + 'size_available': 10 * 10**9 + 1, + 'size_total': 21 * 10**9 + 1, + }], + False, + [], + ), + ( + 'test with not enough space for master, and non-upgrade context', + ['nodes', 'masters'], + "health", + [{ + 'mount': '/', + # not enough space for a master, + # "health" context should not lower requirement + 'size_available': 20 * 10**9 + 1, + }], + True, + ["20.0 GB", "below minimum"], + ), +], ids=lambda argval: argval[0]) +def test_min_required_space_changes_with_upgrade_context(name, group_names, context, ansible_mounts, failed, extra_words): + task_vars = dict( + r_openshift_health_checker_playbook_context=context, + group_names=group_names, + ansible_mounts=ansible_mounts, + ) + + check = DiskAvailability(fake_execute_module, task_vars) + result = check.run() + + assert result.get("failed", False) == failed + for word in extra_words: + assert word in result.get('msg', '') def fake_execute_module(*args): diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py index 0379cafb5..8d0a53df9 100644 --- a/roles/openshift_health_checker/test/docker_image_availability_test.py +++ b/roles/openshift_health_checker/test/docker_image_availability_test.py @@ -3,19 +3,25 @@ import pytest from openshift_checks.docker_image_availability import DockerImageAvailability -@pytest.mark.parametrize('deployment_type,is_active', [ - ("origin", True), - ("openshift-enterprise", True), - ("enterprise", False), - ("online", False), - ("invalid", False), - ("", False), +@pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [ + ("origin", True, [], True), + ("openshift-enterprise", True, [], True), + ("enterprise", True, [], False), + ("online", True, [], False), + ("invalid", True, [], False), + ("", True, [], False), + ("origin", False, [], False), + ("openshift-enterprise", False, [], False), + ("origin", False, ["nodes", "masters"], True), + ("openshift-enterprise", False, ["etcd"], False), ]) -def test_is_active(deployment_type, is_active): +def test_is_active(deployment_type, is_containerized, group_names, expect_active): task_vars = dict( + openshift=dict(common=dict(is_containerized=is_containerized)), openshift_deployment_type=deployment_type, + group_names=group_names, ) - assert DockerImageAvailability.is_active(task_vars=task_vars) == is_active + assert DockerImageAvailability(None, task_vars).is_active() == expect_active @pytest.mark.parametrize("is_containerized,is_atomic", [ @@ -25,18 +31,18 @@ def test_is_active(deployment_type, is_active): (False, True), ]) def test_all_images_available_locally(is_containerized, is_atomic): - def execute_module(module_name, args, task_vars): + def execute_module(module_name, module_args, *_): if module_name == "yum": return {"changed": True} assert module_name == "docker_image_facts" - assert 'name' in args - assert args['name'] + assert 'name' in module_args + assert module_args['name'] return { - 'images': [args['name']], + 'images': [module_args['name']], } - result = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict( + result = DockerImageAvailability(execute_module, task_vars=dict( openshift=dict( common=dict( service_type='origin', @@ -46,9 +52,9 @@ def test_all_images_available_locally(is_containerized, is_atomic): docker=dict(additional_registries=["docker.io"]), ), openshift_deployment_type='origin', - openshift_release='v3.4', openshift_image_tag='3.4', - )) + group_names=['nodes', 'masters'], + )).run() assert not result.get('failed', False) @@ -58,12 +64,12 @@ def test_all_images_available_locally(is_containerized, is_atomic): True, ]) def test_all_images_available_remotely(available_locally): - def execute_module(module_name, args, task_vars): + def execute_module(module_name, *_): if module_name == 'docker_image_facts': return {'images': [], 'failed': available_locally} return {'changed': False} - result = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict( + result = DockerImageAvailability(execute_module, task_vars=dict( openshift=dict( common=dict( service_type='origin', @@ -73,15 +79,15 @@ def test_all_images_available_remotely(available_locally): docker=dict(additional_registries=["docker.io", "registry.access.redhat.com"]), ), openshift_deployment_type='origin', - openshift_release='3.4', openshift_image_tag='v3.4', - )) + group_names=['nodes', 'masters'], + )).run() assert not result.get('failed', False) def test_all_images_unavailable(): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, *_): if module_name == "command": return { 'failed': True, @@ -91,8 +97,7 @@ def test_all_images_unavailable(): 'changed': False, } - check = DockerImageAvailability(execute_module=execute_module) - actual = check.run(tmp=None, task_vars=dict( + actual = DockerImageAvailability(execute_module, task_vars=dict( openshift=dict( common=dict( service_type='origin', @@ -102,9 +107,9 @@ def test_all_images_unavailable(): docker=dict(additional_registries=["docker.io"]), ), openshift_deployment_type="openshift-enterprise", - openshift_release=None, - openshift_image_tag='latest' - )) + openshift_image_tag='latest', + group_names=['nodes', 'masters'], + )).run() assert actual['failed'] assert "required Docker images are not available" in actual['msg'] @@ -121,7 +126,7 @@ def test_all_images_unavailable(): ), ]) def test_skopeo_update_failure(message, extra_words): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, *_): if module_name == "yum": return { "failed": True, @@ -131,7 +136,7 @@ def test_skopeo_update_failure(message, extra_words): return {'changed': False} - actual = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict( + actual = DockerImageAvailability(execute_module, task_vars=dict( openshift=dict( common=dict( service_type='origin', @@ -141,9 +146,9 @@ def test_skopeo_update_failure(message, extra_words): docker=dict(additional_registries=["unknown.io"]), ), openshift_deployment_type="openshift-enterprise", - openshift_release='', openshift_image_tag='', - )) + group_names=['nodes', 'masters'], + )).run() assert actual["failed"] for word in extra_words: @@ -156,12 +161,12 @@ def test_skopeo_update_failure(message, extra_words): ("openshift-enterprise", []), ]) def test_registry_availability(deployment_type, registries): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, *_): return { 'changed': False, } - actual = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict( + actual = DockerImageAvailability(execute_module, task_vars=dict( openshift=dict( common=dict( service_type='origin', @@ -171,8 +176,99 @@ def test_registry_availability(deployment_type, registries): docker=dict(additional_registries=registries), ), openshift_deployment_type=deployment_type, - openshift_release='', openshift_image_tag='', - )) + group_names=['nodes', 'masters'], + )).run() assert not actual.get("failed", False) + + +@pytest.mark.parametrize("deployment_type, is_containerized, groups, oreg_url, expected", [ + ( # standard set of stuff required on nodes + "origin", False, ['nodes'], None, + set([ + 'openshift/origin-pod:vtest', + 'openshift/origin-deployer:vtest', + 'openshift/origin-docker-registry:vtest', + 'openshift/origin-haproxy-router:vtest', + 'cockpit/kubernetes', # origin version of registry-console + ]) + ), + ( # set a different URL for images + "origin", False, ['nodes'], 'foo.io/openshift/origin-${component}:${version}', + set([ + 'foo.io/openshift/origin-pod:vtest', + 'foo.io/openshift/origin-deployer:vtest', + 'foo.io/openshift/origin-docker-registry:vtest', + 'foo.io/openshift/origin-haproxy-router:vtest', + 'cockpit/kubernetes', # AFAICS this is not built from the URL + ]) + ), + ( + "origin", True, ['nodes', 'masters', 'etcd'], None, + set([ + # images running on top of openshift + 'openshift/origin-pod:vtest', + 'openshift/origin-deployer:vtest', + 'openshift/origin-docker-registry:vtest', + 'openshift/origin-haproxy-router:vtest', + 'cockpit/kubernetes', + # containerized component images + 'openshift/origin:vtest', + 'openshift/node:vtest', + 'openshift/openvswitch:vtest', + 'registry.access.redhat.com/rhel7/etcd', + ]) + ), + ( # enterprise images + "openshift-enterprise", True, ['nodes'], 'foo.io/openshift3/ose-${component}:f13ac45', + set([ + 'foo.io/openshift3/ose-pod:f13ac45', + 'foo.io/openshift3/ose-deployer:f13ac45', + 'foo.io/openshift3/ose-docker-registry:f13ac45', + 'foo.io/openshift3/ose-haproxy-router:f13ac45', + # registry-console is not constructed/versioned the same as the others. + 'registry.access.redhat.com/openshift3/registry-console', + # containerized images aren't built from oreg_url + 'openshift3/node:vtest', + 'openshift3/openvswitch:vtest', + ]) + ), + ( + "openshift-enterprise", True, ['etcd', 'lb'], 'foo.io/openshift3/ose-${component}:f13ac45', + set([ + 'registry.access.redhat.com/rhel7/etcd', + # lb does not yet come in a containerized version + ]) + ), + +]) +def test_required_images(deployment_type, is_containerized, groups, oreg_url, expected): + task_vars = dict( + openshift=dict( + common=dict( + is_containerized=is_containerized, + is_atomic=False, + ), + ), + openshift_deployment_type=deployment_type, + group_names=groups, + oreg_url=oreg_url, + openshift_image_tag='vtest', + ) + + assert expected == DockerImageAvailability("DUMMY", task_vars).required_images() + + +def test_containerized_etcd(): + task_vars = dict( + openshift=dict( + common=dict( + is_containerized=True, + ), + ), + openshift_deployment_type="origin", + group_names=['etcd'], + ) + expected = set(['registry.access.redhat.com/rhel7/etcd']) + assert expected == DockerImageAvailability("DUMMY", task_vars).required_images() diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py new file mode 100644 index 000000000..e0dccc062 --- /dev/null +++ b/roles/openshift_health_checker/test/docker_storage_test.py @@ -0,0 +1,305 @@ +import pytest + +from openshift_checks import OpenShiftCheckException +from openshift_checks.docker_storage import DockerStorage + + +@pytest.mark.parametrize('is_containerized, group_names, is_active', [ + (False, ["masters", "etcd"], False), + (False, ["masters", "nodes"], True), + (True, ["etcd"], True), +]) +def test_is_active(is_containerized, group_names, is_active): + task_vars = dict( + openshift=dict(common=dict(is_containerized=is_containerized)), + group_names=group_names, + ) + assert DockerStorage(None, task_vars).is_active() == is_active + + +def non_atomic_task_vars(): + return {"openshift": {"common": {"is_atomic": False}}} + + +@pytest.mark.parametrize('docker_info, failed, expect_msg', [ + ( + dict(failed=True, msg="Error connecting: Error while fetching server API version"), + True, + ["Is docker running on this host?"], + ), + ( + dict(msg="I have no info"), + True, + ["missing info"], + ), + ( + dict(info={ + "Driver": "devicemapper", + "DriverStatus": [("Pool Name", "docker-docker--pool")], + }), + False, + [], + ), + ( + dict(info={ + "Driver": "devicemapper", + "DriverStatus": [("Data loop file", "true")], + }), + True, + ["loopback devices with the Docker devicemapper storage driver"], + ), + ( + dict(info={ + "Driver": "overlay2", + "DriverStatus": [("Backing Filesystem", "xfs")], + }), + False, + [], + ), + ( + dict(info={ + "Driver": "overlay", + "DriverStatus": [("Backing Filesystem", "btrfs")], + }), + True, + ["storage is type 'btrfs'", "only supported with\n'xfs'"], + ), + ( + dict(info={ + "Driver": "overlay2", + "DriverStatus": [("Backing Filesystem", "xfs")], + "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)", + "KernelVersion": "3.10.0-327.22.2.el7.x86_64", + }), + True, + ["Docker reports kernel version 3.10.0-327"], + ), + ( + dict(info={ + "Driver": "overlay", + "DriverStatus": [("Backing Filesystem", "xfs")], + "OperatingSystem": "CentOS", + "KernelVersion": "3.10.0-514", + }), + False, + [], + ), + ( + dict(info={ + "Driver": "unsupported", + }), + True, + ["unsupported Docker storage driver"], + ), +]) +def test_check_storage_driver(docker_info, failed, expect_msg): + def execute_module(module_name, *_): + if module_name == "yum": + return {} + if module_name != "docker_info": + raise ValueError("not expecting module " + module_name) + return docker_info + + check = DockerStorage(execute_module, non_atomic_task_vars()) + check.check_dm_usage = lambda status: dict() # stub out for this test + check.check_overlay_usage = lambda info: dict() # stub out for this test + result = check.run() + + if failed: + assert result["failed"] + else: + assert not result.get("failed", False) + + for word in expect_msg: + assert word in result["msg"] + + +enough_space = { + "Pool Name": "docker--vg-docker--pool", + "Data Space Used": "19.92 MB", + "Data Space Total": "8.535 GB", + "Metadata Space Used": "40.96 kB", + "Metadata Space Total": "25.17 MB", +} + +not_enough_space = { + "Pool Name": "docker--vg-docker--pool", + "Data Space Used": "10 GB", + "Data Space Total": "10 GB", + "Metadata Space Used": "42 kB", + "Metadata Space Total": "43 kB", +} + + +@pytest.mark.parametrize('task_vars, driver_status, vg_free, success, expect_msg', [ + ( + {"max_thinpool_data_usage_percent": "not a float"}, + enough_space, + "12g", + False, + ["is not a percentage"], + ), + ( + {}, + {}, # empty values from driver status + "bogus", # also does not parse as bytes + False, + ["Could not interpret", "as bytes"], + ), + ( + {}, + enough_space, + "12.00g", + True, + [], + ), + ( + {}, + not_enough_space, + "0.00", + False, + ["data usage", "metadata usage", "higher than threshold"], + ), +]) +def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg): + check = DockerStorage(None, task_vars) + check.get_vg_free = lambda pool: vg_free + result = check.check_dm_usage(driver_status) + result_success = not result.get("failed") + + assert result_success is success + for msg in expect_msg: + assert msg in result["msg"] + + +@pytest.mark.parametrize('pool, command_returns, raises, returns', [ + ( + "foo-bar", + { # vgs missing + "msg": "[Errno 2] No such file or directory", + "failed": True, + "cmd": "/sbin/vgs", + "rc": 2, + }, + "Failed to run /sbin/vgs", + None, + ), + ( + "foo", # no hyphen in name - should not happen + {}, + "name does not have the expected format", + None, + ), + ( + "foo-bar", + dict(stdout=" 4.00g\n"), + None, + "4.00g", + ), + ( + "foo-bar", + dict(stdout="\n"), # no matching VG + "vgs did not find this VG", + None, + ) +]) +def test_vg_free(pool, command_returns, raises, returns): + def execute_module(module_name, *_): + if module_name != "command": + raise ValueError("not expecting module " + module_name) + return command_returns + + check = DockerStorage(execute_module) + if raises: + with pytest.raises(OpenShiftCheckException) as err: + check.get_vg_free(pool) + assert raises in str(err.value) + else: + ret = check.get_vg_free(pool) + assert ret == returns + + +@pytest.mark.parametrize('string, expect_bytes', [ + ("12", 12.0), + ("12 k", 12.0 * 1024), + ("42.42 MB", 42.42 * 1024**2), + ("12g", 12.0 * 1024**3), +]) +def test_convert_to_bytes(string, expect_bytes): + got = DockerStorage.convert_to_bytes(string) + assert got == expect_bytes + + +@pytest.mark.parametrize('string', [ + "bork", + "42 Qs", +]) +def test_convert_to_bytes_error(string): + with pytest.raises(ValueError) as err: + DockerStorage.convert_to_bytes(string) + assert "Cannot convert" in str(err.value) + assert string in str(err.value) + + +ansible_mounts_enough = [{ + 'mount': '/var/lib/docker', + 'size_available': 50 * 10**9, + 'size_total': 50 * 10**9, +}] +ansible_mounts_not_enough = [{ + 'mount': '/var/lib/docker', + 'size_available': 0, + 'size_total': 50 * 10**9, +}] +ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')] +ansible_mounts_zero_size = [{ + 'mount': '/var/lib/docker', + 'size_available': 0, + 'size_total': 0, +}] + + +@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [ + ( + ansible_mounts_enough, + None, + False, + [], + ), + ( + ansible_mounts_not_enough, + None, + True, + ["usage percentage", "higher than threshold"], + ), + ( + ansible_mounts_not_enough, + "bogus percent", + True, + ["is not a percentage"], + ), + ( + ansible_mounts_missing_fields, + None, + True, + ["Ansible bug"], + ), + ( + ansible_mounts_zero_size, + None, + True, + ["Ansible bug"], + ), +]) +def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg): + task_vars = non_atomic_task_vars() + task_vars["ansible_mounts"] = ansible_mounts + if threshold is not None: + task_vars["max_overlay_usage_percent"] = threshold + check = DockerStorage(None, task_vars) + docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay") + result = check.check_overlay_usage(docker_info) + + assert expect_fail == bool(result.get("failed")) + for msg in expect_msg: + assert msg in result["msg"] diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py index b9d375d8c..09bacd9ac 100644 --- a/roles/openshift_health_checker/test/elasticsearch_test.py +++ b/roles/openshift_health_checker/test/elasticsearch_test.py @@ -1,25 +1,26 @@ import pytest import json -from openshift_checks.logging.elasticsearch import Elasticsearch +from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList + task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -def canned_elasticsearch(exec_oc=None): - """Create an Elasticsearch check object with canned exec_oc method""" - check = Elasticsearch("dummy") # fails if a module is actually invoked +def canned_elasticsearch(task_vars=None, exec_oc=None): + """Create an Elasticsearch check object with stubbed exec_oc method""" + check = Elasticsearch(None, task_vars or {}) if exec_oc: - check._exec_oc = exec_oc + check.exec_oc = exec_oc return check -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) + + +def pods_by_name(pods): + return {pod['metadata']['name']: pod for pod in pods} plain_es_pod = { @@ -27,6 +28,7 @@ plain_es_pod = { "labels": {"component": "es", "deploymentconfig": "logging-es"}, "name": "logging-es", }, + "spec": {}, "status": { "conditions": [{"status": "True", "type": "Ready"}], "containerStatuses": [{"ready": True}], @@ -40,6 +42,7 @@ split_es_pod = { "labels": {"component": "es", "deploymentconfig": "logging-es-2"}, "name": "logging-es-2", }, + "spec": {}, "status": { "conditions": [{"status": "True", "type": "Ready"}], "containerStatuses": [{"ready": True}], @@ -48,12 +51,28 @@ split_es_pod = { "_test_master_name_str": "name logging-es-2", } +unready_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-3"}, + "name": "logging-es-3", + }, + "spec": {}, + "status": { + "conditions": [{"status": "False", "type": "Ready"}], + "containerStatuses": [{"ready": False}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "BAD_NAME_RESPONSE", +} + def test_check_elasticsearch(): - assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([], {}) + with pytest.raises(OpenShiftCheckExceptionList) as excinfo: + canned_elasticsearch().check_elasticsearch([]) + assert_error_in_list('NoRunningPods', excinfo.value) # canned oc responses to match so all the checks pass - def _exec_oc(cmd, args, task_vars): + def exec_oc(cmd, args): if '_cat/master' in cmd: return 'name logging-es' elif '/_nodes' in cmd: @@ -65,33 +84,41 @@ def test_check_elasticsearch(): else: raise Exception(cmd) - assert not canned_elasticsearch(_exec_oc).check_elasticsearch([plain_es_pod], {}) + check = canned_elasticsearch({}, exec_oc) + check.get_pods_for_component = lambda *_: [plain_es_pod] + assert {} == check.run() -def pods_by_name(pods): - return {pod['metadata']['name']: pod for pod in pods} +def test_check_running_es_pods(): + pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod]) + assert plain_es_pod in pods + assert_error_in_list('PodNotRunning', errors) + + +def test_check_elasticsearch_masters(): + pods = [plain_es_pod] + check = canned_elasticsearch(task_vars_config_base, lambda *_: plain_es_pod['_test_master_name_str']) + assert not check.check_elasticsearch_masters(pods_by_name(pods)) @pytest.mark.parametrize('pods, expect_error', [ ( [], - 'No logging Elasticsearch masters', + 'NoMasterFound', ), ( - [plain_es_pod], - None, + [unready_es_pod], + 'NoMasterName', ), ( [plain_es_pod, split_es_pod], - 'Found multiple Elasticsearch masters', + 'SplitBrainMasters', ), ]) -def test_check_elasticsearch_masters(pods, expect_error): +def test_check_elasticsearch_masters_error(pods, expect_error): test_pods = list(pods) - check = canned_elasticsearch(lambda cmd, args, task_vars: test_pods.pop(0)['_test_master_name_str']) - - errors = check._check_elasticsearch_masters(pods_by_name(pods), task_vars_config_base) - assert_error(''.join(errors), expect_error) + check = canned_elasticsearch(task_vars_config_base, lambda *_: test_pods.pop(0)['_test_master_name_str']) + assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods))) es_node_list = { @@ -101,80 +128,76 @@ es_node_list = { }}} +def test_check_elasticsearch_node_list(): + check = canned_elasticsearch(task_vars_config_base, lambda *_: json.dumps(es_node_list)) + assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod])) + + @pytest.mark.parametrize('pods, node_list, expect_error', [ ( [], {}, - 'No logging Elasticsearch masters', - ), - ( - [plain_es_pod], - es_node_list, - None, + 'MissingComponentPods', ), ( [plain_es_pod], {}, # empty list of nodes triggers KeyError - "Failed to query", + 'MissingNodeList', ), ( [split_es_pod], es_node_list, - 'does not correspond to any known ES pod', + 'EsPodNodeMismatch', ), ]) -def test_check_elasticsearch_node_list(pods, node_list, expect_error): - check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(node_list)) +def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error): + check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: json.dumps(node_list)) + assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods))) - errors = check._check_elasticsearch_node_list(pods_by_name(pods), task_vars_config_base) - assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_cluster_health(): + test_health_data = [{"status": "green"}] + check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) + assert not check.check_es_cluster_health(pods_by_name([plain_es_pod])) @pytest.mark.parametrize('pods, health_data, expect_error', [ ( [plain_es_pod], - [{"status": "green"}], - None, - ), - ( - [plain_es_pod], [{"no-status": "should bomb"}], - 'Could not retrieve cluster health status', + 'BadEsResponse', ), ( [plain_es_pod, split_es_pod], [{"status": "green"}, {"status": "red"}], - 'Elasticsearch cluster health status is RED', + 'EsClusterHealthRed', ), ]) -def test_check_elasticsearch_cluster_health(pods, health_data, expect_error): +def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error): test_health_data = list(health_data) - check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(test_health_data.pop(0))) + check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) + assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods))) - errors = check._check_es_cluster_health(pods_by_name(pods), task_vars_config_base) - assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_diskspace(): + check = canned_elasticsearch(exec_oc=lambda *_: 'IUse% Use%\n 3% 4%\n') + assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) @pytest.mark.parametrize('disk_data, expect_error', [ ( 'df: /elasticsearch/persistent: No such file or directory\n', - 'Could not retrieve storage usage', - ), - ( - 'IUse% Use%\n 3% 4%\n', - None, + 'BadDfResponse', ), ( 'IUse% Use%\n 95% 40%\n', - 'Inode percent usage on the storage volume', + 'InodeUsageTooHigh', ), ( 'IUse% Use%\n 3% 94%\n', - 'Disk percent usage on the storage volume', + 'DiskUsageTooHigh', ), ]) -def test_check_elasticsearch_diskspace(disk_data, expect_error): - check = canned_elasticsearch(lambda cmd, args, task_vars: disk_data) - - errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod]), task_vars_config_base) - assert_error(''.join(errors), expect_error) +def test_check_elasticsearch_diskspace_errors(disk_data, expect_error): + check = canned_elasticsearch(exec_oc=lambda *_: disk_data) + assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))) diff --git a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py index df9d52d41..d3aae98f2 100644 --- a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py +++ b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py @@ -1,7 +1,8 @@ import pytest from collections import namedtuple -from openshift_checks.etcd_imagedata_size import EtcdImageDataSize, OpenShiftCheckException +from openshift_checks.etcd_imagedata_size import EtcdImageDataSize +from openshift_checks import OpenShiftCheckException from etcdkeysize import check_etcd_key_size @@ -51,12 +52,12 @@ def test_cannot_determine_available_mountpath(ansible_mounts, extra_words): task_vars = dict( ansible_mounts=ansible_mounts, ) - check = EtcdImageDataSize(execute_module=fake_execute_module) + check = EtcdImageDataSize(fake_execute_module, task_vars) with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + check.run() - for word in 'determine valid etcd mountpath'.split() + extra_words: + for word in ['Unable to determine mount point'] + extra_words: assert word in str(excinfo.value) @@ -111,14 +112,14 @@ def test_cannot_determine_available_mountpath(ansible_mounts, extra_words): ) ]) def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size_limit, should_fail, extra_words): - def execute_module(module_name, args, tmp=None, task_vars=None): + def execute_module(module_name, module_args, *_): if module_name != "etcdkeysize": return { "changed": False, } client = fake_etcd_client(tree) - s, limit_exceeded = check_etcd_key_size(client, tree["key"], args["size_limit_bytes"]) + s, limit_exceeded = check_etcd_key_size(client, tree["key"], module_args["size_limit_bytes"]) return {"size_limit_exceeded": limit_exceeded} @@ -133,7 +134,7 @@ def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size if size_limit is None: task_vars.pop("etcd_max_image_data_size_bytes") - check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars) + check = EtcdImageDataSize(execute_module, task_vars).run() if should_fail: assert check["failed"] @@ -267,14 +268,14 @@ def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size ), ]) def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_path, expected_size, extra_words): - def execute_module(module_name, args, tmp=None, task_vars=None): + def execute_module(module_name, module_args, *_): if module_name != "etcdkeysize": return { "changed": False, } client = fake_etcd_client(tree) - size, limit_exceeded = check_etcd_key_size(client, root_path, args["size_limit_bytes"]) + size, limit_exceeded = check_etcd_key_size(client, root_path, module_args["size_limit_bytes"]) assert size == expected_size return { @@ -289,12 +290,12 @@ def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_ ) ) - check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars) + check = EtcdImageDataSize(execute_module, task_vars).run() assert not check.get("failed", False) def test_etcdkeysize_module_failure(): - def execute_module(module_name, tmp=None, task_vars=None): + def execute_module(module_name, *_): if module_name != "etcdkeysize": return { "changed": False, @@ -317,7 +318,7 @@ def test_etcdkeysize_module_failure(): ) ) - check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars) + check = EtcdImageDataSize(execute_module, task_vars).run() assert check["failed"] for word in "Failed to retrieve stats": diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py new file mode 100644 index 000000000..fae3e578d --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_traffic_test.py @@ -0,0 +1,72 @@ +import pytest + +from openshift_checks.etcd_traffic import EtcdTraffic + + +@pytest.mark.parametrize('group_names,version,is_active', [ + (['masters'], "3.5", False), + (['masters'], "3.6", False), + (['nodes'], "3.4", False), + (['etcd'], "3.4", True), + (['etcd'], "1.5", True), + (['etcd'], "3.1", False), + (['masters', 'nodes'], "3.5", False), + (['masters', 'etcd'], "3.5", True), + ([], "3.4", False), +]) +def test_is_active(group_names, version, is_active): + task_vars = dict( + group_names=group_names, + openshift_image_tag=version, + ) + assert EtcdTraffic(task_vars=task_vars).is_active() == is_active + + +@pytest.mark.parametrize('group_names,matched,failed,extra_words', [ + (["masters"], True, True, ["Higher than normal", "traffic"]), + (["masters", "etcd"], False, False, []), + (["etcd"], False, False, []), +]) +def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words): + def execute_module(module_name, *_): + return { + "matched": matched, + "failed": failed, + } + + task_vars = dict( + group_names=group_names, + openshift=dict( + common=dict(service_type="origin", is_containerized=False), + ) + ) + + result = EtcdTraffic(execute_module, task_vars).run() + + for word in extra_words: + assert word in result.get("msg", "") + + assert result.get("failed", False) == failed + + +@pytest.mark.parametrize('is_containerized,expected_unit_value', [ + (False, "etcd"), + (True, "etcd_container"), +]) +def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value): + task_vars = dict( + openshift=dict( + common=dict(is_containerized=is_containerized), + ) + ) + + def execute_module(module_name, args, *_): + assert module_name == "search_journalctl" + matchers = args["log_matchers"] + + for matcher in matchers: + assert matcher["unit"] == expected_unit_value + + return {"failed": False} + + EtcdTraffic(execute_module, task_vars).run() diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py index 917045526..077cea3ea 100644 --- a/roles/openshift_health_checker/test/etcd_volume_test.py +++ b/roles/openshift_health_checker/test/etcd_volume_test.py @@ -1,6 +1,7 @@ import pytest -from openshift_checks.etcd_volume import EtcdVolume, OpenShiftCheckException +from openshift_checks.etcd_volume import EtcdVolume +from openshift_checks import OpenShiftCheckException @pytest.mark.parametrize('ansible_mounts,extra_words', [ @@ -11,12 +12,11 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words): task_vars = dict( ansible_mounts=ansible_mounts, ) - check = EtcdVolume(execute_module=fake_execute_module) with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + EtcdVolume(fake_execute_module, task_vars).run() - for word in 'Unable to find etcd storage mount point'.split() + extra_words: + for word in ['Unable to determine mount point'] + extra_words: assert word in str(excinfo.value) @@ -76,8 +76,7 @@ def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): if task_vars["etcd_device_usage_threshold_percent"] is None: task_vars.pop("etcd_device_usage_threshold_percent") - check = EtcdVolume(execute_module=fake_execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = EtcdVolume(fake_execute_module, task_vars).run() assert not result.get('failed', False) @@ -137,8 +136,7 @@ def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, if task_vars["etcd_device_usage_threshold_percent"] is None: task_vars.pop("etcd_device_usage_threshold_percent") - check = EtcdVolume(execute_module=fake_execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = EtcdVolume(fake_execute_module, task_vars).run() assert result['failed'] for word in extra_words: diff --git a/roles/openshift_health_checker/test/fluentd_config_test.py b/roles/openshift_health_checker/test/fluentd_config_test.py new file mode 100644 index 000000000..10db253bc --- /dev/null +++ b/roles/openshift_health_checker/test/fluentd_config_test.py @@ -0,0 +1,348 @@ +import pytest + +from openshift_checks.logging.fluentd_config import FluentdConfig, OpenShiftCheckException + + +def canned_fluentd_pod(containers): + return { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": { + "host": "node1", + "nodeName": "node1", + "containers": containers, + }, + "status": { + "phase": "Running", + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } + } + + +fluentd_pod = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": { + "host": "node1", + "nodeName": "node1", + "containers": [ + { + "name": "container1", + "env": [ + { + "name": "USE_JOURNAL", + "value": "true", + } + ], + } + ], + }, + "status": { + "phase": "Running", + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +not_running_fluentd_pod = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-2", + }, + "status": { + "phase": "Unknown", + "containerStatuses": [{"ready": True}, {"ready": False}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + + +@pytest.mark.parametrize('name, use_journald, logging_driver, extra_words', [ + ( + 'test success with use_journald=false, and docker config set to use "json-file"', + False, + "json-file", + [], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_non_master(name, use_journald, logging_driver, extra_words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["nodes", "etcd"], + openshift_logging_fluentd_use_journal=use_journald, + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + error = check.check_logging_config() + + assert error is None + + +@pytest.mark.parametrize('name, use_journald, logging_driver, words', [ + ( + 'test failure with use_journald=false, but docker config set to use "journald"', + False, + "journald", + ['json log files', 'has been set to use "journald"'], + ), + ( + 'test failure with use_journald=false, but docker config set to use an "unsupported" driver', + False, + "unsupported", + ["json log files", 'has been set to use "unsupported"'], + ), + ( + 'test failure with use_journald=true, but docker config set to use "json-file"', + True, + "json-file", + ['logs from "journald"', 'has been set to use "json-file"'], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_non_master_failed(name, use_journald, logging_driver, words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["nodes", "etcd"], + openshift_logging_fluentd_use_journal=use_journald, + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + error = check.check_logging_config() + + assert error is not None + for word in words: + assert word in error + + +@pytest.mark.parametrize('name, pods, logging_driver, extra_words', [ + # use_journald returns false (not using journald), but check succeeds + # since docker is set to use json-file + ( + 'test success with use_journald=false, and docker config set to use default driver "json-file"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "USE_JOURNAL", + "value": "false", + }], + }, + ] + )], + "json-file", + [], + ), + ( + 'test success with USE_JOURNAL env var missing and docker config set to use default driver "json-file"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "RANDOM", + "value": "value", + }], + }, + ] + )], + "json-file", + [], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_master(name, pods, logging_driver, extra_words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["masters"], + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + check.get_pods_for_component = lambda _: pods + error = check.check_logging_config() + + assert error is None + + +@pytest.mark.parametrize('name, pods, logging_driver, words', [ + ( + 'test failure with use_journald=false, but docker config set to use "journald"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "USE_JOURNAL", + "value": "false", + }], + }, + ] + )], + "journald", + ['json log files', 'has been set to use "journald"'], + ), + ( + 'test failure with use_journald=true, but docker config set to use "json-file"', + [fluentd_pod], + "json-file", + ['logs from "journald"', 'has been set to use "json-file"'], + ), + ( + 'test failure with use_journald=false, but docker set to use an "unsupported" driver', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "USE_JOURNAL", + "value": "false", + }], + }, + ] + )], + "unsupported", + ["json log files", 'has been set to use "unsupported"'], + ), + ( + 'test failure with USE_JOURNAL env var missing and docker config set to use "journald"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "RANDOM", + "value": "value", + }], + }, + ] + )], + "journald", + ["configuration is set to", "json log files"], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_master_failed(name, pods, logging_driver, words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["masters"], + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + check.get_pods_for_component = lambda _: pods + error = check.check_logging_config() + + assert error is not None + for word in words: + assert word in error + + +@pytest.mark.parametrize('name, pods, response, logging_driver, extra_words', [ + ( + 'test OpenShiftCheckException with no running containers', + [canned_fluentd_pod([])], + { + "failed": True, + "result": "unexpected", + }, + "json-file", + ['no running containers'], + ), + ( + 'test OpenShiftCheckException one container and no env vars set', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [], + }, + ] + )], + { + "failed": True, + "result": "unexpected", + }, + "json-file", + ['no environment variables'], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_master_fails_on_unscheduled_deployment(name, pods, response, logging_driver, extra_words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["masters"], + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.get_pods_for_component = lambda _: pods + + with pytest.raises(OpenShiftCheckException) as error: + check.check_logging_config() + + assert error is not None + for word in extra_words: + assert word in str(error) diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py index d151c0b19..e7bf9818b 100644 --- a/roles/openshift_health_checker/test/fluentd_test.py +++ b/roles/openshift_health_checker/test/fluentd_test.py @@ -1,23 +1,11 @@ import pytest import json -from openshift_checks.logging.fluentd import Fluentd +from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException -def canned_fluentd(exec_oc=None): - """Create a Fluentd check object with canned exec_oc method""" - check = Fluentd("dummy") # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) fluentd_pod_node1 = { @@ -65,45 +53,60 @@ fluentd_node3_unlabeled = { } +def test_get_fluentd_pods(): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1])) + check.get_pods_for_component = lambda *_: [fluentd_pod_node1] + assert not check.run() + + @pytest.mark.parametrize('pods, nodes, expect_error', [ ( [], [], - 'No nodes appear to be defined', + 'NoNodesDefined', ), ( [], [fluentd_node3_unlabeled], - 'There are no nodes with the fluentd label', + 'NoNodesLabeled', ), ( [], [fluentd_node1, fluentd_node3_unlabeled], - 'Fluentd will not aggregate logs from these nodes.', + 'NodesUnlabeled', ), ( [], [fluentd_node2], - "nodes are supposed to have a Fluentd pod but do not", + 'MissingFluentdPod', ), ( [fluentd_pod_node1, fluentd_pod_node1], [fluentd_node1], - 'more Fluentd pods running than nodes labeled', + 'TooManyFluentdPods', ), ( [fluentd_pod_node2_down], [fluentd_node2], - "Fluentd pods are supposed to be running", - ), - ( - [fluentd_pod_node1], - [fluentd_node1], - None, + 'FluentdNotRunning', ), ]) -def test_get_fluentd_pods(pods, nodes, expect_error): - check = canned_fluentd(lambda cmd, args, task_vars: json.dumps(dict(items=nodes))) +def test_get_fluentd_pods_errors(pods, nodes, expect_error): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=nodes)) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_fluentd(pods) + if isinstance(excinfo.value, OpenShiftCheckExceptionList): + assert_error_in_list(expect_error, excinfo.value) + else: + assert expect_error == excinfo.value.name + - error = check.check_fluentd(pods, {}) - assert_error(error, expect_error) +def test_bad_oc_node_list(): + check = Fluentd() + check.exec_oc = lambda *_: "this isn't even json" + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_nodes_by_name() + assert 'BadOcNodeList' == excinfo.value.name diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py index 19140a1b6..04a5e89c4 100644 --- a/roles/openshift_health_checker/test/kibana_test.py +++ b/roles/openshift_health_checker/test/kibana_test.py @@ -8,23 +8,7 @@ except ImportError: from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks.logging.kibana import Kibana - - -def canned_kibana(exec_oc=None): - """Create a Kibana check object with canned exec_oc method""" - check = Kibana("dummy") # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException plain_kibana_pod = { @@ -49,39 +33,45 @@ not_running_kibana_pod = { } +def test_check_kibana(): + # should run without exception: + Kibana().check_kibana([plain_kibana_pod]) + + @pytest.mark.parametrize('pods, expect_error', [ ( [], - "There are no Kibana pods deployed", - ), - ( - [plain_kibana_pod], - None, + "MissingComponentPods", ), ( [not_running_kibana_pod], - "No Kibana pod is in a running state", + "NoRunningPods", ), ( [plain_kibana_pod, not_running_kibana_pod], - "The following Kibana pods are not currently in a running state", + "PodNotRunning", ), ]) -def test_check_kibana(pods, expect_error): - check = canned_kibana() - error = check.check_kibana(pods) - assert_error(error, expect_error) +def test_check_kibana_error(pods, expect_error): + with pytest.raises(OpenShiftCheckException) as excinfo: + Kibana().check_kibana(pods) + assert expect_error == excinfo.value.name -@pytest.mark.parametrize('route, expect_url, expect_error', [ +@pytest.mark.parametrize('comment, route, expect_error', [ ( + "No route returned", None, - None, - 'no_route_exists', + "no_route_exists", ), - # test route with no ingress ( + "broken route response", + {"status": {}}, + "get_route_failed", + ), + ( + "route with no ingress", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -94,12 +84,11 @@ def test_check_kibana(pods, expect_error): "host": "hostname", } }, - None, - 'route_not_accepted', + "route_not_accepted", ), - # test route with no host ( + "route with no host", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -112,12 +101,21 @@ def test_check_kibana(pods, expect_error): }, "spec": {}, }, - None, - 'route_missing_host', + "route_missing_host", ), +]) +def test_get_kibana_url_error(comment, route, expect_error): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) if route else "" - # test route that looks fine + with pytest.raises(OpenShiftCheckException) as excinfo: + check._get_kibana_url() + assert excinfo.value.name == expect_error + + +@pytest.mark.parametrize('comment, route, expect_url', [ ( + "test route that looks fine", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -133,61 +131,57 @@ def test_check_kibana(pods, expect_error): }, }, "https://hostname/", - None, ), ]) -def test_get_kibana_url(route, expect_url, expect_error): - check = canned_kibana(lambda cmd, args, task_vars: json.dumps(route) if route else "") - - url, error = check._get_kibana_url({}) - if expect_url: - assert url == expect_url - else: - assert not url - if expect_error: - assert error == expect_error - else: - assert not error +def test_get_kibana_url(comment, route, expect_url): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) + assert expect_url == check._get_kibana_url() @pytest.mark.parametrize('exec_result, expect', [ ( 'urlopen error [Errno 111] Connection refused', - 'at least one router routing to it?', + 'FailedToConnectInternal', ), ( 'urlopen error [Errno -2] Name or service not known', - 'DNS configured for the Kibana hostname?', + 'FailedToResolveInternal', ), ( 'Status code was not [302]: HTTP Error 500: Server error', - 'did not return the correct status code', + 'WrongReturnCodeInternal', ), ( 'bork bork bork', - 'bork bork bork', # should pass through + 'MiscRouteErrorInternal', ), ]) def test_verify_url_internal_failure(exec_result, expect): - check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result)) - check._get_kibana_url = lambda task_vars: ('url', None) + check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result)) + check._get_kibana_url = lambda: 'url' - error = check._check_kibana_route({}) - assert_error(error, expect) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name @pytest.mark.parametrize('lib_result, expect', [ ( - HTTPError('url', 500, "it broke", hdrs=None, fp=None), - 'it broke', + HTTPError('url', 500, 'it broke', hdrs=None, fp=None), + 'MiscRouteError', ), ( - URLError('it broke'), - 'it broke', + URLError('urlopen error [Errno 111] Connection refused'), + 'FailedToConnect', + ), + ( + URLError('urlopen error [Errno -2] Name or service not known'), + 'FailedToResolve', ), ( 302, - 'returned the wrong error code', + 'WrongReturnCode', ), ( 200, @@ -210,9 +204,41 @@ def test_verify_url_external_failure(lib_result, expect, monkeypatch): raise lib_result monkeypatch.setattr(urllib2, 'urlopen', urlopen) - check = canned_kibana() - check._get_kibana_url = lambda task_vars: ('url', None) - check._verify_url_internal = lambda url, task_vars: None + check = Kibana() + check._get_kibana_url = lambda: 'url' + check._verify_url_internal = lambda url: None + + if not expect: + check.check_kibana_route() + return + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name + + +def test_verify_url_external_skip(): + check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false")) + check._get_kibana_url = lambda: 'url' + check.check_kibana_route() + + +# this is kind of silly but it adds coverage for the run() method... +def test_run(): + pods = ["foo"] + ran = dict(check_kibana=False, check_route=False) + + def check_kibana(pod_list): + ran["check_kibana"] = True + assert pod_list == pods + + def check_kibana_route(): + ran["check_route"] = True + + check = Kibana() + check.get_pods_for_component = lambda *_: pods + check.check_kibana = check_kibana + check.check_kibana_route = check_kibana_route - error = check._check_kibana_route({}) - assert_error(error, expect) + check.run() + assert ran["check_kibana"] and ran["check_route"] diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py index b6db34fe3..1a1c190f6 100644 --- a/roles/openshift_health_checker/test/logging_check_test.py +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -1,18 +1,14 @@ import pytest import json -from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -logging_namespace = "logging" - - -def canned_loggingcheck(exec_oc=None): +def canned_loggingcheck(exec_oc=None, execute_module=None): """Create a LoggingCheck object with canned exec_oc method""" - check = LoggingCheck("dummy") # fails if a module is actually invoked - check.logging_namespace = 'logging' + check = LoggingCheck(execute_module) if exec_oc: check.exec_oc = exec_oc return check @@ -50,6 +46,16 @@ plain_kibana_pod = { } } +plain_kibana_pod_no_containerstatus = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + } +} + fluentd_pod_node1 = { "metadata": { "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, @@ -80,15 +86,15 @@ plain_curator_pod = { ("Permission denied", "Unexpected error using `oc`"), ]) def test_oc_failure(problem, expect): - def execute_module(module_name, args, task_vars): + def execute_module(module_name, *_): if module_name == "ocutil": return dict(failed=True, result=problem) return dict(changed=False) - check = LoggingCheck({}) + check = LoggingCheck(execute_module, task_vars_config_base) - with pytest.raises(OpenShiftCheckException) as excinfo: - check.exec_oc(execute_module, logging_namespace, 'get foo', [], task_vars=task_vars_config_base) + with pytest.raises(CouldNotUseOc) as excinfo: + check.exec_oc('get foo', []) assert expect in str(excinfo) @@ -111,27 +117,52 @@ def test_is_active(groups, logging_deployed, is_active): openshift_hosted_logging_deploy=logging_deployed, ) - assert LoggingCheck.is_active(task_vars=task_vars) == is_active + assert LoggingCheck(None, task_vars).is_active() == is_active -@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [ +@pytest.mark.parametrize('pod_output, expect_pods', [ + ( + json.dumps({'items': [plain_es_pod]}), + [plain_es_pod], + ), +]) +def test_get_pods_for_component(pod_output, expect_pods): + check = canned_loggingcheck(lambda *_: pod_output) + pods = check.get_pods_for_component("es") + assert pods == expect_pods + + +@pytest.mark.parametrize('exec_oc_output, expect_error', [ ( 'No resources found.', - None, - 'There are no pods in the logging namespace', + MissingComponentPods, ), ( - json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}), - [plain_es_pod], - None, + '{"items": null}', + MissingComponentPods, ), ]) -def test_get_pods_for_component(pod_output, expect_pods, expect_error): - check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: pod_output) - pods, error = check.get_pods_for_component( - lambda name, args, task_vars: {}, - logging_namespace, - "es", - {} - ) - assert_error(error, expect_error) +def test_get_pods_for_component_fail(exec_oc_output, expect_error): + check = canned_loggingcheck(lambda *_: exec_oc_output) + with pytest.raises(expect_error): + check.get_pods_for_component("es") + + +@pytest.mark.parametrize('name, pods, expected_pods', [ + ( + 'test single pod found, scheduled, but no containerStatuses field', + [plain_kibana_pod_no_containerstatus], + [plain_kibana_pod_no_containerstatus], + ), + ( + 'set of pods has at least one pod with containerStatuses (scheduled); should still fail', + [plain_kibana_pod_no_containerstatus, plain_kibana_pod], + [plain_kibana_pod_no_containerstatus], + ), + +], ids=lambda argvals: argvals[0]) +def test_get_not_running_pods_no_container_status(name, pods, expected_pods): + check = canned_loggingcheck(lambda *_: '') + result = check.not_running_pods(pods) + + assert result == expected_pods diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py new file mode 100644 index 000000000..22566b295 --- /dev/null +++ b/roles/openshift_health_checker/test/logging_index_time_test.py @@ -0,0 +1,170 @@ +import json + +import pytest + +from openshift_checks.logging.logging_index_time import LoggingIndexTime, OpenShiftCheckException + + +SAMPLE_UUID = "unique-test-uuid" + + +def canned_loggingindextime(exec_oc=None): + """Create a check object with a canned exec_oc method""" + check = LoggingIndexTime() # fails if a module is actually invoked + if exec_oc: + check.exec_oc = exec_oc + return check + + +plain_running_elasticsearch_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-data-master"}, + "name": "logging-es-data-master-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "phase": "Running", + } +} +plain_running_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "phase": "Running", + } +} +not_running_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-2", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": False}], + "conditions": [{"status": "True", "type": "Ready"}], + "phase": "pending", + } +} + + +@pytest.mark.parametrize('pods, expect_pods', [ + ( + [not_running_kibana_pod], + [], + ), + ( + [plain_running_kibana_pod], + [plain_running_kibana_pod], + ), + ( + [], + [], + ) +]) +def test_check_running_pods(pods, expect_pods): + check = canned_loggingindextime() + pods = check.running_pods(pods) + assert pods == expect_pods + + +def test_bad_config_param(): + with pytest.raises(OpenShiftCheckException) as error: + LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run() + assert 'InvalidTimeout' == error.value.name + + +def test_no_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [not_running_kibana_pod] + with pytest.raises(OpenShiftCheckException) as error: + check.run() + assert 'kibanaNoRunningPods' == error.value.name + + +def test_with_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod] + check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID + check.wait_until_cmd_or_err = lambda *_: None + assert not check.run().get("failed") + + +@pytest.mark.parametrize('name, json_response, uuid, timeout', [ + ( + 'valid count in response', + { + "count": 1, + }, + SAMPLE_UUID, + 0.001, + ), +], ids=lambda argval: argval[0]) +def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout): + check = canned_loggingindextime(lambda *_: json.dumps(json_response)) + check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) + + +@pytest.mark.parametrize('name, json_response, timeout, expect_error', [ + ( + 'invalid json response', + { + "invalid_field": 1, + }, + 0.001, + 'esInvalidResponse', + ), + ( + 'empty response', + {}, + 0.001, + 'esInvalidResponse', + ), + ( + 'valid response but invalid match count', + { + "count": 0, + }, + 0.005, + 'NoMatchFound', + ) +], ids=lambda argval: argval[0]) +def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error): + check = canned_loggingindextime(lambda *_: json.dumps(json_response)) + with pytest.raises(OpenShiftCheckException) as error: + check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout) + + assert expect_error == error.value.name + + +def test_curl_kibana_with_uuid(): + check = canned_loggingindextime(lambda *_: json.dumps({"statusCode": 404})) + check.generate_uuid = lambda: SAMPLE_UUID + assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod) + + +@pytest.mark.parametrize('name, json_response, expect_error', [ + ( + 'invalid json response', + { + "invalid_field": "invalid", + }, + 'kibanaInvalidResponse', + ), + ( + 'wrong error code in response', + { + "statusCode": 500, + }, + 'kibanaInvalidReturnCode', + ), +], ids=lambda argval: argval[0]) +def test_failed_curl_kibana_with_uuid(name, json_response, expect_error): + check = canned_loggingindextime(lambda *_: json.dumps(json_response)) + check.generate_uuid = lambda: SAMPLE_UUID + + with pytest.raises(OpenShiftCheckException) as error: + check.curl_kibana_with_uuid(plain_running_kibana_pod) + + assert expect_error == error.value.name diff --git a/roles/openshift_health_checker/test/memory_availability_test.py b/roles/openshift_health_checker/test/memory_availability_test.py index 4fbaea0a9..aee2f0416 100644 --- a/roles/openshift_health_checker/test/memory_availability_test.py +++ b/roles/openshift_health_checker/test/memory_availability_test.py @@ -17,7 +17,7 @@ def test_is_active(group_names, is_active): task_vars = dict( group_names=group_names, ) - assert MemoryAvailability.is_active(task_vars=task_vars) == is_active + assert MemoryAvailability(None, task_vars).is_active() == is_active @pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb', [ @@ -59,8 +59,7 @@ def test_succeeds_with_recommended_memory(group_names, configured_min, ansible_m ansible_memtotal_mb=ansible_memtotal_mb, ) - check = MemoryAvailability(execute_module=fake_execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = MemoryAvailability(fake_execute_module, task_vars).run() assert not result.get('failed', False) @@ -117,8 +116,7 @@ def test_fails_with_insufficient_memory(group_names, configured_min, ansible_mem ansible_memtotal_mb=ansible_memtotal_mb, ) - check = MemoryAvailability(execute_module=fake_execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = MemoryAvailability(fake_execute_module, task_vars).run() assert result.get('failed', False) for word in 'below recommended'.split() + extra_words: diff --git a/roles/openshift_health_checker/test/mixins_test.py b/roles/openshift_health_checker/test/mixins_test.py index 2d83e207d..b1a41ca3c 100644 --- a/roles/openshift_health_checker/test/mixins_test.py +++ b/roles/openshift_health_checker/test/mixins_test.py @@ -14,10 +14,10 @@ class NotContainerizedCheck(NotContainerizedMixin, OpenShiftCheck): (dict(openshift=dict(common=dict(is_containerized=True))), False), ]) def test_is_active(task_vars, expected): - assert NotContainerizedCheck.is_active(task_vars) == expected + assert NotContainerizedCheck(None, task_vars).is_active() == expected def test_is_active_missing_task_vars(): with pytest.raises(OpenShiftCheckException) as excinfo: - NotContainerizedCheck.is_active(task_vars={}) + NotContainerizedCheck().is_active() assert 'is_containerized' in str(excinfo.value) diff --git a/roles/openshift_health_checker/test/openshift_check_test.py b/roles/openshift_health_checker/test/openshift_check_test.py index e3153979c..789784c77 100644 --- a/roles/openshift_health_checker/test/openshift_check_test.py +++ b/roles/openshift_health_checker/test/openshift_check_test.py @@ -1,7 +1,7 @@ import pytest from openshift_checks import OpenShiftCheck, OpenShiftCheckException -from openshift_checks import load_checks, get_var +from openshift_checks import load_checks # Fixtures @@ -28,34 +28,23 @@ def test_OpenShiftCheck_init(): name = "test_check" run = NotImplemented - # initialization requires at least one argument (apart from self) - with pytest.raises(TypeError) as excinfo: - TestCheck() + # execute_module required at init if it will be used + with pytest.raises(RuntimeError) as excinfo: + TestCheck().execute_module("foo") assert 'execute_module' in str(excinfo.value) - assert 'module_executor' in str(excinfo.value) execute_module = object() # initialize with positional argument check = TestCheck(execute_module) - # new recommended name - assert check.execute_module == execute_module - # deprecated attribute name - assert check.module_executor == execute_module + assert check._execute_module == execute_module - # initialize with keyword argument, recommended name + # initialize with keyword argument check = TestCheck(execute_module=execute_module) - # new recommended name - assert check.execute_module == execute_module - # deprecated attribute name - assert check.module_executor == execute_module + assert check._execute_module == execute_module - # initialize with keyword argument, deprecated name - check = TestCheck(module_executor=execute_module) - # new recommended name - assert check.execute_module == execute_module - # deprecated attribute name - assert check.module_executor == execute_module + assert check.task_vars == {} + assert check.tmp is None def test_subclasses(): @@ -81,19 +70,49 @@ def test_load_checks(): assert modules +def dummy_check(task_vars): + class TestCheck(OpenShiftCheck): + name = "dummy" + run = NotImplemented + + return TestCheck(task_vars=task_vars) + + @pytest.mark.parametrize("keys,expected", [ (("foo",), 42), (("bar", "baz"), "openshift"), + (("bar.baz",), "openshift"), ]) def test_get_var_ok(task_vars, keys, expected): - assert get_var(task_vars, *keys) == expected + assert dummy_check(task_vars).get_var(*keys) == expected def test_get_var_error(task_vars, missing_keys): with pytest.raises(OpenShiftCheckException): - get_var(task_vars, *missing_keys) + dummy_check(task_vars).get_var(*missing_keys) def test_get_var_default(task_vars, missing_keys): default = object() - assert get_var(task_vars, *missing_keys, default=default) == default + assert dummy_check(task_vars).get_var(*missing_keys, default=default) == default + + +@pytest.mark.parametrize("keys, convert, expected", [ + (("foo",), str, "42"), + (("foo",), float, 42.0), + (("bar", "baz"), bool, False), +]) +def test_get_var_convert(task_vars, keys, convert, expected): + assert dummy_check(task_vars).get_var(*keys, convert=convert) == expected + + +@pytest.mark.parametrize("keys, convert", [ + (("bar", "baz"), int), + (("bar.baz"), float), + (("foo"), "bogus"), + (("foo"), lambda a, b: 1), + (("foo"), lambda a: 1 / 0), +]) +def test_get_var_convert_error(task_vars, keys, convert): + with pytest.raises(OpenShiftCheckException): + dummy_check(task_vars).get_var(*keys, convert=convert) diff --git a/roles/openshift_health_checker/test/ovs_version_test.py b/roles/openshift_health_checker/test/ovs_version_test.py index 6494e1c06..e1bf29d2a 100644 --- a/roles/openshift_health_checker/test/ovs_version_test.py +++ b/roles/openshift_health_checker/test/ovs_version_test.py @@ -4,7 +4,7 @@ from openshift_checks.ovs_version import OvsVersion, OpenShiftCheckException def test_openshift_version_not_supported(): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(*_): return {} openshift_release = '111.7.0' @@ -16,15 +16,14 @@ def test_openshift_version_not_supported(): openshift_deployment_type='origin', ) - check = OvsVersion(execute_module=execute_module) with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + OvsVersion(execute_module, task_vars).run() assert "no recommended version of Open vSwitch" in str(excinfo.value) def test_invalid_openshift_release_format(): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(*_): return {} task_vars = dict( @@ -33,15 +32,14 @@ def test_invalid_openshift_release_format(): openshift_deployment_type='origin', ) - check = OvsVersion(execute_module=execute_module) with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + OvsVersion(execute_module, task_vars).run() assert "invalid version" in str(excinfo.value) @pytest.mark.parametrize('openshift_release,expected_ovs_version', [ - ("3.5", "2.6"), - ("3.6", "2.6"), + ("3.5", ["2.6", "2.7"]), + ("3.6", ["2.6", "2.7"]), ("3.4", "2.4"), ("3.3", "2.4"), ("1.0", "2.4"), @@ -54,7 +52,7 @@ def test_ovs_package_version(openshift_release, expected_ovs_version): ) return_value = object() - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, module_args=None, *_): assert module_name == 'rpm_version' assert "package_list" in module_args @@ -64,8 +62,7 @@ def test_ovs_package_version(openshift_release, expected_ovs_version): return return_value - check = OvsVersion(execute_module=execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = OvsVersion(execute_module, task_vars).run() assert result is return_value @@ -86,4 +83,4 @@ def test_ovs_version_skip_when_not_master_nor_node(group_names, is_containerized group_names=group_names, openshift=dict(common=dict(is_containerized=is_containerized)), ) - assert OvsVersion.is_active(task_vars=task_vars) == is_active + assert OvsVersion(None, task_vars).is_active() == is_active diff --git a/roles/openshift_health_checker/test/package_availability_test.py b/roles/openshift_health_checker/test/package_availability_test.py index f7e916a46..1fe648b75 100644 --- a/roles/openshift_health_checker/test/package_availability_test.py +++ b/roles/openshift_health_checker/test/package_availability_test.py @@ -14,7 +14,7 @@ def test_is_active(pkg_mgr, is_containerized, is_active): ansible_pkg_mgr=pkg_mgr, openshift=dict(common=dict(is_containerized=is_containerized)), ) - assert PackageAvailability.is_active(task_vars=task_vars) == is_active + assert PackageAvailability(None, task_vars).is_active() == is_active @pytest.mark.parametrize('task_vars,must_have_packages,must_not_have_packages', [ @@ -51,13 +51,12 @@ def test_is_active(pkg_mgr, is_containerized, is_active): def test_package_availability(task_vars, must_have_packages, must_not_have_packages): return_value = object() - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, module_args=None, *_): assert module_name == 'check_yum_update' assert 'packages' in module_args assert set(module_args['packages']).issuperset(must_have_packages) assert not set(module_args['packages']).intersection(must_not_have_packages) return return_value - check = PackageAvailability(execute_module=execute_module) - result = check.run(tmp=None, task_vars=task_vars) + result = PackageAvailability(execute_module, task_vars).run() assert result is return_value diff --git a/roles/openshift_health_checker/test/package_update_test.py b/roles/openshift_health_checker/test/package_update_test.py index 5e000cff5..06489b0d7 100644 --- a/roles/openshift_health_checker/test/package_update_test.py +++ b/roles/openshift_health_checker/test/package_update_test.py @@ -4,13 +4,12 @@ from openshift_checks.package_update import PackageUpdate def test_package_update(): return_value = object() - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, module_args=None, *_): assert module_name == 'check_yum_update' assert 'packages' in module_args # empty list of packages means "generic check if 'yum update' will work" assert module_args['packages'] == [] return return_value - check = PackageUpdate(execute_module=execute_module) - result = check.run(tmp=None, task_vars=None) + result = PackageUpdate(execute_module).run() assert result is return_value diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py index 91eace512..6054d3f3e 100644 --- a/roles/openshift_health_checker/test/package_version_test.py +++ b/roles/openshift_health_checker/test/package_version_test.py @@ -3,61 +3,56 @@ import pytest from openshift_checks.package_version import PackageVersion, OpenShiftCheckException -@pytest.mark.parametrize('openshift_release, extra_words', [ - ('111.7.0', ["no recommended version of Open vSwitch"]), - ('0.0.0', ["no recommended version of Docker"]), -]) -def test_openshift_version_not_supported(openshift_release, extra_words): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): - return {} - - task_vars = dict( - openshift=dict(common=dict(service_type='origin')), +def task_vars_for(openshift_release, deployment_type): + return dict( + openshift=dict(common=dict(service_type=deployment_type)), openshift_release=openshift_release, openshift_image_tag='v' + openshift_release, - openshift_deployment_type='origin', + openshift_deployment_type=deployment_type, ) - check = PackageVersion(execute_module=execute_module) + +def test_openshift_version_not_supported(): + check = PackageVersion(None, task_vars_for("1.2.3", 'origin')) + check.get_openshift_version_tuple = lambda: (3, 4, 1) # won't be in the dict + with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + check.get_required_ovs_version() + assert "no recommended version of Open vSwitch" in str(excinfo.value) - for word in extra_words: - assert word in str(excinfo.value) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_required_docker_version() + assert "no recommended version of Docker" in str(excinfo.value) def test_invalid_openshift_release_format(): - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): - return {} - task_vars = dict( openshift=dict(common=dict(service_type='origin')), openshift_image_tag='v0', openshift_deployment_type='origin', ) - check = PackageVersion(execute_module=execute_module) + check = PackageVersion(lambda *_: {}, task_vars) with pytest.raises(OpenShiftCheckException) as excinfo: - check.run(tmp=None, task_vars=task_vars) + check.run() assert "invalid version" in str(excinfo.value) @pytest.mark.parametrize('openshift_release', [ - "3.5", + "111.7.0", + "3.7", "3.6", + "3.5.1.2.3", + "3.5", "3.4", "3.3", + "2.1.0", ]) def test_package_version(openshift_release): - task_vars = dict( - openshift=dict(common=dict(service_type='origin')), - openshift_release=openshift_release, - openshift_image_tag='v' + openshift_release, - openshift_deployment_type='origin', - ) + return_value = object() - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *_): assert module_name == 'aos_version' assert "package_list" in module_args @@ -67,59 +62,24 @@ def test_package_version(openshift_release): return return_value - check = PackageVersion(execute_module=execute_module) - result = check.run(tmp=None, task_vars=task_vars) - assert result is return_value - - -@pytest.mark.parametrize('deployment_type,openshift_release,expected_ovs_version', [ - ("openshift-enterprise", "3.5", "2.6"), - ("origin", "3.6", "2.6"), - ("openshift-enterprise", "3.4", "2.4"), - ("origin", "3.3", "2.4"), -]) -def test_ovs_package_version(deployment_type, openshift_release, expected_ovs_version): - task_vars = dict( - openshift=dict(common=dict(service_type='origin')), - openshift_release=openshift_release, - openshift_image_tag='v' + openshift_release, - openshift_deployment_type=deployment_type, - ) - return_value = object() - - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): - assert module_name == 'aos_version' - assert "package_list" in module_args - - for pkg in module_args["package_list"]: - if pkg["name"] == "openvswitch": - assert pkg["version"] == expected_ovs_version - - return return_value - - check = PackageVersion(execute_module=execute_module) - result = check.run(tmp=None, task_vars=task_vars) + check = PackageVersion(execute_module, task_vars_for(openshift_release, 'origin')) + result = check.run() assert result is return_value @pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [ ("origin", "3.5", "1.12"), + ("origin", "1.3", "1.10"), + ("origin", "1.1", "1.8"), ("openshift-enterprise", "3.4", "1.12"), - ("origin", "3.3", "1.10"), ("openshift-enterprise", "3.2", "1.10"), - ("origin", "3.1", "1.8"), ("openshift-enterprise", "3.1", "1.8"), ]) def test_docker_package_version(deployment_type, openshift_release, expected_docker_version): - task_vars = dict( - openshift=dict(common=dict(service_type='origin')), - openshift_release=openshift_release, - openshift_image_tag='v' + openshift_release, - openshift_deployment_type=deployment_type, - ) + return_value = object() - def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None): + def execute_module(module_name=None, module_args=None, *_): assert module_name == 'aos_version' assert "package_list" in module_args @@ -129,8 +89,8 @@ def test_docker_package_version(deployment_type, openshift_release, expected_doc return return_value - check = PackageVersion(execute_module=execute_module) - result = check.run(tmp=None, task_vars=task_vars) + check = PackageVersion(execute_module, task_vars_for(openshift_release, deployment_type)) + result = check.run() assert result is return_value @@ -151,4 +111,4 @@ def test_package_version_skip_when_not_master_nor_node(group_names, is_container group_names=group_names, openshift=dict(common=dict(is_containerized=is_containerized)), ) - assert PackageVersion.is_active(task_vars=task_vars) == is_active + assert PackageVersion(None, task_vars).is_active() == is_active diff --git a/roles/openshift_health_checker/test/rpm_version_test.py b/roles/openshift_health_checker/test/rpm_version_test.py index 2f09ef965..2c1bcf876 100644 --- a/roles/openshift_health_checker/test/rpm_version_test.py +++ b/roles/openshift_health_checker/test/rpm_version_test.py @@ -49,7 +49,7 @@ def test_check_pkg_found(pkgs, expect_not_found): }, { "eggs": { - "required_version": "3.2", + "required_versions": ["3.2"], "found_versions": ["3.3"], } }, # not the right version @@ -61,11 +61,11 @@ def test_check_pkg_found(pkgs, expect_not_found): }, { "eggs": { - "required_version": "3.2", + "required_versions": ["3.2"], "found_versions": ["3.3", "1.2"], }, "spam": { - "required_version": "3.2", + "required_versions": ["3.2"], "found_versions": ["3.1", "3.3"], } }, # not the right version diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py new file mode 100644 index 000000000..724928aa1 --- /dev/null +++ b/roles/openshift_health_checker/test/search_journalctl_test.py @@ -0,0 +1,157 @@ +import pytest +import search_journalctl + + +def canned_search_journalctl(get_log_output=None): + """Create a search_journalctl object with canned get_log_output method""" + module = search_journalctl + if get_log_output: + module.get_log_output = get_log_output + return module + + +DEFAULT_TIMESTAMP = 1496341364 + + +def get_timestamp(modifier=0): + return DEFAULT_TIMESTAMP + modifier + + +def get_timestamp_microseconds(modifier=0): + return get_timestamp(modifier) * 1000000 + + +def create_test_log_object(stamp, msg): + return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg) + + +@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [ + ( + 'test with valid params', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test log message", + "unit": "test", + }, + ], + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + ["test log message"], + [], + ), + ( + 'test with invalid json in log input', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test log message", + "unit": "test-unit", + }, + ], + [ + '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}', + ], + [], + [ + ["invalid json", "test-unit", "test log message"], + ], + ), + ( + 'test with invalid regexp', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test [ log message", + "unit": "test", + }, + ], + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + [], + [ + ["invalid regular expression"], + ], + ), +], ids=lambda argval: argval[0]) +def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors): + def get_log_output(matcher): + return log_input + + module = canned_search_journalctl(get_log_output) + matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60) + + assert set(matched_regexp) == set(expected_matches) + assert len(expected_errors) == len(errors) + + for idx, partial_err_set in enumerate(expected_errors): + for partial_err_msg in partial_err_set: + assert partial_err_msg in errors[idx] + + +@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [ + ( + 'test with matching log message, but out of bounds of log_count_lim', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 3, + get_timestamp(-100 * 60 * 60), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + None, + ), + ( + 'test with matching log message, but with timestamp too old', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 100, + get_timestamp(-10), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), + ], + None, + ), + ( + 'test with matching log message, and timestamp within time limit', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 100, + get_timestamp(-1010), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), + ], + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + ), +], ids=lambda argval: argval[0]) +def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match): + match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds) + assert match == expected_match |