diff options
Diffstat (limited to 'roles/openshift_health_checker')
62 files changed, 10354 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/HOWTO_CHECKS.md b/roles/openshift_health_checker/HOWTO_CHECKS.md new file mode 100644 index 000000000..6c5662a4e --- /dev/null +++ b/roles/openshift_health_checker/HOWTO_CHECKS.md @@ -0,0 +1,34 @@ +# OpenShift health checks + +This Ansible role contains health checks to diagnose problems in OpenShift +environments. + +Checks are typically implemented as two parts: + +1. a Python module in [openshift_checks/](openshift_checks), with a class that + inherits from `OpenShiftCheck`. +2. a custom Ansible module in [library/](library), for cases when the modules + shipped with Ansible do not provide the required functionality. + +The checks are called from Ansible playbooks via the `openshift_health_check` +action plugin. See +[playbooks/byo/openshift-preflight/check.yml](../../playbooks/byo/openshift-preflight/check.yml) +for an example. + +The action plugin dynamically discovers all checks and executes only those +selected in the play. + +Checks can determine when they are active by implementing the method +`is_active`. Inactive checks are skipped. This is similar to the `when` +instruction in Ansible plays. + +Checks may have tags, which are a way to group related checks together. For +instance, to run all preflight checks, pass in the group `'@preflight'` to +`openshift_health_check`. + +Groups are automatically computed from tags. + +Groups and individual check names can be used together in the argument list to +`openshift_health_check`. + +Look at existing checks for the implementation details. diff --git a/roles/openshift_health_checker/README.md b/roles/openshift_health_checker/README.md new file mode 100644 index 000000000..4ab5f1f7b --- /dev/null +++ b/roles/openshift_health_checker/README.md @@ -0,0 +1,45 @@ +OpenShift Health Checker +======================== + +This role detects common problems with OpenShift installations or with +environments prior to install. + +For more information about creating new checks, see [HOWTO_CHECKS.md](HOWTO_CHECKS.md). + +Requirements +------------ + +* Ansible 2.2+ + +Role Variables +-------------- + +None + +Dependencies +------------ + +- openshift_facts + +Example Playbook +---------------- + +```yaml +--- +- hosts: OSEv3 + name: run OpenShift health checks + roles: + - openshift_health_checker + post_tasks: + - action: openshift_health_check +``` + +License +------- + +Apache License Version 2.0 + +Author Information +------------------ + +Customer Success team (dev@lists.openshift.redhat.com) diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py new file mode 100644 index 000000000..326176273 --- /dev/null +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -0,0 +1,350 @@ +""" +Ansible action plugin to execute health checks in OpenShift clusters. +""" +import sys +import os +import base64 +import traceback +import errno +import json +from collections import defaultdict + +from ansible.plugins.action import ActionBase +from ansible.module_utils.six import string_types + +try: + from __main__ import display +except ImportError: + # pylint: disable=ungrouped-imports; this is the standard way how to import + # the default display object in Ansible action plugins. + from ansible.utils.display import Display + display = Display() + +# Augment sys.path so that we can import checks from a directory relative to +# this callback plugin. +sys.path.insert(1, os.path.dirname(os.path.dirname(__file__))) + +# pylint: disable=wrong-import-position; the import statement must come after +# the manipulation of sys.path. +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, load_checks # noqa: E402 + + +class ActionModule(ActionBase): + """Action plugin to execute health checks.""" + + def run(self, tmp=None, task_vars=None): + result = super(ActionModule, self).run(tmp, task_vars) + task_vars = task_vars or {} + + # callback plugins cannot read Ansible vars, but we would like + # zz_failure_summary to have access to certain values. We do so by + # storing the information we need in the result. + result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context') + + # if the user wants to write check results to files, they provide this directory: + output_dir = task_vars.get("openshift_checks_output_dir") + if output_dir: + output_dir = os.path.join(output_dir, task_vars["ansible_host"]) + + try: + known_checks = self.load_known_checks(tmp, task_vars, output_dir) + args = self._task.args + requested_checks = normalize(args.get('checks', [])) + + if not requested_checks: + result['failed'] = True + result['msg'] = list_known_checks(known_checks) + return result + + resolved_checks = resolve_checks(requested_checks, known_checks.values()) + except OpenShiftCheckException as exc: + result["failed"] = True + result["msg"] = str(exc) + return result + + if "openshift" not in task_vars: + result["failed"] = True + result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?" + return result + + result["checks"] = check_results = {} + + user_disabled_checks = normalize(task_vars.get('openshift_disable_check', [])) + + for name in resolved_checks: + display.banner("CHECK [{} : {}]".format(name, task_vars["ansible_host"])) + check_results[name] = run_check(name, known_checks[name], user_disabled_checks, output_dir) + + result["changed"] = any(r.get("changed") for r in check_results.values()) + if any(r.get("failed") for r in check_results.values()): + result["failed"] = True + result["msg"] = "One or more checks failed" + write_result_to_output_dir(output_dir, result) + + return result + + def load_known_checks(self, tmp, task_vars, output_dir=None): + """Find all existing checks and return a mapping of names to instances.""" + load_checks() + want_full_results = bool(output_dir) + + known_checks = {} + for cls in OpenShiftCheck.subclasses(): + name = cls.name + if name in known_checks: + other_cls = known_checks[name].__class__ + raise OpenShiftCheckException( + "duplicate check name '{}' in: '{}' and '{}'" + "".format(name, full_class_name(cls), full_class_name(other_cls)) + ) + known_checks[name] = cls( + execute_module=self._execute_module, + tmp=tmp, + task_vars=task_vars, + want_full_results=want_full_results + ) + return known_checks + + +def list_known_checks(known_checks): + """Return text listing the existing checks and tags.""" + # TODO: we could include a description of each check by taking it from a + # check class attribute (e.g., __doc__) when building the message below. + msg = ( + 'This playbook is meant to run health checks, but no checks were ' + 'requested. Set the `openshift_checks` variable to a comma-separated ' + 'list of check names or a YAML list. Available checks:\n {}' + ).format('\n '.join(sorted(known_checks))) + + tags = describe_tags(known_checks.values()) + + msg += ( + '\n\nTags can be used as a shortcut to select multiple ' + 'checks. Available tags and the checks they select:\n {}' + ).format('\n '.join(tags)) + + return msg + + +def describe_tags(check_classes): + """Return a sorted list of strings describing tags and the checks they include.""" + tag_checks = defaultdict(list) + for cls in check_classes: + for tag in cls.tags: + tag_checks[tag].append(cls.name) + tags = [ + '@{} = {}'.format(tag, ','.join(sorted(checks))) + for tag, checks in tag_checks.items() + ] + return sorted(tags) + + +def resolve_checks(names, all_checks): + """Returns a set of resolved check names. + + Resolving a check name expands tag references (e.g., "@tag") to all the + checks that contain the given tag. OpenShiftCheckException is raised if + names contains an unknown check or tag name. + + names should be a sequence of strings. + + all_checks should be a sequence of check classes/instances. + """ + known_check_names = set(check.name for check in all_checks) + known_tag_names = set(name for check in all_checks for name in check.tags) + + check_names = set(name for name in names if not name.startswith('@')) + tag_names = set(name[1:] for name in names if name.startswith('@')) + + unknown_check_names = check_names - known_check_names + unknown_tag_names = tag_names - known_tag_names + + if unknown_check_names or unknown_tag_names: + msg = [] + if unknown_check_names: + msg.append('Unknown check names: {}.'.format(', '.join(sorted(unknown_check_names)))) + if unknown_tag_names: + msg.append('Unknown tag names: {}.'.format(', '.join(sorted(unknown_tag_names)))) + msg.append('Make sure there is no typo in the playbook and no files are missing.') + # TODO: implement a "Did you mean ...?" when the input is similar to a + # valid check or tag. + msg.append('Known checks:') + msg.append(' {}'.format('\n '.join(sorted(known_check_names)))) + msg.append('Known tags:') + msg.append(' {}'.format('\n '.join(describe_tags(all_checks)))) + raise OpenShiftCheckException('\n'.join(msg)) + + tag_to_checks = defaultdict(set) + for check in all_checks: + for tag in check.tags: + tag_to_checks[tag].add(check.name) + + resolved = check_names.copy() + for tag in tag_names: + resolved.update(tag_to_checks[tag]) + + return resolved + + +def normalize(checks): + """Return a clean list of check names. + + The input may be a comma-separated string or a sequence. Leading and + trailing whitespace characters are removed. Empty items are discarded. + """ + if isinstance(checks, string_types): + checks = checks.split(',') + return [name.strip() for name in checks if name.strip()] + + +def run_check(name, check, user_disabled_checks, output_dir=None): + """Run a single check if enabled and return a result dict.""" + + # determine if we're going to run the check (not inactive or disabled) + if name in user_disabled_checks or '*' in user_disabled_checks: + return dict(skipped=True, skipped_reason="Disabled by user request") + + # pylint: disable=broad-except; capturing exceptions broadly is intentional, + # to isolate arbitrary failures in one check from others. + try: + is_active = check.is_active() + except Exception as exc: + reason = "Could not determine if check should be run, exception: {}".format(exc) + return dict(skipped=True, skipped_reason=reason, exception=traceback.format_exc()) + + if not is_active: + return dict(skipped=True, skipped_reason="Not active for this host") + + # run the check + result = {} + try: + result = check.run() + except OpenShiftCheckException as exc: + check.register_failure(exc) + except Exception as exc: + check.register_failure("\n".join([str(exc), traceback.format_exc()])) + + # process the check state; compose the result hash, write files as needed + if check.changed: + result["changed"] = True + if check.failures or result.get("failed"): + if "msg" in result: # failure result has msg; combine with any registered failures + check.register_failure(result.get("msg")) + result["failures"] = [(fail.name, str(fail)) for fail in check.failures] + result["failed"] = True + result["msg"] = "\n".join(str(fail) for fail in check.failures) + write_to_output_file(output_dir, name + ".failures.json", result["failures"]) + if check.logs: + write_to_output_file(output_dir, name + ".log.json", check.logs) + if check.files_to_save: + write_files_to_save(output_dir, check) + + return result + + +def prepare_output_dir(dirname): + """Create the directory, including parents. Return bool for success/failure.""" + try: + os.makedirs(dirname) + return True + except OSError as exc: + # trying to create existing dir leads to error; + # that error is fine, but for any other, assume the dir is not there + return exc.errno == errno.EEXIST + + +def copy_remote_file_to_dir(check, file_to_save, output_dir, fname): + """Copy file from remote host to local file in output_dir, if given.""" + if not output_dir or not prepare_output_dir(output_dir): + return + local_file = os.path.join(output_dir, fname) + + # pylint: disable=broad-except; do not need to do anything about failure to write dir/file + # and do not want exceptions to break anything. + try: + # NOTE: it would have been nice to copy the file directly without loading it into + # memory, but there does not seem to be a good way to do this via ansible. + result = check.execute_module("slurp", dict(src=file_to_save), register=False) + if result.get("failed"): + display.warning("Could not retrieve file {}: {}".format(file_to_save, result.get("msg"))) + return + + content = result["content"] + if result.get("encoding") == "base64": + content = base64.b64decode(content) + with open(local_file, "wb") as outfile: + outfile.write(content) + except Exception as exc: + display.warning("Failed writing remote {} to local {}: {}".format(file_to_save, local_file, exc)) + return + + +def _no_fail(obj): + # pylint: disable=broad-except; do not want serialization to fail for any reason + try: + return str(obj) + except Exception: + return "[not serializable]" + + +def write_to_output_file(output_dir, filename, data): + """If output_dir provided, write data to file. Serialize as JSON if data is not a string.""" + + if not output_dir or not prepare_output_dir(output_dir): + return + filename = os.path.join(output_dir, filename) + try: + with open(filename, 'w') as outfile: + if isinstance(data, string_types): + outfile.write(data) + else: + json.dump(data, outfile, sort_keys=True, indent=4, default=_no_fail) + # pylint: disable=broad-except; do not want serialization/write to break for any reason + except Exception as exc: + display.warning("Could not write output file {}: {}".format(filename, exc)) + + +def write_result_to_output_dir(output_dir, result): + """If output_dir provided, write the result as json to result.json. + + Success/failure of the write is recorded as "output_files" in the result hash afterward. + Otherwise this is much like write_to_output_file. + """ + + if not output_dir: + return + if not prepare_output_dir(output_dir): + result["output_files"] = "Error creating output directory " + output_dir + return + + filename = os.path.join(output_dir, "result.json") + try: + with open(filename, 'w') as outfile: + json.dump(result, outfile, sort_keys=True, indent=4, default=_no_fail) + result["output_files"] = "Check results for this host written to " + filename + # pylint: disable=broad-except; do not want serialization/write to break for any reason + except Exception as exc: + result["output_files"] = "Error writing check results to {}:\n{}".format(filename, exc) + + +def write_files_to_save(output_dir, check): + """Write files to check subdir in output dir.""" + if not output_dir: + return + output_dir = os.path.join(output_dir, check.name) + seen_file = defaultdict(lambda: 0) + for file_to_save in check.files_to_save: + fname = file_to_save.filename + while seen_file[fname]: # just to be sure we never re-write a file, append numbers as needed + seen_file[fname] += 1 + fname = "{}.{}".format(fname, seen_file[fname]) + seen_file[fname] += 1 + if file_to_save.remote_filename: + copy_remote_file_to_dir(check, file_to_save.remote_filename, output_dir, fname) + else: + write_to_output_file(output_dir, fname, file_to_save.contents) + + +def full_class_name(cls): + """Return the name of a class prefixed with its module name.""" + return '{}.{}'.format(cls.__module__, cls.__name__) diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py new file mode 100644 index 000000000..dcaf87eca --- /dev/null +++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py @@ -0,0 +1,235 @@ +"""Ansible callback plugin to print a nicely formatted summary of failures. + +The file / module name is prefixed with `zz_` to make this plugin be loaded last +by Ansible, thus making its output the last thing that users see. +""" + +from collections import defaultdict +import traceback + +from ansible.plugins.callback import CallbackBase +from ansible import constants as C +from ansible.utils.color import stringc +from ansible.module_utils.six import string_types + + +FAILED_NO_MSG = u'Failed without returning a message.' + + +class CallbackModule(CallbackBase): + """This callback plugin stores task results and summarizes failures.""" + + CALLBACK_VERSION = 2.0 + CALLBACK_TYPE = 'aggregate' + CALLBACK_NAME = 'failure_summary' + CALLBACK_NEEDS_WHITELIST = False + + def __init__(self): + super(CallbackModule, self).__init__() + self.__failures = [] + self.__playbook_file = '' + + def v2_playbook_on_start(self, playbook): + super(CallbackModule, self).v2_playbook_on_start(playbook) + # pylint: disable=protected-access; Ansible gives us no public API to + # get the file name of the current playbook from a callback plugin. + self.__playbook_file = playbook._file_name + + def v2_runner_on_failed(self, result, ignore_errors=False): + super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors) + if not ignore_errors: + self.__failures.append(result) + + def v2_playbook_on_stats(self, stats): + super(CallbackModule, self).v2_playbook_on_stats(stats) + # pylint: disable=broad-except; capturing exceptions broadly is + # intentional, to isolate arbitrary failures in this callback plugin. + try: + if self.__failures: + self._display.display(failure_summary(self.__failures, self.__playbook_file)) + except Exception: + msg = stringc( + u'An error happened while generating a summary of failures:\n' + u'{}'.format(traceback.format_exc()), C.COLOR_WARN) + self._display.v(msg) + + +def failure_summary(failures, playbook): + """Return a summary of failed tasks, including details on health checks.""" + if not failures: + return u'' + + # NOTE: because we don't have access to task_vars from callback plugins, we + # store the playbook context in the task result when the + # openshift_health_check action plugin is used, and we use this context to + # customize the error message. + # pylint: disable=protected-access; Ansible gives us no sufficient public + # API on TaskResult objects. + context = next(( + context for context in + (failure._result.get('playbook_context') for failure in failures) + if context + ), None) + + failures = [failure_to_dict(failure) for failure in failures] + failures = deduplicate_failures(failures) + + summary = [u'', u'', u'Failure summary:', u''] + + width = len(str(len(failures))) + initial_indent_format = u' {{:>{width}}}. '.format(width=width) + initial_indent_len = len(initial_indent_format.format(0)) + subsequent_indent = u' ' * initial_indent_len + subsequent_extra_indent = u' ' * (initial_indent_len + 10) + + for i, failure in enumerate(failures, 1): + entries = format_failure(failure) + summary.append(u'\n{}{}'.format(initial_indent_format.format(i), entries[0])) + for entry in entries[1:]: + entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent) + indented = u'{}{}'.format(subsequent_indent, entry) + summary.append(indented) + + failed_checks = set() + for failure in failures: + failed_checks.update(name for name, message in failure['checks']) + if failed_checks: + summary.append(check_failure_footer(failed_checks, context, playbook)) + + return u'\n'.join(summary) + + +def failure_to_dict(failed_task_result): + """Extract information out of a failed TaskResult into a dict. + + The intent is to transform a TaskResult object into something easier to + manipulate. TaskResult is ansible.executor.task_result.TaskResult. + """ + # pylint: disable=protected-access; Ansible gives us no sufficient public + # API on TaskResult objects. + _result = failed_task_result._result + return { + 'host': failed_task_result._host.get_name(), + 'play': play_name(failed_task_result._task), + 'task': failed_task_result.task_name, + 'msg': _result.get('msg', FAILED_NO_MSG), + 'checks': tuple( + (name, result.get('msg', FAILED_NO_MSG)) + for name, result in sorted(_result.get('checks', {}).items()) + if result.get('failed') + ), + } + + +def play_name(obj): + """Given a task or block, return the name of its parent play. + + This is loosely inspired by ansible.playbook.base.Base.dump_me. + """ + # pylint: disable=protected-access; Ansible gives us no sufficient public + # API to implement this. + if not obj: + return '' + if hasattr(obj, '_play'): + return obj._play.get_name() + return play_name(getattr(obj, '_parent')) + + +def deduplicate_failures(failures): + """Group together similar failures from different hosts. + + Returns a new list of failures such that identical failures from different + hosts are grouped together in a single entry. The relative order of failures + is preserved. + + If failures is unhashable, the original list of failures is returned. + """ + groups = defaultdict(list) + for failure in failures: + group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host')) + try: + groups[group_key].append(failure) + except TypeError: + # abort and return original list of failures when failures has an + # unhashable type. + return failures + + result = [] + for failure in failures: + group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host')) + if group_key not in groups: + continue + failure['host'] = tuple(sorted(g_failure['host'] for g_failure in groups.pop(group_key))) + result.append(failure) + return result + + +def format_failure(failure): + """Return a list of pretty-formatted text entries describing a failure, including + relevant information about it. Expect that the list of text entries will be joined + by a newline separator when output to the user.""" + if isinstance(failure['host'], string_types): + host = failure['host'] + else: + host = u', '.join(failure['host']) + play = failure['play'] + task = failure['task'] + msg = failure['msg'] + checks = failure['checks'] + fields = ( + (u'Hosts', host), + (u'Play', play), + (u'Task', task), + (u'Message', stringc(msg, C.COLOR_ERROR)), + ) + if checks: + fields += ((u'Details', format_failed_checks(checks)),) + row_format = '{:10}{}' + return [row_format.format(header + u':', body) for header, body in fields] + + +def format_failed_checks(checks): + """Return pretty-formatted text describing checks that failed.""" + messages = [] + for name, message in checks: + messages.append(u'check "{}":\n{}'.format(name, message)) + return stringc(u'\n\n'.join(messages), C.COLOR_ERROR) + + +def check_failure_footer(failed_checks, context, playbook): + """Return a textual explanation about checks depending on context. + + The purpose of specifying context is to vary the output depending on what + the user was expecting to happen (based on which playbook they ran). The + only use currently is to vary the message depending on whether the user was + deliberately running checks or was trying to install/upgrade and checks are + just included. Other use cases may arise. + """ + checks = ','.join(sorted(failed_checks)) + summary = [u''] + if context in ['pre-install', 'health', 'adhoc']: + # User was expecting to run checks, less explanation needed. + summary.extend([ + u'You may configure or disable checks by setting Ansible ' + u'variables. To disable those above, set:', + u' openshift_disable_check={checks}'.format(checks=checks), + u'Consult check documentation for configurable variables.', + ]) + else: + # User may not be familiar with the checks, explain what checks are in + # the first place. + summary.extend([ + u'The execution of "{playbook}" includes checks designed to fail ' + u'early if the requirements of the playbook are not met. One or ' + u'more of these checks failed. To disregard these results,' + u'explicitly disable checks by setting an Ansible variable:'.format(playbook=playbook), + u' openshift_disable_check={checks}'.format(checks=checks), + u'Failing check names are shown in the failure details above. ' + u'Some checks may be configurable by variables if your requirements ' + u'are different from the defaults; consult check documentation.', + ]) + summary.append( + u'Variables can be set in the inventory or passed on the command line ' + u'using the -e flag to ansible-playbook.' + ) + return u'\n'.join(summary) diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py new file mode 100644 index 000000000..db3c0b654 --- /dev/null +++ b/roles/openshift_health_checker/library/aos_version.py @@ -0,0 +1,255 @@ +#!/usr/bin/python +""" +Ansible module for yum-based systems determining if multiple releases +of an OpenShift package are available, and if the release requested +(if any) is available down to the given precision. + +For Enterprise, multiple releases available suggest that multiple repos +are enabled for the different releases, which may cause installation +problems. With Origin, however, this is a normal state of affairs as +all the releases are provided in a single repo with the expectation that +only the latest can be installed. + +Code in the openshift_version role contains a lot of logic to pin down +the exact package and image version to use and so does some validation +of release availability already. Without duplicating all that, we would +like the user to have a helpful error message if we detect things will +not work out right. Note that if openshift_release is not specified in +the inventory, the version comparison checks just pass. +""" + +from ansible.module_utils.basic import AnsibleModule +# NOTE: because of the dependency on yum (Python 2-only), this module does not +# work under Python 3. But since we run unit tests against both Python 2 and +# Python 3, we use six for cross compatibility in this module alone: +from ansible.module_utils.six import string_types + +YUM_IMPORT_EXCEPTION = None +DNF_IMPORT_EXCEPTION = None +try: + import yum # pylint: disable=import-error +except ImportError as err: + YUM_IMPORT_EXCEPTION = err + +try: + import dnf # pylint: disable=import-error +except ImportError as err: + DNF_IMPORT_EXCEPTION = err + + +class AosVersionException(Exception): + """Base exception class for package version problems""" + def __init__(self, message, problem_pkgs=None): + Exception.__init__(self, message) + self.problem_pkgs = problem_pkgs + + +def main(): + """Entrypoint for this Ansible module""" + module = AnsibleModule( + argument_spec=dict( + package_list=dict(type="list", required=True), + package_mgr=dict(type="str", required=True), + ), + supports_check_mode=True + ) + + # determine the package manager to use + package_mgr = module.params['package_mgr'] + if package_mgr not in ('yum', 'dnf'): + module.fail_json(msg="package_mgr must be one of: yum, dnf") + pkg_mgr_exception = dict(yum=YUM_IMPORT_EXCEPTION, dnf=DNF_IMPORT_EXCEPTION)[package_mgr] + if pkg_mgr_exception: + module.fail_json( + msg="aos_version module could not import {}: {}".format(package_mgr, pkg_mgr_exception) + ) + + # determine the packages we will look for + package_list = module.params['package_list'] + if not package_list: + module.fail_json(msg="package_list must not be empty") + + # generate set with only the names of expected packages + expected_pkg_names = [p["name"] for p in package_list] + + # gather packages that require a multi_minor_release check + multi_minor_pkgs = [p for p in package_list if p["check_multi"]] + + # generate list of packages with a specified (non-empty) version + # should look like a version string with possibly many segments e.g. "3.4.1" + versioned_pkgs = [p for p in package_list if p["version"]] + + # get the list of packages available and complain if anything is wrong + try: + pkgs = _retrieve_available_packages(package_mgr, expected_pkg_names) + if versioned_pkgs: + _check_precise_version_found(pkgs, _to_dict(versioned_pkgs)) + _check_higher_version_found(pkgs, _to_dict(versioned_pkgs)) + if multi_minor_pkgs: + _check_multi_minor_release(pkgs, _to_dict(multi_minor_pkgs)) + except AosVersionException as excinfo: + module.fail_json(msg=str(excinfo)) + module.exit_json(changed=False) + + +def _to_dict(pkg_list): + return {pkg["name"]: pkg for pkg in pkg_list} + + +def _retrieve_available_packages(pkg_mgr, expected_pkgs): + # The openshift excluder prevents unintended updates to openshift + # packages by setting yum excludes on those packages. See: + # https://wiki.centos.org/SpecialInterestGroup/PaaS/OpenShift-Origin-Control-Updates + # Excludes are then disabled during an install or upgrade, but + # this check will most likely be running outside either. When we + # attempt to determine what packages are available via yum they may + # be excluded. So, for our purposes here, disable excludes to see + # what will really be available during an install or upgrade. + + if pkg_mgr == "yum": + # search for package versions available for openshift pkgs + yb = yum.YumBase() # pylint: disable=invalid-name + + yb.conf.disable_excludes = ['all'] + + try: + pkgs = yb.rpmdb.returnPackages(patterns=expected_pkgs) + pkgs += yb.pkgSack.returnPackages(patterns=expected_pkgs) + except yum.Errors.PackageSackError as excinfo: + # you only hit this if *none* of the packages are available + raise AosVersionException('\n'.join([ + 'Unable to find any OpenShift packages.', + 'Check your subscription and repo settings.', + str(excinfo), + ])) + elif pkg_mgr == "dnf": + dbase = dnf.Base() # pyling: disable=invalid-name + + dbase.conf.disable_excludes = ['all'] + dbase.read_all_repos() + dbase.fill_sack(load_system_repo=False, load_available_repos=True) + + dquery = dbase.sack.query() + aquery = dquery.available() + iquery = dquery.installed() + + available_pkgs = list(aquery.filter(name=expected_pkgs)) + installed_pkgs = list(iquery.filter(name=expected_pkgs)) + pkgs = available_pkgs + installed_pkgs + + if not pkgs: + # pkgs list is empty, raise because no expected packages found + raise AosVersionException('\n'.join([ + 'Unable to find any OpenShift packages.', + 'Check your subscription and repo settings.', + ])) + + return pkgs + + +class PreciseVersionNotFound(AosVersionException): + """Exception for reporting packages not available at given version""" + def __init__(self, not_found): + msg = ['Not all of the required packages are available at their requested version'] + msg += ['{}:{} '.format(pkg["name"], pkg["version"]) for pkg in not_found] + msg += ['Please check your subscriptions and enabled repositories.'] + AosVersionException.__init__(self, '\n'.join(msg), not_found) + + +def _check_precise_version_found(pkgs, expected_pkgs_dict): + # see if any packages couldn't be found at requested release version + # we would like to verify that the latest available pkgs have however specific a version is given. + # so e.g. if there is a package version 3.4.1.5 the check passes; if only 3.4.0, it fails. + + pkgs_precise_version_found = set() + for pkg in pkgs: + if pkg.name not in expected_pkgs_dict: + continue + expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"] + if isinstance(expected_pkg_versions, string_types): + expected_pkg_versions = [expected_pkg_versions] + for expected_pkg_version in expected_pkg_versions: + # does the version match, to the precision requested? + # and, is it strictly greater, at the precision requested? + match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1]) + if match_version == expected_pkg_version: + pkgs_precise_version_found.add(pkg.name) + + not_found = [] + for name, pkg in expected_pkgs_dict.items(): + if name not in pkgs_precise_version_found: + not_found.append(pkg) + + if not_found: + raise PreciseVersionNotFound(not_found) + + +class FoundHigherVersion(AosVersionException): + """Exception for reporting that a higher version than requested is available""" + def __init__(self, higher_found): + msg = ['Some required package(s) are available at a version', + 'that is higher than requested'] + msg += [' ' + name for name in higher_found] + msg += ['This will prevent installing the version you requested.'] + msg += ['Please check your enabled repositories or adjust openshift_release.'] + AosVersionException.__init__(self, '\n'.join(msg), higher_found) + + +def _check_higher_version_found(pkgs, expected_pkgs_dict): + expected_pkg_names = list(expected_pkgs_dict) + + # see if any packages are available in a version higher than requested + higher_version_for_pkg = {} + for pkg in pkgs: + if pkg.name not in expected_pkg_names: + continue + expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"] + if isinstance(expected_pkg_versions, string_types): + expected_pkg_versions = [expected_pkg_versions] + # NOTE: the list of versions is assumed to be sorted so that the highest + # desirable version is the last. + highest_desirable_version = expected_pkg_versions[-1] + req_release_arr = [int(segment) for segment in highest_desirable_version.split(".")] + version = [int(segment) for segment in pkg.version.split(".")] + too_high = version[:len(req_release_arr)] > req_release_arr + higher_than_seen = version > higher_version_for_pkg.get(pkg.name, []) + if too_high and higher_than_seen: + higher_version_for_pkg[pkg.name] = version + + if higher_version_for_pkg: + higher_found = [] + for name, version in higher_version_for_pkg.items(): + higher_found.append(name + '-' + '.'.join(str(segment) for segment in version)) + raise FoundHigherVersion(higher_found) + + +class FoundMultiRelease(AosVersionException): + """Exception for reporting multiple minor releases found for same package""" + def __init__(self, multi_found): + msg = ['Multiple minor versions of these packages are available'] + msg += [' ' + name for name in multi_found] + msg += ["There should only be one OpenShift release repository enabled at a time."] + AosVersionException.__init__(self, '\n'.join(msg), multi_found) + + +def _check_multi_minor_release(pkgs, expected_pkgs_dict): + # see if any packages are available in more than one minor version + pkgs_by_name_version = {} + for pkg in pkgs: + # keep track of x.y (minor release) versions seen + minor_release = '.'.join(pkg.version.split('.')[:2]) + if pkg.name not in pkgs_by_name_version: + pkgs_by_name_version[pkg.name] = set() + pkgs_by_name_version[pkg.name].add(minor_release) + + multi_found = [] + for name in expected_pkgs_dict: + if name in pkgs_by_name_version and len(pkgs_by_name_version[name]) > 1: + multi_found.append(name) + + if multi_found: + raise FoundMultiRelease(multi_found) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py new file mode 100644 index 000000000..433795b67 --- /dev/null +++ b/roles/openshift_health_checker/library/check_yum_update.py @@ -0,0 +1,105 @@ +#!/usr/bin/python +''' +Ansible module to test whether a yum update or install will succeed, +without actually performing it or running yum. +parameters: + packages: (optional) A list of package names to install or update. + If omitted, all installed RPMs are considered for updates. +''' + +import sys + +import yum # pylint: disable=import-error + +from ansible.module_utils.basic import AnsibleModule + + +def main(): # pylint: disable=missing-docstring,too-many-branches + module = AnsibleModule( + argument_spec=dict( + packages=dict(type='list', default=[]) + ), + supports_check_mode=True + ) + + def bail(error): # pylint: disable=missing-docstring + module.fail_json(msg=error) + + yb = yum.YumBase() # pylint: disable=invalid-name + yb.conf.disable_excludes = ["all"] # assume the openshift excluder will be managed, ignore current state + # determine if the existing yum configuration is valid + try: + yb.repos.populateSack(mdtype='metadata', cacheonly=1) + # for error of type: + # 1. can't reach the repo URL(s) + except yum.Errors.NoMoreMirrorsRepoError as e: # pylint: disable=invalid-name + bail('Error getting data from at least one yum repository: %s' % e) + # 2. invalid repo definition + except yum.Errors.RepoError as e: # pylint: disable=invalid-name + bail('Error with yum repository configuration: %s' % e) + # 3. other/unknown + # * just report the problem verbatim + except: # pylint: disable=bare-except; # noqa + bail('Unexpected error with yum repository: %s' % sys.exc_info()[1]) + + packages = module.params['packages'] + no_such_pkg = [] + for pkg in packages: + try: + yb.install(name=pkg) + except yum.Errors.InstallError as e: # pylint: disable=invalid-name + no_such_pkg.append(pkg) + except: # pylint: disable=bare-except; # noqa + bail('Unexpected error with yum install/update: %s' % + sys.exc_info()[1]) + if not packages: + # no packages requested means test a yum update of everything + yb.update() + elif no_such_pkg: + # wanted specific packages to install but some aren't available + user_msg = 'Cannot install all of the necessary packages. Unavailable:\n' + for pkg in no_such_pkg: + user_msg += ' %s\n' % pkg + user_msg += 'You may need to enable one or more yum repositories to make this content available.' + bail(user_msg) + + try: + txn_result, txn_msgs = yb.buildTransaction() + except: # pylint: disable=bare-except; # noqa + bail('Unexpected error during dependency resolution for yum update: \n %s' % + sys.exc_info()[1]) + + # find out if there are any errors with the update/install + if txn_result == 0: # 'normal exit' meaning there's nothing to install/update + pass + elif txn_result == 1: # error with transaction + user_msg = 'Could not perform a yum update.\n' + if len(txn_msgs) > 0: + user_msg += 'Errors from dependency resolution:\n' + for msg in txn_msgs: + user_msg += ' %s\n' % msg + user_msg += 'You should resolve these issues before proceeding with an install.\n' + user_msg += 'You may need to remove or downgrade packages or enable/disable yum repositories.' + bail(user_msg) + # TODO: it would be nice depending on the problem: + # 1. dependency for update not found + # * construct the dependency tree + # * find the installed package(s) that required the missing dep + # * determine if any of these packages matter to openshift + # * build helpful error output + # 2. conflicts among packages in available content + # * analyze dependency tree and build helpful error output + # 3. other/unknown + # * report the problem verbatim + # * add to this list as we come across problems we can clearly diagnose + elif txn_result == 2: # everything resolved fine + pass + else: + bail('Unknown error(s) from dependency resolution. Exit Code: %d:\n%s' % + (txn_result, txn_msgs)) + + module.exit_json(changed=False) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/docker_container.py b/roles/openshift_health_checker/library/docker_container.py new file mode 100644 index 000000000..f81b4ec01 --- /dev/null +++ b/roles/openshift_health_checker/library/docker_container.py @@ -0,0 +1,2036 @@ +#!/usr/bin/python +# pylint: skip-file +# flake8: noqa + +# TODO: remove this file once openshift-ansible requires ansible >= 2.3. +# This file is a copy of +# https://github.com/ansible/ansible/blob/20bf02f/lib/ansible/modules/cloud/docker/docker_container.py. +# It has been temporarily vendored here due to issue https://github.com/ansible/ansible/issues/22323. + + +# Copyright 2016 Red Hat | Ansible +# +# This file is part of Ansible +# +# Ansible is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Ansible is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Ansible. If not, see <http://www.gnu.org/licenses/>. + +ANSIBLE_METADATA = {'status': ['preview'], + 'supported_by': 'committer', + 'version': '1.0'} + +DOCUMENTATION = ''' +--- +module: docker_container + +short_description: manage docker containers + +description: + - Manage the life cycle of docker containers. + - Supports check mode. Run with --check and --diff to view config difference and list of actions to be taken. + +version_added: "2.1" + +options: + blkio_weight: + description: + - Block IO (relative weight), between 10 and 1000. + default: null + required: false + capabilities: + description: + - List of capabilities to add to the container. + default: null + required: false + cleanup: + description: + - Use with I(detach) to remove the container after successful execution. + default: false + required: false + version_added: "2.2" + command: + description: + - Command to execute when the container starts. + default: null + required: false + cpu_period: + description: + - Limit CPU CFS (Completely Fair Scheduler) period + default: 0 + required: false + cpu_quota: + description: + - Limit CPU CFS (Completely Fair Scheduler) quota + default: 0 + required: false + cpuset_cpus: + description: + - CPUs in which to allow execution C(1,3) or C(1-3). + default: null + required: false + cpuset_mems: + description: + - Memory nodes (MEMs) in which to allow execution C(0-3) or C(0,1) + default: null + required: false + cpu_shares: + description: + - CPU shares (relative weight). + default: null + required: false + detach: + description: + - Enable detached mode to leave the container running in background. + If disabled, the task will reflect the status of the container run (failed if the command failed). + default: true + required: false + devices: + description: + - "List of host device bindings to add to the container. Each binding is a mapping expressed + in the format: <path_on_host>:<path_in_container>:<cgroup_permissions>" + default: null + required: false + dns_servers: + description: + - List of custom DNS servers. + default: null + required: false + dns_search_domains: + description: + - List of custom DNS search domains. + default: null + required: false + env: + description: + - Dictionary of key,value pairs. + default: null + required: false + env_file: + version_added: "2.2" + description: + - Path to a file containing environment variables I(FOO=BAR). + - If variable also present in C(env), then C(env) value will override. + - Requires docker-py >= 1.4.0. + default: null + required: false + entrypoint: + description: + - Command that overwrites the default ENTRYPOINT of the image. + default: null + required: false + etc_hosts: + description: + - Dict of host-to-IP mappings, where each host name is a key in the dictionary. + Each host name will be added to the container's /etc/hosts file. + default: null + required: false + exposed_ports: + description: + - List of additional container ports which informs Docker that the container + listens on the specified network ports at runtime. + If the port is already exposed using EXPOSE in a Dockerfile, it does not + need to be exposed again. + default: null + required: false + aliases: + - exposed + force_kill: + description: + - Use the kill command when stopping a running container. + default: false + required: false + groups: + description: + - List of additional group names and/or IDs that the container process will run as. + default: null + required: false + hostname: + description: + - Container hostname. + default: null + required: false + ignore_image: + description: + - When C(state) is I(present) or I(started) the module compares the configuration of an existing + container to requested configuration. The evaluation includes the image version. If + the image version in the registry does not match the container, the container will be + recreated. Stop this behavior by setting C(ignore_image) to I(True). + default: false + required: false + version_added: "2.2" + image: + description: + - Repository path and tag used to create the container. If an image is not found or pull is true, the image + will be pulled from the registry. If no tag is included, 'latest' will be used. + default: null + required: false + interactive: + description: + - Keep stdin open after a container is launched, even if not attached. + default: false + required: false + ipc_mode: + description: + - Set the IPC mode for the container. Can be one of 'container:<name|id>' to reuse another + container's IPC namespace or 'host' to use the host's IPC namespace within the container. + default: null + required: false + keep_volumes: + description: + - Retain volumes associated with a removed container. + default: true + required: false + kill_signal: + description: + - Override default signal used to kill a running container. + default null: + required: false + kernel_memory: + description: + - "Kernel memory limit (format: <number>[<unit>]). Number is a positive integer. + Unit can be one of b, k, m, or g. Minimum is 4M." + default: 0 + required: false + labels: + description: + - Dictionary of key value pairs. + default: null + required: false + links: + description: + - List of name aliases for linked containers in the format C(container_name:alias) + default: null + required: false + log_driver: + description: + - Specify the logging driver. Docker uses json-file by default. + choices: + - none + - json-file + - syslog + - journald + - gelf + - fluentd + - awslogs + - splunk + default: null + required: false + log_options: + description: + - Dictionary of options specific to the chosen log_driver. See https://docs.docker.com/engine/admin/logging/overview/ + for details. + required: false + default: null + mac_address: + description: + - Container MAC address (e.g. 92:d0:c6:0a:29:33) + default: null + required: false + memory: + description: + - "Memory limit (format: <number>[<unit>]). Number is a positive integer. + Unit can be one of b, k, m, or g" + default: 0 + required: false + memory_reservation: + description: + - "Memory soft limit (format: <number>[<unit>]). Number is a positive integer. + Unit can be one of b, k, m, or g" + default: 0 + required: false + memory_swap: + description: + - Total memory limit (memory + swap, format:<number>[<unit>]). + Number is a positive integer. Unit can be one of b, k, m, or g. + default: 0 + required: false + memory_swappiness: + description: + - Tune a container's memory swappiness behavior. Accepts an integer between 0 and 100. + default: 0 + required: false + name: + description: + - Assign a name to a new container or match an existing container. + - When identifying an existing container name may be a name or a long or short container ID. + required: true + network_mode: + description: + - Connect the container to a network. + choices: + - bridge + - container:<name|id> + - host + - none + default: null + required: false + networks: + description: + - List of networks the container belongs to. + - Each network is a dict with keys C(name), C(ipv4_address), C(ipv6_address), C(links), C(aliases). + - For each network C(name) is required, all other keys are optional. + - If included, C(links) or C(aliases) are lists. + - For examples of the data structure and usage see EXAMPLES below. + - To remove a container from one or more networks, use the C(purge_networks) option. + default: null + required: false + version_added: "2.2" + oom_killer: + description: + - Whether or not to disable OOM Killer for the container. + default: false + required: false + oom_score_adj: + description: + - An integer value containing the score given to the container in order to tune OOM killer preferences. + default: 0 + required: false + version_added: "2.2" + paused: + description: + - Use with the started state to pause running processes inside the container. + default: false + required: false + pid_mode: + description: + - Set the PID namespace mode for the container. Currently only supports 'host'. + default: null + required: false + privileged: + description: + - Give extended privileges to the container. + default: false + required: false + published_ports: + description: + - List of ports to publish from the container to the host. + - "Use docker CLI syntax: C(8000), C(9000:8000), or C(0.0.0.0:9000:8000), where 8000 is a + container port, 9000 is a host port, and 0.0.0.0 is a host interface." + - Container ports must be exposed either in the Dockerfile or via the C(expose) option. + - A value of all will publish all exposed container ports to random host ports, ignoring + any other mappings. + - If C(networks) parameter is provided, will inspect each network to see if there exists + a bridge network with optional parameter com.docker.network.bridge.host_binding_ipv4. + If such a network is found, then published ports where no host IP address is specified + will be bound to the host IP pointed to by com.docker.network.bridge.host_binding_ipv4. + Note that the first bridge network with a com.docker.network.bridge.host_binding_ipv4 + value encountered in the list of C(networks) is the one that will be used. + aliases: + - ports + required: false + default: null + pull: + description: + - If true, always pull the latest version of an image. Otherwise, will only pull an image when missing. + default: false + required: false + purge_networks: + description: + - Remove the container from ALL networks not included in C(networks) parameter. + - Any default networks such as I(bridge), if not found in C(networks), will be removed as well. + default: false + required: false + version_added: "2.2" + read_only: + description: + - Mount the container's root file system as read-only. + default: false + required: false + recreate: + description: + - Use with present and started states to force the re-creation of an existing container. + default: false + required: false + restart: + description: + - Use with started state to force a matching container to be stopped and restarted. + default: false + required: false + restart_policy: + description: + - Container restart policy. Place quotes around I(no) option. + choices: + - always + - no + - on-failure + - unless-stopped + default: on-failure + required: false + restart_retries: + description: + - Use with restart policy to control maximum number of restart attempts. + default: 0 + required: false + shm_size: + description: + - Size of `/dev/shm`. The format is `<number><unit>`. `number` must be greater than `0`. + Unit is optional and can be `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). + - Omitting the unit defaults to bytes. If you omit the size entirely, the system uses `64m`. + default: null + required: false + security_opts: + description: + - List of security options in the form of C("label:user:User") + default: null + required: false + state: + description: + - 'I(absent) - A container matching the specified name will be stopped and removed. Use force_kill to kill the container + rather than stopping it. Use keep_volumes to retain volumes associated with the removed container.' + - 'I(present) - Asserts the existence of a container matching the name and any provided configuration parameters. If no + container matches the name, a container will be created. If a container matches the name but the provided configuration + does not match, the container will be updated, if it can be. If it cannot be updated, it will be removed and re-created + with the requested config. Image version will be taken into account when comparing configuration. To ignore image + version use the ignore_image option. Use the recreate option to force the re-creation of the matching container. Use + force_kill to kill the container rather than stopping it. Use keep_volumes to retain volumes associated with a removed + container.' + - 'I(started) - Asserts there is a running container matching the name and any provided configuration. If no container + matches the name, a container will be created and started. If a container matching the name is found but the + configuration does not match, the container will be updated, if it can be. If it cannot be updated, it will be removed + and a new container will be created with the requested configuration and started. Image version will be taken into + account when comparing configuration. To ignore image version use the ignore_image option. Use recreate to always + re-create a matching container, even if it is running. Use restart to force a matching container to be stopped and + restarted. Use force_kill to kill a container rather than stopping it. Use keep_volumes to retain volumes associated + with a removed container.' + - 'I(stopped) - Asserts that the container is first I(present), and then if the container is running moves it to a stopped + state. Use force_kill to kill a container rather than stopping it.' + required: false + default: started + choices: + - absent + - present + - stopped + - started + stop_signal: + description: + - Override default signal used to stop the container. + default: null + required: false + stop_timeout: + description: + - Number of seconds to wait for the container to stop before sending SIGKILL. + required: false + default: null + trust_image_content: + description: + - If true, skip image verification. + default: false + required: false + tty: + description: + - Allocate a psuedo-TTY. + default: false + required: false + ulimits: + description: + - "List of ulimit options. A ulimit is specified as C(nofile:262144:262144)" + default: null + required: false + user: + description: + - Sets the username or UID used and optionally the groupname or GID for the specified command. + - "Can be [ user | user:group | uid | uid:gid | user:gid | uid:group ]" + default: null + required: false + uts: + description: + - Set the UTS namespace mode for the container. + default: null + required: false + volumes: + description: + - List of volumes to mount within the container. + - "Use docker CLI-style syntax: C(/host:/container[:mode])" + - You can specify a read mode for the mount with either C(ro) or C(rw). + - SELinux hosts can additionally use C(z) or C(Z) to use a shared or + private label for the volume. + default: null + required: false + volume_driver: + description: + - The container volume driver. + default: none + required: false + volumes_from: + description: + - List of container names or Ids to get volumes from. + default: null + required: false +extends_documentation_fragment: + - docker + +author: + - "Cove Schneider (@cove)" + - "Joshua Conner (@joshuaconner)" + - "Pavel Antonov (@softzilla)" + - "Thomas Steinbach (@ThomasSteinbach)" + - "Philippe Jandot (@zfil)" + - "Daan Oosterveld (@dusdanig)" + - "James Tanner (@jctanner)" + - "Chris Houseknecht (@chouseknecht)" + +requirements: + - "python >= 2.6" + - "docker-py >= 1.7.0" + - "Docker API >= 1.20" +''' + +EXAMPLES = ''' +- name: Create a data container + docker_container: + name: mydata + image: busybox + volumes: + - /data + +- name: Re-create a redis container + docker_container: + name: myredis + image: redis + command: redis-server --appendonly yes + state: present + recreate: yes + exposed_ports: + - 6379 + volumes_from: + - mydata + +- name: Restart a container + docker_container: + name: myapplication + image: someuser/appimage + state: started + restart: yes + links: + - "myredis:aliasedredis" + devices: + - "/dev/sda:/dev/xvda:rwm" + ports: + - "8080:9000" + - "127.0.0.1:8081:9001/udp" + env: + SECRET_KEY: ssssh + +- name: Container present + docker_container: + name: mycontainer + state: present + image: ubuntu:14.04 + command: sleep infinity + +- name: Stop a container + docker_container: + name: mycontainer + state: stopped + +- name: Start 4 load-balanced containers + docker_container: + name: "container{{ item }}" + recreate: yes + image: someuser/anotherappimage + command: sleep 1d + with_sequence: count=4 + +- name: remove container + docker_container: + name: ohno + state: absent + +- name: Syslogging output + docker_container: + name: myservice + image: busybox + log_driver: syslog + log_options: + syslog-address: tcp://my-syslog-server:514 + syslog-facility: daemon + # NOTE: in Docker 1.13+ the "syslog-tag" option was renamed to "tag" for + # older docker installs, use "syslog-tag" instead + tag: myservice + +- name: Create db container and connect to network + docker_container: + name: db_test + image: "postgres:latest" + networks: + - name: "{{ docker_network_name }}" + +- name: Start container, connect to network and link + docker_container: + name: sleeper + image: ubuntu:14.04 + networks: + - name: TestingNet + ipv4_address: "172.1.1.100" + aliases: + - sleepyzz + links: + - db_test:db + - name: TestingNet2 + +- name: Start a container with a command + docker_container: + name: sleepy + image: ubuntu:14.04 + command: sleep infinity + +- name: Add container to networks + docker_container: + name: sleepy + networks: + - name: TestingNet + ipv4_address: 172.1.1.18 + links: + - sleeper + - name: TestingNet2 + ipv4_address: 172.1.10.20 + +- name: Update network with aliases + docker_container: + name: sleepy + networks: + - name: TestingNet + aliases: + - sleepyz + - zzzz + +- name: Remove container from one network + docker_container: + name: sleepy + networks: + - name: TestingNet2 + purge_networks: yes + +- name: Remove container from all networks + docker_container: + name: sleepy + purge_networks: yes + +''' + +RETURN = ''' +docker_container: + description: + - Before 2.3 this was 'ansible_docker_container' but was renamed due to conflicts with the connection plugin. + - Facts representing the current state of the container. Matches the docker inspection output. + - Note that facts are not part of registered vars but accessible directly. + - Empty if C(state) is I(absent) + - If detached is I(False), will include Output attribute containing any output from container run. + returned: always + type: dict + sample: '{ + "AppArmorProfile": "", + "Args": [], + "Config": { + "AttachStderr": false, + "AttachStdin": false, + "AttachStdout": false, + "Cmd": [ + "/usr/bin/supervisord" + ], + "Domainname": "", + "Entrypoint": null, + "Env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + ], + "ExposedPorts": { + "443/tcp": {}, + "80/tcp": {} + }, + "Hostname": "8e47bf643eb9", + "Image": "lnmp_nginx:v1", + "Labels": {}, + "OnBuild": null, + "OpenStdin": false, + "StdinOnce": false, + "Tty": false, + "User": "", + "Volumes": { + "/tmp/lnmp/nginx-sites/logs/": {} + }, + ... + }' +''' + +import re + +from ansible.module_utils.docker_common import * + +try: + from docker import utils + if HAS_DOCKER_PY_2: + from docker.types import Ulimit + else: + from docker.utils.types import Ulimit +except: + # missing docker-py handled in ansible.module_utils.docker + pass + + +REQUIRES_CONVERSION_TO_BYTES = [ + 'memory', + 'memory_reservation', + 'memory_swap', + 'shm_size' +] + +VOLUME_PERMISSIONS = ('rw', 'ro', 'z', 'Z') + +class TaskParameters(DockerBaseClass): + ''' + Access and parse module parameters + ''' + + def __init__(self, client): + super(TaskParameters, self).__init__() + self.client = client + + self.blkio_weight = None + self.capabilities = None + self.cleanup = None + self.command = None + self.cpu_period = None + self.cpu_quota = None + self.cpuset_cpus = None + self.cpuset_mems = None + self.cpu_shares = None + self.detach = None + self.debug = None + self.devices = None + self.dns_servers = None + self.dns_opts = None + self.dns_search_domains = None + self.env = None + self.env_file = None + self.entrypoint = None + self.etc_hosts = None + self.exposed_ports = None + self.force_kill = None + self.groups = None + self.hostname = None + self.ignore_image = None + self.image = None + self.interactive = None + self.ipc_mode = None + self.keep_volumes = None + self.kernel_memory = None + self.kill_signal = None + self.labels = None + self.links = None + self.log_driver = None + self.log_options = None + self.mac_address = None + self.memory = None + self.memory_reservation = None + self.memory_swap = None + self.memory_swappiness = None + self.name = None + self.network_mode = None + self.networks = None + self.oom_killer = None + self.oom_score_adj = None + self.paused = None + self.pid_mode = None + self.privileged = None + self.purge_networks = None + self.pull = None + self.read_only = None + self.recreate = None + self.restart = None + self.restart_retries = None + self.restart_policy = None + self.shm_size = None + self.security_opts = None + self.state = None + self.stop_signal = None + self.stop_timeout = None + self.trust_image_content = None + self.tty = None + self.user = None + self.uts = None + self.volumes = None + self.volume_binds = dict() + self.volumes_from = None + self.volume_driver = None + + for key, value in client.module.params.items(): + setattr(self, key, value) + + for param_name in REQUIRES_CONVERSION_TO_BYTES: + if client.module.params.get(param_name): + try: + setattr(self, param_name, human_to_bytes(client.module.params.get(param_name))) + except ValueError as exc: + self.fail("Failed to convert %s to bytes: %s" % (param_name, exc)) + + self.publish_all_ports = False + self.published_ports = self._parse_publish_ports() + if self.published_ports in ('all', 'ALL'): + self.publish_all_ports = True + self.published_ports = None + + self.ports = self._parse_exposed_ports(self.published_ports) + self.log("expose ports:") + self.log(self.ports, pretty_print=True) + + self.links = self._parse_links(self.links) + + if self.volumes: + self.volumes = self._expand_host_paths() + + self.env = self._get_environment() + self.ulimits = self._parse_ulimits() + self.log_config = self._parse_log_config() + self.exp_links = None + self.volume_binds = self._get_volume_binds(self.volumes) + + self.log("volumes:") + self.log(self.volumes, pretty_print=True) + self.log("volume binds:") + self.log(self.volume_binds, pretty_print=True) + + if self.networks: + for network in self.networks: + if not network.get('name'): + self.fail("Parameter error: network must have a name attribute.") + network['id'] = self._get_network_id(network['name']) + if not network['id']: + self.fail("Parameter error: network named %s could not be found. Does it exist?" % network['name']) + if network.get('links'): + network['links'] = self._parse_links(network['links']) + + def fail(self, msg): + self.client.module.fail_json(msg=msg) + + @property + def update_parameters(self): + ''' + Returns parameters used to update a container + ''' + + update_parameters = dict( + blkio_weight='blkio_weight', + cpu_period='cpu_period', + cpu_quota='cpu_quota', + cpu_shares='cpu_shares', + cpuset_cpus='cpuset_cpus', + mem_limit='memory', + mem_reservation='mem_reservation', + memswap_limit='memory_swap', + kernel_memory='kernel_memory' + ) + result = dict() + for key, value in update_parameters.items(): + if getattr(self, value, None) is not None: + result[key] = getattr(self, value) + return result + + @property + def create_parameters(self): + ''' + Returns parameters used to create a container + ''' + create_params = dict( + command='command', + hostname='hostname', + user='user', + detach='detach', + stdin_open='interactive', + tty='tty', + ports='ports', + environment='env', + name='name', + entrypoint='entrypoint', + cpu_shares='cpu_shares', + mac_address='mac_address', + labels='labels', + stop_signal='stop_signal', + volume_driver='volume_driver', + ) + + result = dict( + host_config=self._host_config(), + volumes=self._get_mounts(), + ) + + for key, value in create_params.items(): + if getattr(self, value, None) is not None: + result[key] = getattr(self, value) + return result + + def _expand_host_paths(self): + new_vols = [] + for vol in self.volumes: + if ':' in vol: + if len(vol.split(':')) == 3: + host, container, mode = vol.split(':') + if re.match(r'[\.~]', host): + host = os.path.abspath(host) + new_vols.append("%s:%s:%s" % (host, container, mode)) + continue + elif len(vol.split(':')) == 2: + parts = vol.split(':') + if parts[1] not in VOLUME_PERMISSIONS and re.match(r'[\.~]', parts[0]): + host = os.path.abspath(parts[0]) + new_vols.append("%s:%s:rw" % (host, parts[1])) + continue + new_vols.append(vol) + return new_vols + + def _get_mounts(self): + ''' + Return a list of container mounts. + :return: + ''' + result = [] + if self.volumes: + for vol in self.volumes: + if ':' in vol: + if len(vol.split(':')) == 3: + host, container, _ = vol.split(':') + result.append(container) + continue + if len(vol.split(':')) == 2: + parts = vol.split(':') + if parts[1] not in VOLUME_PERMISSIONS: + result.append(parts[1]) + continue + result.append(vol) + self.log("mounts:") + self.log(result, pretty_print=True) + return result + + def _host_config(self): + ''' + Returns parameters used to create a HostConfig object + ''' + + host_config_params=dict( + port_bindings='published_ports', + publish_all_ports='publish_all_ports', + links='links', + privileged='privileged', + dns='dns_servers', + dns_search='dns_search_domains', + binds='volume_binds', + volumes_from='volumes_from', + network_mode='network_mode', + cap_add='capabilities', + extra_hosts='etc_hosts', + read_only='read_only', + ipc_mode='ipc_mode', + security_opt='security_opts', + ulimits='ulimits', + log_config='log_config', + mem_limit='memory', + memswap_limit='memory_swap', + mem_swappiness='memory_swappiness', + oom_score_adj='oom_score_adj', + shm_size='shm_size', + group_add='groups', + devices='devices', + pid_mode='pid_mode' + ) + params = dict() + for key, value in host_config_params.items(): + if getattr(self, value, None) is not None: + params[key] = getattr(self, value) + + if self.restart_policy: + params['restart_policy'] = dict(Name=self.restart_policy, + MaximumRetryCount=self.restart_retries) + + return self.client.create_host_config(**params) + + @property + def default_host_ip(self): + ip = '0.0.0.0' + if not self.networks: + return ip + for net in self.networks: + if net.get('name'): + network = self.client.inspect_network(net['name']) + if network.get('Driver') == 'bridge' and \ + network.get('Options', {}).get('com.docker.network.bridge.host_binding_ipv4'): + ip = network['Options']['com.docker.network.bridge.host_binding_ipv4'] + break + return ip + + def _parse_publish_ports(self): + ''' + Parse ports from docker CLI syntax + ''' + if self.published_ports is None: + return None + + if 'all' in self.published_ports: + return 'all' + + default_ip = self.default_host_ip + + binds = {} + for port in self.published_ports: + parts = str(port).split(':') + container_port = parts[-1] + if '/' not in container_port: + container_port = int(parts[-1]) + + p_len = len(parts) + if p_len == 1: + bind = (default_ip,) + elif p_len == 2: + bind = (default_ip, int(parts[0])) + elif p_len == 3: + bind = (parts[0], int(parts[1])) if parts[1] else (parts[0],) + + if container_port in binds: + old_bind = binds[container_port] + if isinstance(old_bind, list): + old_bind.append(bind) + else: + binds[container_port] = [binds[container_port], bind] + else: + binds[container_port] = bind + return binds + + @staticmethod + def _get_volume_binds(volumes): + ''' + Extract host bindings, if any, from list of volume mapping strings. + + :return: dictionary of bind mappings + ''' + result = dict() + if volumes: + for vol in volumes: + host = None + if ':' in vol: + if len(vol.split(':')) == 3: + host, container, mode = vol.split(':') + if len(vol.split(':')) == 2: + parts = vol.split(':') + if parts[1] not in VOLUME_PERMISSIONS: + host, container, mode = (vol.split(':') + ['rw']) + if host is not None: + result[host] = dict( + bind=container, + mode=mode + ) + return result + + def _parse_exposed_ports(self, published_ports): + ''' + Parse exposed ports from docker CLI-style ports syntax. + ''' + exposed = [] + if self.exposed_ports: + for port in self.exposed_ports: + port = str(port).strip() + protocol = 'tcp' + match = re.search(r'(/.+$)', port) + if match: + protocol = match.group(1).replace('/', '') + port = re.sub(r'/.+$', '', port) + exposed.append((port, protocol)) + if published_ports: + # Any published port should also be exposed + for publish_port in published_ports: + match = False + if isinstance(publish_port, basestring) and '/' in publish_port: + port, protocol = publish_port.split('/') + port = int(port) + else: + protocol = 'tcp' + port = int(publish_port) + for exposed_port in exposed: + if isinstance(exposed_port[0], basestring) and '-' in exposed_port[0]: + start_port, end_port = exposed_port[0].split('-') + if int(start_port) <= port <= int(end_port): + match = True + elif exposed_port[0] == port: + match = True + if not match: + exposed.append((port, protocol)) + return exposed + + @staticmethod + def _parse_links(links): + ''' + Turn links into a dictionary + ''' + if links is None: + return None + + result = {} + for link in links: + parsed_link = link.split(':', 1) + if len(parsed_link) == 2: + result[parsed_link[0]] = parsed_link[1] + else: + result[parsed_link[0]] = parsed_link[0] + return result + + def _parse_ulimits(self): + ''' + Turn ulimits into an array of Ulimit objects + ''' + if self.ulimits is None: + return None + + results = [] + for limit in self.ulimits: + limits = dict() + pieces = limit.split(':') + if len(pieces) >= 2: + limits['name'] = pieces[0] + limits['soft'] = int(pieces[1]) + limits['hard'] = int(pieces[1]) + if len(pieces) == 3: + limits['hard'] = int(pieces[2]) + try: + results.append(Ulimit(**limits)) + except ValueError as exc: + self.fail("Error parsing ulimits value %s - %s" % (limit, exc)) + return results + + def _parse_log_config(self): + ''' + Create a LogConfig object + ''' + if self.log_driver is None: + return None + + options = dict( + Type=self.log_driver, + Config = dict() + ) + + if self.log_options is not None: + options['Config'] = self.log_options + + try: + return LogConfig(**options) + except ValueError as exc: + self.fail('Error parsing logging options - %s' % (exc)) + + def _get_environment(self): + """ + If environment file is combined with explicit environment variables, the explicit environment variables + take precedence. + """ + final_env = {} + if self.env_file: + parsed_env_file = utils.parse_env_file(self.env_file) + for name, value in parsed_env_file.items(): + final_env[name] = str(value) + if self.env: + for name, value in self.env.items(): + final_env[name] = str(value) + return final_env + + def _get_network_id(self, network_name): + network_id = None + try: + for network in self.client.networks(names=[network_name]): + if network['Name'] == network_name: + network_id = network['Id'] + break + except Exception as exc: + self.fail("Error getting network id for %s - %s" % (network_name, str(exc))) + return network_id + + + +class Container(DockerBaseClass): + + def __init__(self, container, parameters): + super(Container, self).__init__() + self.raw = container + self.Id = None + self.container = container + if container: + self.Id = container['Id'] + self.Image = container['Image'] + self.log(self.container, pretty_print=True) + self.parameters = parameters + self.parameters.expected_links = None + self.parameters.expected_ports = None + self.parameters.expected_exposed = None + self.parameters.expected_volumes = None + self.parameters.expected_ulimits = None + self.parameters.expected_etc_hosts = None + self.parameters.expected_env = None + + def fail(self, msg): + self.parameters.client.module.fail_json(msg=msg) + + @property + def exists(self): + return True if self.container else False + + @property + def running(self): + if self.container and self.container.get('State'): + if self.container['State'].get('Running') and not self.container['State'].get('Ghost', False): + return True + return False + + def has_different_configuration(self, image): + ''' + Diff parameters vs existing container config. Returns tuple: (True | False, List of differences) + ''' + self.log('Starting has_different_configuration') + self.parameters.expected_entrypoint = self._get_expected_entrypoint() + self.parameters.expected_links = self._get_expected_links() + self.parameters.expected_ports = self._get_expected_ports() + self.parameters.expected_exposed = self._get_expected_exposed(image) + self.parameters.expected_volumes = self._get_expected_volumes(image) + self.parameters.expected_binds = self._get_expected_binds(image) + self.parameters.expected_ulimits = self._get_expected_ulimits(self.parameters.ulimits) + self.parameters.expected_etc_hosts = self._convert_simple_dict_to_list('etc_hosts') + self.parameters.expected_env = self._get_expected_env(image) + self.parameters.expected_cmd = self._get_expected_cmd() + self.parameters.expected_devices = self._get_expected_devices() + + if not self.container.get('HostConfig'): + self.fail("has_config_diff: Error parsing container properties. HostConfig missing.") + if not self.container.get('Config'): + self.fail("has_config_diff: Error parsing container properties. Config missing.") + if not self.container.get('NetworkSettings'): + self.fail("has_config_diff: Error parsing container properties. NetworkSettings missing.") + + host_config = self.container['HostConfig'] + log_config = host_config.get('LogConfig', dict()) + restart_policy = host_config.get('RestartPolicy', dict()) + config = self.container['Config'] + network = self.container['NetworkSettings'] + + # The previous version of the docker module ignored the detach state by + # assuming if the container was running, it must have been detached. + detach = not (config.get('AttachStderr') and config.get('AttachStdout')) + + # "ExposedPorts": null returns None type & causes AttributeError - PR #5517 + if config.get('ExposedPorts') is not None: + expected_exposed = [re.sub(r'/.+$', '', p) for p in config.get('ExposedPorts', dict()).keys()] + else: + expected_exposed = [] + + # Map parameters to container inspect results + config_mapping = dict( + image=config.get('Image'), + expected_cmd=config.get('Cmd'), + hostname=config.get('Hostname'), + user=config.get('User'), + detach=detach, + interactive=config.get('OpenStdin'), + capabilities=host_config.get('CapAdd'), + expected_devices=host_config.get('Devices'), + dns_servers=host_config.get('Dns'), + dns_opts=host_config.get('DnsOptions'), + dns_search_domains=host_config.get('DnsSearch'), + expected_env=(config.get('Env') or []), + expected_entrypoint=config.get('Entrypoint'), + expected_etc_hosts=host_config['ExtraHosts'], + expected_exposed=expected_exposed, + groups=host_config.get('GroupAdd'), + ipc_mode=host_config.get("IpcMode"), + labels=config.get('Labels'), + expected_links=host_config.get('Links'), + log_driver=log_config.get('Type'), + log_options=log_config.get('Config'), + mac_address=network.get('MacAddress'), + memory_swappiness=host_config.get('MemorySwappiness'), + network_mode=host_config.get('NetworkMode'), + oom_killer=host_config.get('OomKillDisable'), + oom_score_adj=host_config.get('OomScoreAdj'), + pid_mode=host_config.get('PidMode'), + privileged=host_config.get('Privileged'), + expected_ports=host_config.get('PortBindings'), + read_only=host_config.get('ReadonlyRootfs'), + restart_policy=restart_policy.get('Name'), + restart_retries=restart_policy.get('MaximumRetryCount'), + # Cannot test shm_size, as shm_size is not included in container inspection results. + # shm_size=host_config.get('ShmSize'), + security_opts=host_config.get("SecuriytOpt"), + stop_signal=config.get("StopSignal"), + tty=config.get('Tty'), + expected_ulimits=host_config.get('Ulimits'), + uts=host_config.get('UTSMode'), + expected_volumes=config.get('Volumes'), + expected_binds=host_config.get('Binds'), + volumes_from=host_config.get('VolumesFrom'), + volume_driver=host_config.get('VolumeDriver') + ) + + differences = [] + for key, value in config_mapping.items(): + self.log('check differences %s %s vs %s' % (key, getattr(self.parameters, key), str(value))) + if getattr(self.parameters, key, None) is not None: + if isinstance(getattr(self.parameters, key), list) and isinstance(value, list): + if len(getattr(self.parameters, key)) > 0 and isinstance(getattr(self.parameters, key)[0], dict): + # compare list of dictionaries + self.log("comparing list of dict: %s" % key) + match = self._compare_dictionary_lists(getattr(self.parameters, key), value) + else: + # compare two lists. Is list_a in list_b? + self.log("comparing lists: %s" % key) + set_a = set(getattr(self.parameters, key)) + set_b = set(value) + match = (set_a <= set_b) + elif isinstance(getattr(self.parameters, key), dict) and isinstance(value, dict): + # compare two dicts + self.log("comparing two dicts: %s" % key) + match = self._compare_dicts(getattr(self.parameters, key), value) + else: + # primitive compare + self.log("primitive compare: %s" % key) + match = (getattr(self.parameters, key) == value) + + if not match: + # no match. record the differences + item = dict() + item[key] = dict( + parameter=getattr(self.parameters, key), + container=value + ) + differences.append(item) + + has_differences = True if len(differences) > 0 else False + return has_differences, differences + + def _compare_dictionary_lists(self, list_a, list_b): + ''' + If all of list_a exists in list_b, return True + ''' + if not isinstance(list_a, list) or not isinstance(list_b, list): + return False + matches = 0 + for dict_a in list_a: + for dict_b in list_b: + if self._compare_dicts(dict_a, dict_b): + matches += 1 + break + result = (matches == len(list_a)) + return result + + def _compare_dicts(self, dict_a, dict_b): + ''' + If dict_a in dict_b, return True + ''' + if not isinstance(dict_a, dict) or not isinstance(dict_b, dict): + return False + for key, value in dict_a.items(): + if isinstance(value, dict): + match = self._compare_dicts(value, dict_b.get(key)) + elif isinstance(value, list): + if len(value) > 0 and isinstance(value[0], dict): + match = self._compare_dictionary_lists(value, dict_b.get(key)) + else: + set_a = set(value) + set_b = set(dict_b.get(key)) + match = (set_a == set_b) + else: + match = (value == dict_b.get(key)) + if not match: + return False + return True + + def has_different_resource_limits(self): + ''' + Diff parameters and container resource limits + ''' + if not self.container.get('HostConfig'): + self.fail("limits_differ_from_container: Error parsing container properties. HostConfig missing.") + + host_config = self.container['HostConfig'] + + config_mapping = dict( + cpu_period=host_config.get('CpuPeriod'), + cpu_quota=host_config.get('CpuQuota'), + cpuset_cpus=host_config.get('CpusetCpus'), + cpuset_mems=host_config.get('CpusetMems'), + cpu_shares=host_config.get('CpuShares'), + kernel_memory=host_config.get("KernelMemory"), + memory=host_config.get('Memory'), + memory_reservation=host_config.get('MemoryReservation'), + memory_swap=host_config.get('MemorySwap'), + oom_score_adj=host_config.get('OomScoreAdj'), + ) + + differences = [] + for key, value in config_mapping.items(): + if getattr(self.parameters, key, None) and getattr(self.parameters, key) != value: + # no match. record the differences + item = dict() + item[key] = dict( + parameter=getattr(self.parameters, key), + container=value + ) + differences.append(item) + different = (len(differences) > 0) + return different, differences + + def has_network_differences(self): + ''' + Check if the container is connected to requested networks with expected options: links, aliases, ipv4, ipv6 + ''' + different = False + differences = [] + + if not self.parameters.networks: + return different, differences + + if not self.container.get('NetworkSettings'): + self.fail("has_missing_networks: Error parsing container properties. NetworkSettings missing.") + + connected_networks = self.container['NetworkSettings']['Networks'] + for network in self.parameters.networks: + if connected_networks.get(network['name'], None) is None: + different = True + differences.append(dict( + parameter=network, + container=None + )) + else: + diff = False + if network.get('ipv4_address') and network['ipv4_address'] != connected_networks[network['name']].get('IPAddress'): + diff = True + if network.get('ipv6_address') and network['ipv6_address'] != connected_networks[network['name']].get('GlobalIPv6Address'): + diff = True + if network.get('aliases') and not connected_networks[network['name']].get('Aliases'): + diff = True + if network.get('aliases') and connected_networks[network['name']].get('Aliases'): + for alias in network.get('aliases'): + if alias not in connected_networks[network['name']].get('Aliases', []): + diff = True + if network.get('links') and not connected_networks[network['name']].get('Links'): + diff = True + if network.get('links') and connected_networks[network['name']].get('Links'): + expected_links = [] + for link, alias in network['links'].items(): + expected_links.append("%s:%s" % (link, alias)) + for link in expected_links: + if link not in connected_networks[network['name']].get('Links', []): + diff = True + if diff: + different = True + differences.append(dict( + parameter=network, + container=dict( + name=network['name'], + ipv4_address=connected_networks[network['name']].get('IPAddress'), + ipv6_address=connected_networks[network['name']].get('GlobalIPv6Address'), + aliases=connected_networks[network['name']].get('Aliases'), + links=connected_networks[network['name']].get('Links') + ) + )) + return different, differences + + def has_extra_networks(self): + ''' + Check if the container is connected to non-requested networks + ''' + extra_networks = [] + extra = False + + if not self.container.get('NetworkSettings'): + self.fail("has_extra_networks: Error parsing container properties. NetworkSettings missing.") + + connected_networks = self.container['NetworkSettings'].get('Networks') + if connected_networks: + for network, network_config in connected_networks.items(): + keep = False + if self.parameters.networks: + for expected_network in self.parameters.networks: + if expected_network['name'] == network: + keep = True + if not keep: + extra = True + extra_networks.append(dict(name=network, id=network_config['NetworkID'])) + return extra, extra_networks + + def _get_expected_devices(self): + if not self.parameters.devices: + return None + expected_devices = [] + for device in self.parameters.devices: + parts = device.split(':') + if len(parts) == 1: + expected_devices.append( + dict( + CgroupPermissions='rwm', + PathInContainer=parts[0], + PathOnHost=parts[0] + )) + elif len(parts) == 2: + parts = device.split(':') + expected_devices.append( + dict( + CgroupPermissions='rwm', + PathInContainer=parts[1], + PathOnHost=parts[0] + ) + ) + else: + expected_devices.append( + dict( + CgroupPermissions=parts[2], + PathInContainer=parts[1], + PathOnHost=parts[0] + )) + return expected_devices + + def _get_expected_entrypoint(self): + self.log('_get_expected_entrypoint') + if not self.parameters.entrypoint: + return None + return shlex.split(self.parameters.entrypoint) + + def _get_expected_ports(self): + if not self.parameters.published_ports: + return None + expected_bound_ports = {} + for container_port, config in self.parameters.published_ports.items(): + if isinstance(container_port, int): + container_port = "%s/tcp" % container_port + if len(config) == 1: + expected_bound_ports[container_port] = [{'HostIp': "0.0.0.0", 'HostPort': ""}] + elif isinstance(config[0], tuple): + expected_bound_ports[container_port] = [] + for host_ip, host_port in config: + expected_bound_ports[container_port].append({'HostIp': host_ip, 'HostPort': str(host_port)}) + else: + expected_bound_ports[container_port] = [{'HostIp': config[0], 'HostPort': str(config[1])}] + return expected_bound_ports + + def _get_expected_links(self): + if self.parameters.links is None: + return None + self.log('parameter links:') + self.log(self.parameters.links, pretty_print=True) + exp_links = [] + for link, alias in self.parameters.links.items(): + exp_links.append("/%s:%s/%s" % (link, ('/' + self.parameters.name), alias)) + return exp_links + + def _get_expected_binds(self, image): + self.log('_get_expected_binds') + image_vols = [] + if image: + image_vols = self._get_image_binds(image['ContainerConfig'].get('Volumes')) + param_vols = [] + if self.parameters.volumes: + for vol in self.parameters.volumes: + host = None + if ':' in vol: + if len(vol.split(':')) == 3: + host, container, mode = vol.split(':') + if len(vol.split(':')) == 2: + parts = vol.split(':') + if parts[1] not in VOLUME_PERMISSIONS: + host, container, mode = vol.split(':') + ['rw'] + if host: + param_vols.append("%s:%s:%s" % (host, container, mode)) + result = list(set(image_vols + param_vols)) + self.log("expected_binds:") + self.log(result, pretty_print=True) + return result + + def _get_image_binds(self, volumes): + ''' + Convert array of binds to array of strings with format host_path:container_path:mode + + :param volumes: array of bind dicts + :return: array of strings + ''' + results = [] + if isinstance(volumes, dict): + results += self._get_bind_from_dict(volumes) + elif isinstance(volumes, list): + for vol in volumes: + results += self._get_bind_from_dict(vol) + return results + + @staticmethod + def _get_bind_from_dict(volume_dict): + results = [] + if volume_dict: + for host_path, config in volume_dict.items(): + if isinstance(config, dict) and config.get('bind'): + container_path = config.get('bind') + mode = config.get('mode', 'rw') + results.append("%s:%s:%s" % (host_path, container_path, mode)) + return results + + def _get_expected_volumes(self, image): + self.log('_get_expected_volumes') + expected_vols = dict() + if image and image['ContainerConfig'].get('Volumes'): + expected_vols.update(image['ContainerConfig'].get('Volumes')) + + if self.parameters.volumes: + for vol in self.parameters.volumes: + container = None + if ':' in vol: + if len(vol.split(':')) == 3: + host, container, mode = vol.split(':') + if len(vol.split(':')) == 2: + parts = vol.split(':') + if parts[1] not in VOLUME_PERMISSIONS: + host, container, mode = vol.split(':') + ['rw'] + new_vol = dict() + if container: + new_vol[container] = dict() + else: + new_vol[vol] = dict() + expected_vols.update(new_vol) + + if not expected_vols: + expected_vols = None + self.log("expected_volumes:") + self.log(expected_vols, pretty_print=True) + return expected_vols + + def _get_expected_env(self, image): + self.log('_get_expected_env') + expected_env = dict() + if image and image['ContainerConfig'].get('Env'): + for env_var in image['ContainerConfig']['Env']: + parts = env_var.split('=', 1) + expected_env[parts[0]] = parts[1] + if self.parameters.env: + expected_env.update(self.parameters.env) + param_env = [] + for key, value in expected_env.items(): + param_env.append("%s=%s" % (key, value)) + return param_env + + def _get_expected_exposed(self, image): + self.log('_get_expected_exposed') + image_ports = [] + if image: + image_ports = [re.sub(r'/.+$', '', p) for p in (image['ContainerConfig'].get('ExposedPorts') or {}).keys()] + param_ports = [] + if self.parameters.ports: + param_ports = [str(p[0]) for p in self.parameters.ports] + result = list(set(image_ports + param_ports)) + self.log(result, pretty_print=True) + return result + + def _get_expected_ulimits(self, config_ulimits): + self.log('_get_expected_ulimits') + if config_ulimits is None: + return None + results = [] + for limit in config_ulimits: + results.append(dict( + Name=limit.name, + Soft=limit.soft, + Hard=limit.hard + )) + return results + + def _get_expected_cmd(self): + self.log('_get_expected_cmd') + if not self.parameters.command: + return None + return shlex.split(self.parameters.command) + + def _convert_simple_dict_to_list(self, param_name, join_with=':'): + if getattr(self.parameters, param_name, None) is None: + return None + results = [] + for key, value in getattr(self.parameters, param_name).items(): + results.append("%s%s%s" % (key, join_with, value)) + return results + + +class ContainerManager(DockerBaseClass): + ''' + Perform container management tasks + ''' + + def __init__(self, client): + + super(ContainerManager, self).__init__() + + self.client = client + self.parameters = TaskParameters(client) + self.check_mode = self.client.check_mode + self.results = {'changed': False, 'actions': []} + self.diff = {} + self.facts = {} + + state = self.parameters.state + if state in ('stopped', 'started', 'present'): + self.present(state) + elif state == 'absent': + self.absent() + + if not self.check_mode and not self.parameters.debug: + self.results.pop('actions') + + if self.client.module._diff or self.parameters.debug: + self.results['diff'] = self.diff + + if self.facts: + self.results['ansible_facts'] = {'docker_container': self.facts} + + def present(self, state): + container = self._get_container(self.parameters.name) + image = self._get_image() + + if not container.exists: + # New container + self.log('No container found') + new_container = self.container_create(self.parameters.image, self.parameters.create_parameters) + if new_container: + container = new_container + else: + # Existing container + different, differences = container.has_different_configuration(image) + image_different = False + if not self.parameters.ignore_image: + image_different = self._image_is_different(image, container) + if image_different or different or self.parameters.recreate: + self.diff['differences'] = differences + if image_different: + self.diff['image_different'] = True + self.log("differences") + self.log(differences, pretty_print=True) + if container.running: + self.container_stop(container.Id) + self.container_remove(container.Id) + new_container = self.container_create(self.parameters.image, self.parameters.create_parameters) + if new_container: + container = new_container + + if container and container.exists: + container = self.update_limits(container) + container = self.update_networks(container) + + if state == 'started' and not container.running: + container = self.container_start(container.Id) + elif state == 'started' and self.parameters.restart: + self.container_stop(container.Id) + container = self.container_start(container.Id) + elif state == 'stopped' and container.running: + self.container_stop(container.Id) + container = self._get_container(container.Id) + + self.facts = container.raw + + def absent(self): + container = self._get_container(self.parameters.name) + if container.exists: + if container.running: + self.container_stop(container.Id) + self.container_remove(container.Id) + + def fail(self, msg, **kwargs): + self.client.module.fail_json(msg=msg, **kwargs) + + def _get_container(self, container): + ''' + Expects container ID or Name. Returns a container object + ''' + return Container(self.client.get_container(container), self.parameters) + + def _get_image(self): + if not self.parameters.image: + self.log('No image specified') + return None + repository, tag = utils.parse_repository_tag(self.parameters.image) + if not tag: + tag = "latest" + image = self.client.find_image(repository, tag) + if not self.check_mode: + if not image or self.parameters.pull: + self.log("Pull the image.") + image, alreadyToLatest = self.client.pull_image(repository, tag) + if alreadyToLatest: + self.results['changed'] = False + else: + self.results['changed'] = True + self.results['actions'].append(dict(pulled_image="%s:%s" % (repository, tag))) + self.log("image") + self.log(image, pretty_print=True) + return image + + def _image_is_different(self, image, container): + if image and image.get('Id'): + if container and container.Image: + if image.get('Id') != container.Image: + return True + return False + + def update_limits(self, container): + limits_differ, different_limits = container.has_different_resource_limits() + if limits_differ: + self.log("limit differences:") + self.log(different_limits, pretty_print=True) + if limits_differ and not self.check_mode: + self.container_update(container.Id, self.parameters.update_parameters) + return self._get_container(container.Id) + return container + + def update_networks(self, container): + has_network_differences, network_differences = container.has_network_differences() + updated_container = container + if has_network_differences: + if self.diff.get('differences'): + self.diff['differences'].append(dict(network_differences=network_differences)) + else: + self.diff['differences'] = [dict(network_differences=network_differences)] + self.results['changed'] = True + updated_container = self._add_networks(container, network_differences) + + if self.parameters.purge_networks: + has_extra_networks, extra_networks = container.has_extra_networks() + if has_extra_networks: + if self.diff.get('differences'): + self.diff['differences'].append(dict(purge_networks=extra_networks)) + else: + self.diff['differences'] = [dict(purge_networks=extra_networks)] + self.results['changed'] = True + updated_container = self._purge_networks(container, extra_networks) + return updated_container + + def _add_networks(self, container, differences): + for diff in differences: + # remove the container from the network, if connected + if diff.get('container'): + self.results['actions'].append(dict(removed_from_network=diff['parameter']['name'])) + if not self.check_mode: + try: + self.client.disconnect_container_from_network(container.Id, diff['parameter']['id']) + except Exception as exc: + self.fail("Error disconnecting container from network %s - %s" % (diff['parameter']['name'], + str(exc))) + # connect to the network + params = dict( + ipv4_address=diff['parameter'].get('ipv4_address', None), + ipv6_address=diff['parameter'].get('ipv6_address', None), + links=diff['parameter'].get('links', None), + aliases=diff['parameter'].get('aliases', None) + ) + self.results['actions'].append(dict(added_to_network=diff['parameter']['name'], network_parameters=params)) + if not self.check_mode: + try: + self.log("Connecting container to network %s" % diff['parameter']['id']) + self.log(params, pretty_print=True) + self.client.connect_container_to_network(container.Id, diff['parameter']['id'], **params) + except Exception as exc: + self.fail("Error connecting container to network %s - %s" % (diff['parameter']['name'], str(exc))) + return self._get_container(container.Id) + + def _purge_networks(self, container, networks): + for network in networks: + self.results['actions'].append(dict(removed_from_network=network['name'])) + if not self.check_mode: + try: + self.client.disconnect_container_from_network(container.Id, network['name']) + except Exception as exc: + self.fail("Error disconnecting container from network %s - %s" % (network['name'], + str(exc))) + return self._get_container(container.Id) + + def container_create(self, image, create_parameters): + self.log("create container") + self.log("image: %s parameters:" % image) + self.log(create_parameters, pretty_print=True) + self.results['actions'].append(dict(created="Created container", create_parameters=create_parameters)) + self.results['changed'] = True + new_container = None + if not self.check_mode: + try: + new_container = self.client.create_container(image, **create_parameters) + except Exception as exc: + self.fail("Error creating container: %s" % str(exc)) + return self._get_container(new_container['Id']) + return new_container + + def container_start(self, container_id): + self.log("start container %s" % (container_id)) + self.results['actions'].append(dict(started=container_id)) + self.results['changed'] = True + if not self.check_mode: + try: + self.client.start(container=container_id) + except Exception as exc: + self.fail("Error starting container %s: %s" % (container_id, str(exc))) + + if not self.parameters.detach: + status = self.client.wait(container_id) + output = self.client.logs(container_id, stdout=True, stderr=True, stream=False, timestamps=False) + if status != 0: + self.fail(output, status=status) + if self.parameters.cleanup: + self.container_remove(container_id, force=True) + insp = self._get_container(container_id) + if insp.raw: + insp.raw['Output'] = output + else: + insp.raw = dict(Output=output) + return insp + return self._get_container(container_id) + + def container_remove(self, container_id, link=False, force=False): + volume_state = (not self.parameters.keep_volumes) + self.log("remove container container:%s v:%s link:%s force%s" % (container_id, volume_state, link, force)) + self.results['actions'].append(dict(removed=container_id, volume_state=volume_state, link=link, force=force)) + self.results['changed'] = True + response = None + if not self.check_mode: + try: + response = self.client.remove_container(container_id, v=volume_state, link=link, force=force) + except Exception as exc: + self.fail("Error removing container %s: %s" % (container_id, str(exc))) + return response + + def container_update(self, container_id, update_parameters): + if update_parameters: + self.log("update container %s" % (container_id)) + self.log(update_parameters, pretty_print=True) + self.results['actions'].append(dict(updated=container_id, update_parameters=update_parameters)) + self.results['changed'] = True + if not self.check_mode and callable(getattr(self.client, 'update_container')): + try: + self.client.update_container(container_id, **update_parameters) + except Exception as exc: + self.fail("Error updating container %s: %s" % (container_id, str(exc))) + return self._get_container(container_id) + + def container_kill(self, container_id): + self.results['actions'].append(dict(killed=container_id, signal=self.parameters.kill_signal)) + self.results['changed'] = True + response = None + if not self.check_mode: + try: + if self.parameters.kill_signal: + response = self.client.kill(container_id, signal=self.parameters.kill_signal) + else: + response = self.client.kill(container_id) + except Exception as exc: + self.fail("Error killing container %s: %s" % (container_id, exc)) + return response + + def container_stop(self, container_id): + if self.parameters.force_kill: + self.container_kill(container_id) + return + self.results['actions'].append(dict(stopped=container_id, timeout=self.parameters.stop_timeout)) + self.results['changed'] = True + response = None + if not self.check_mode: + try: + if self.parameters.stop_timeout: + response = self.client.stop(container_id, timeout=self.parameters.stop_timeout) + else: + response = self.client.stop(container_id) + except Exception as exc: + self.fail("Error stopping container %s: %s" % (container_id, str(exc))) + return response + + +def main(): + argument_spec = dict( + blkio_weight=dict(type='int'), + capabilities=dict(type='list'), + cleanup=dict(type='bool', default=False), + command=dict(type='str'), + cpu_period=dict(type='int'), + cpu_quota=dict(type='int'), + cpuset_cpus=dict(type='str'), + cpuset_mems=dict(type='str'), + cpu_shares=dict(type='int'), + detach=dict(type='bool', default=True), + devices=dict(type='list'), + dns_servers=dict(type='list'), + dns_opts=dict(type='list'), + dns_search_domains=dict(type='list'), + env=dict(type='dict'), + env_file=dict(type='path'), + entrypoint=dict(type='str'), + etc_hosts=dict(type='dict'), + exposed_ports=dict(type='list', aliases=['exposed', 'expose']), + force_kill=dict(type='bool', default=False, aliases=['forcekill']), + groups=dict(type='list'), + hostname=dict(type='str'), + ignore_image=dict(type='bool', default=False), + image=dict(type='str'), + interactive=dict(type='bool', default=False), + ipc_mode=dict(type='str'), + keep_volumes=dict(type='bool', default=True), + kernel_memory=dict(type='str'), + kill_signal=dict(type='str'), + labels=dict(type='dict'), + links=dict(type='list'), + log_driver=dict(type='str', + choices=['none', 'json-file', 'syslog', 'journald', 'gelf', 'fluentd', 'awslogs', 'splunk'], + default=None), + log_options=dict(type='dict', aliases=['log_opt']), + mac_address=dict(type='str'), + memory=dict(type='str', default='0'), + memory_reservation=dict(type='str'), + memory_swap=dict(type='str'), + memory_swappiness=dict(type='int'), + name=dict(type='str', required=True), + network_mode=dict(type='str'), + networks=dict(type='list'), + oom_killer=dict(type='bool'), + oom_score_adj=dict(type='int'), + paused=dict(type='bool', default=False), + pid_mode=dict(type='str'), + privileged=dict(type='bool', default=False), + published_ports=dict(type='list', aliases=['ports']), + pull=dict(type='bool', default=False), + purge_networks=dict(type='bool', default=False), + read_only=dict(type='bool', default=False), + recreate=dict(type='bool', default=False), + restart=dict(type='bool', default=False), + restart_policy=dict(type='str', choices=['no', 'on-failure', 'always', 'unless-stopped']), + restart_retries=dict(type='int', default=None), + shm_size=dict(type='str'), + security_opts=dict(type='list'), + state=dict(type='str', choices=['absent', 'present', 'started', 'stopped'], default='started'), + stop_signal=dict(type='str'), + stop_timeout=dict(type='int'), + trust_image_content=dict(type='bool', default=False), + tty=dict(type='bool', default=False), + ulimits=dict(type='list'), + user=dict(type='str'), + uts=dict(type='str'), + volumes=dict(type='list'), + volumes_from=dict(type='list'), + volume_driver=dict(type='str'), + ) + + required_if = [ + ('state', 'present', ['image']) + ] + + client = AnsibleDockerClient( + argument_spec=argument_spec, + required_if=required_if, + supports_check_mode=True + ) + + cm = ContainerManager(client) + client.module.exit_json(**cm.results) + +# import module snippets +from ansible.module_utils.basic import * + +if __name__ == '__main__': + main()
\ No newline at end of file diff --git a/roles/openshift_health_checker/library/docker_info.py b/roles/openshift_health_checker/library/docker_info.py new file mode 100644 index 000000000..0d0ddae8b --- /dev/null +++ b/roles/openshift_health_checker/library/docker_info.py @@ -0,0 +1,24 @@ +""" +Ansible module for determining information about the docker host. + +While there are several ansible modules that make use of the docker +api to expose container and image facts in a remote host, they +are unable to return specific information about the host machine +itself. This module exposes the same information obtained through +executing the `docker info` command on a docker host, in json format. +""" + +from ansible.module_utils.docker_common import AnsibleDockerClient + + +def main(): + """Entrypoint for running an Ansible module.""" + client = AnsibleDockerClient() + + client.module.exit_json( + info=client.info(), + ) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/etcdkeysize.py b/roles/openshift_health_checker/library/etcdkeysize.py new file mode 100644 index 000000000..620e82d87 --- /dev/null +++ b/roles/openshift_health_checker/library/etcdkeysize.py @@ -0,0 +1,122 @@ +#!/usr/bin/python +"""Ansible module that recursively determines if the size of a key in an etcd cluster exceeds a given limit.""" + +from ansible.module_utils.basic import AnsibleModule + + +try: + import etcd + + IMPORT_EXCEPTION_MSG = None +except ImportError as err: + IMPORT_EXCEPTION_MSG = str(err) + + from collections import namedtuple + EtcdMock = namedtuple("etcd", ["EtcdKeyNotFound"]) + etcd = EtcdMock(KeyError) + + +# pylint: disable=too-many-arguments +def check_etcd_key_size(client, key, size_limit, total_size=0, depth=0, depth_limit=1000, visited=None): + """Check size of an etcd path starting at given key. Returns tuple (string, bool)""" + if visited is None: + visited = set() + + if key in visited: + return 0, False + + visited.add(key) + + try: + result = client.read(key, recursive=False) + except etcd.EtcdKeyNotFound: + return 0, False + + size = 0 + limit_exceeded = False + + for node in result.leaves: + if depth >= depth_limit: + raise Exception("Maximum recursive stack depth ({}) exceeded.".format(depth_limit)) + + if size_limit and total_size + size > size_limit: + return size, True + + if not node.dir: + size += len(node.value) + continue + + key_size, limit_exceeded = check_etcd_key_size(client, node.key, + size_limit, + total_size + size, + depth + 1, + depth_limit, visited) + size += key_size + + max_limit_exceeded = limit_exceeded or (total_size + size > size_limit) + return size, max_limit_exceeded + + +def main(): # pylint: disable=missing-docstring,too-many-branches + module = AnsibleModule( + argument_spec=dict( + size_limit_bytes=dict(type="int", default=0), + paths=dict(type="list", default=["/openshift.io/images"]), + host=dict(type="str", default="127.0.0.1"), + port=dict(type="int", default=4001), + protocol=dict(type="str", default="http"), + version_prefix=dict(type="str", default=""), + allow_redirect=dict(type="bool", default=False), + cert=dict(type="dict", default=""), + ca_cert=dict(type="str", default=None), + ), + supports_check_mode=True + ) + + module.params["cert"] = ( + module.params["cert"]["cert"], + module.params["cert"]["key"], + ) + + size_limit = module.params.pop("size_limit_bytes") + paths = module.params.pop("paths") + + limit_exceeded = False + + try: + # pylint: disable=no-member + client = etcd.Client(**module.params) + except AttributeError as attrerr: + msg = str(attrerr) + if IMPORT_EXCEPTION_MSG: + msg = IMPORT_EXCEPTION_MSG + if "No module named etcd" in IMPORT_EXCEPTION_MSG: + # pylint: disable=redefined-variable-type + msg = ('Unable to import the python "etcd" dependency. ' + 'Make sure python-etcd is installed on the host.') + + module.exit_json( + failed=True, + changed=False, + size_limit_exceeded=limit_exceeded, + msg=msg, + ) + + return + + size = 0 + for path in paths: + path_size, limit_exceeded = check_etcd_key_size(client, path, size_limit - size) + size += path_size + + if limit_exceeded: + break + + module.exit_json( + changed=False, + size_limit_exceeded=limit_exceeded, + ) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py new file mode 100644 index 000000000..c72f4c5b3 --- /dev/null +++ b/roles/openshift_health_checker/library/ocutil.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +"""Interface to OpenShift oc command""" + +import os +import shlex +import shutil +import subprocess + +from ansible.module_utils.basic import AnsibleModule + + +ADDITIONAL_PATH_LOOKUPS = ['/usr/local/bin', os.path.expanduser('~/bin')] + + +def locate_oc_binary(): + """Find and return oc binary file""" + # https://github.com/openshift/openshift-ansible/issues/3410 + # oc can be in /usr/local/bin in some cases, but that may not + # be in $PATH due to ansible/sudo + paths = os.environ.get("PATH", os.defpath).split(os.pathsep) + ADDITIONAL_PATH_LOOKUPS + + oc_binary = 'oc' + + # Use shutil.which if it is available, otherwise fallback to a naive path search + try: + which_result = shutil.which(oc_binary, path=os.pathsep.join(paths)) + if which_result is not None: + oc_binary = which_result + except AttributeError: + for path in paths: + if os.path.exists(os.path.join(path, oc_binary)): + oc_binary = os.path.join(path, oc_binary) + break + + return oc_binary + + +def main(): + """Module that executes commands on a remote OpenShift cluster""" + + module = AnsibleModule( + argument_spec=dict( + namespace=dict(type="str", required=False), + config_file=dict(type="str", required=True), + cmd=dict(type="str", required=True), + extra_args=dict(type="list", default=[]), + ), + ) + + cmd = [locate_oc_binary(), '--config', module.params["config_file"]] + if module.params["namespace"]: + cmd += ['-n', module.params["namespace"]] + cmd += shlex.split(module.params["cmd"]) + module.params["extra_args"] + + failed = True + try: + cmd_result = subprocess.check_output(list(cmd), stderr=subprocess.STDOUT) + failed = False + except subprocess.CalledProcessError as exc: + cmd_result = '[rc {}] {}\n{}'.format(exc.returncode, ' '.join(exc.cmd), exc.output) + except OSError as exc: + # we get this when 'oc' is not there + cmd_result = str(exc) + + module.exit_json( + changed=False, + failed=failed, + result=cmd_result, + ) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/rpm_version.py b/roles/openshift_health_checker/library/rpm_version.py new file mode 100644 index 000000000..c24fbba3b --- /dev/null +++ b/roles/openshift_health_checker/library/rpm_version.py @@ -0,0 +1,133 @@ +#!/usr/bin/python +""" +Ansible module for rpm-based systems determining existing package version information in a host. +""" + +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.six import string_types + +IMPORT_EXCEPTION = None +try: + import rpm # pylint: disable=import-error +except ImportError as err: + IMPORT_EXCEPTION = err # in tox test env, rpm import fails + + +class RpmVersionException(Exception): + """Base exception class for package version problems""" + def __init__(self, message, problem_pkgs=None): + Exception.__init__(self, message) + self.problem_pkgs = problem_pkgs + + +def main(): + """Entrypoint for this Ansible module""" + module = AnsibleModule( + argument_spec=dict( + package_list=dict(type="list", required=True), + ), + supports_check_mode=True + ) + + if IMPORT_EXCEPTION: + module.fail_json(msg="rpm_version module could not import rpm: %s" % IMPORT_EXCEPTION) + + # determine the packages we will look for + pkg_list = module.params['package_list'] + if not pkg_list: + module.fail_json(msg="package_list must not be empty") + + # get list of packages available and complain if any + # of them are missing or if any errors occur + try: + pkg_versions = _retrieve_expected_pkg_versions(_to_dict(pkg_list)) + _check_pkg_versions(pkg_versions, _to_dict(pkg_list)) + except RpmVersionException as excinfo: + module.fail_json(msg=str(excinfo)) + module.exit_json(changed=False) + + +def _to_dict(pkg_list): + return {pkg["name"]: pkg for pkg in pkg_list} + + +def _retrieve_expected_pkg_versions(expected_pkgs_dict): + """Search for installed packages matching given pkg names + and versions. Returns a dictionary: {pkg_name: [versions]}""" + + transaction = rpm.TransactionSet() + pkgs = {} + + for pkg_name in expected_pkgs_dict: + matched_pkgs = transaction.dbMatch("name", pkg_name) + if not matched_pkgs: + continue + + for header in matched_pkgs: + if header['name'] == pkg_name: + if pkg_name not in pkgs: + pkgs[pkg_name] = [] + + pkgs[pkg_name].append(header['version']) + + return pkgs + + +def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict): + invalid_pkg_versions = {} + not_found_pkgs = [] + + for pkg_name, pkg in expected_pkgs_dict.items(): + if not found_pkgs_dict.get(pkg_name): + not_found_pkgs.append(pkg_name) + continue + + found_versions = [_parse_version(version) for version in found_pkgs_dict[pkg_name]] + + if isinstance(pkg["version"], string_types): + expected_versions = [_parse_version(pkg["version"])] + else: + expected_versions = [_parse_version(version) for version in pkg["version"]] + + if not set(expected_versions) & set(found_versions): + invalid_pkg_versions[pkg_name] = { + "found_versions": found_versions, + "required_versions": expected_versions, + } + + if not_found_pkgs: + raise RpmVersionException( + '\n'.join([ + "The following packages were not found to be installed: {}".format('\n '.join([ + "{}".format(pkg) + for pkg in not_found_pkgs + ])) + ]), + not_found_pkgs, + ) + + if invalid_pkg_versions: + raise RpmVersionException( + '\n '.join([ + "The following packages were found to be installed with an incorrect version: {}".format('\n'.join([ + " \n{}\n Required version: {}\n Found versions: {}".format( + pkg_name, + ', '.join(pkg["required_versions"]), + ', '.join([version for version in pkg["found_versions"]])) + for pkg_name, pkg in invalid_pkg_versions.items() + ])) + ]), + invalid_pkg_versions, + ) + + +def _parse_version(version_str): + segs = version_str.split('.') + if not segs or len(segs) <= 2: + return version_str + + return '.'.join(segs[0:2]) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py new file mode 100644 index 000000000..3631f71c8 --- /dev/null +++ b/roles/openshift_health_checker/library/search_journalctl.py @@ -0,0 +1,150 @@ +#!/usr/bin/python +"""Interface to journalctl.""" + +from time import time +import json +import re +import subprocess + +from ansible.module_utils.basic import AnsibleModule + + +class InvalidMatcherRegexp(Exception): + """Exception class for invalid matcher regexp.""" + pass + + +class InvalidLogEntry(Exception): + """Exception class for invalid / non-json log entries.""" + pass + + +class LogInputSubprocessError(Exception): + """Exception class for errors that occur while executing a subprocess.""" + pass + + +def main(): + """Scan a given list of "log_matchers" for journalctl messages containing given patterns. + "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching: + 'start_regexp', 'regexp', and 'unit'. + + Sample "log_matchers" list: + + [ + { + 'start_regexp': r'Beginning of systemd unit', + 'regexp': r'the specific log message to find', + 'unit': 'etcd', + } + ] + """ + module = AnsibleModule( + argument_spec=dict( + log_count_limit=dict(type="int", default=500), + log_matchers=dict(type="list", required=True), + ), + ) + + timestamp_limit_seconds = time() - 60 * 60 # 1 hour + + log_count_limit = module.params["log_count_limit"] + log_matchers = module.params["log_matchers"] + + matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds) + + module.exit_json( + changed=False, + failed=bool(errors), + errors=errors, + matched=matched_regexp, + ) + + +def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds): + """Return a list of up to log_count_limit matches for each matcher. + + Log entries are only considered if newer than timestamp_limit_seconds. + """ + matched_regexp = [] + errors = [] + + for matcher in matchers: + try: + log_output = get_log_output(matcher) + except LogInputSubprocessError as err: + errors.append(str(err)) + continue + + try: + matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds) + if matched: + matched_regexp.append(matcher.get("regexp", "")) + except InvalidMatcherRegexp as err: + errors.append(str(err)) + except InvalidLogEntry as err: + errors.append(str(err)) + + return matched_regexp, errors + + +def get_log_output(matcher): + """Return an iterator on the logs of a given matcher.""" + try: + cmd_output = subprocess.Popen(list([ + '/bin/journalctl', + '-ru', matcher.get("unit", ""), + '--output', 'json', + ]), stdout=subprocess.PIPE) + + return iter(cmd_output.stdout.readline, '') + + except subprocess.CalledProcessError as exc: + msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}" + raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc))) + except OSError as exc: + raise LogInputSubprocessError(str(exc)) + + +def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds): + """Return log messages matched in iterable log_output by a given matcher. + + Ignore any log_output items older than timestamp_limit_seconds. + """ + try: + regexp = re.compile(matcher.get("regexp", "")) + start_regexp = re.compile(matcher.get("start_regexp", "")) + except re.error as err: + msg = "A log matcher object was provided with an invalid regular expression: {}" + raise InvalidMatcherRegexp(msg.format(str(err))) + + matched = None + + for log_count, line in enumerate(log_output): + if log_count >= log_count_limit: + break + + try: + obj = json.loads(line) + + # don't need to look past the most recent service restart + if start_regexp.match(obj["MESSAGE"]): + break + + log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000 + if log_timestamp_seconds < timestamp_limit_seconds: + break + + if regexp.match(obj["MESSAGE"]): + matched = line + break + + except ValueError: + msg = "Log entry for systemd unit {} contained invalid json syntax: {}" + raise InvalidLogEntry(msg.format(matcher.get("unit"), line)) + + return matched + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml new file mode 100644 index 000000000..bc8e7bdcf --- /dev/null +++ b/roles/openshift_health_checker/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: +- role: openshift_facts diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py new file mode 100644 index 000000000..ce05b44a4 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -0,0 +1,375 @@ +""" +Health checks for OpenShift clusters. +""" + +import json +import operator +import os +import time +import collections + +from abc import ABCMeta, abstractmethod, abstractproperty +from importlib import import_module + +from ansible.module_utils import six +from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin +from ansible.module_utils.six import string_types +from ansible.plugins.filter.core import to_bool as ansible_to_bool + + +class OpenShiftCheckException(Exception): + """Raised when a check encounters a failure condition.""" + + def __init__(self, name, msg=None): + # msg is for the message the user will see when this is raised. + # name is for test code to identify the error without looking at msg text. + if msg is None: # for parameter backward compatibility + msg = name + name = self.__class__.__name__ + self.name = name + super(OpenShiftCheckException, self).__init__(msg) + + +class OpenShiftCheckExceptionList(OpenShiftCheckException): + """A container for multiple errors that may be detected in one check.""" + def __init__(self, errors): + self.errors = errors + super(OpenShiftCheckExceptionList, self).__init__( + 'OpenShiftCheckExceptionList', + '\n'.join(str(msg) for msg in errors) + ) + + # make iterable + def __getitem__(self, index): + return self.errors[index] + + +FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename") + + +# pylint: disable=too-many-instance-attributes; all represent significantly different state. +# Arguably they could be separated into two hashes, one for storing parameters, and one for +# storing result state; but that smells more like clutter than clarity. +@six.add_metaclass(ABCMeta) +class OpenShiftCheck(object): + """A base class for defining checks for an OpenShift cluster environment. + + Optional init params: method execute_module, dict task_vars, and string tmp + execute_module is expected to have a signature compatible with _execute_module + from ansible plugins/action/__init__.py, e.g.: + def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args): + This is stored so that it can be invoked in subclasses via check.execute_module("name", args) + which provides the check's stored task_vars and tmp. + + Optional init param: want_full_results + If the check can gather logs, tarballs, etc., do so when True; but no need to spend + the time if they're not wanted (won't be written to output directory). + """ + + def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False): + # store a method for executing ansible modules from the check + self._execute_module = execute_module + # the task variables and tmpdir passed into the health checker task + self.task_vars = task_vars or {} + self.tmp = tmp + # a boolean for disabling the gathering of results (files, computations) that won't + # actually be recorded/used + self.want_full_results = want_full_results + + # mainly for testing purposes; see execute_module_with_retries + self._module_retries = 3 + self._module_retry_interval = 5 # seconds + + # state to be recorded for inspection after the check runs: + # + # set to True when the check changes the host, for accurate total "changed" count + self.changed = False + # list of OpenShiftCheckException for check to report (alternative to returning a failed result) + self.failures = [] + # list of FileToSave - files the check specifies to be written locally if so configured + self.files_to_save = [] + # log messages for the check - tuples of (description, msg) where msg is serializable. + # These are intended to be a sequential record of what the check observed and determined. + self.logs = [] + + @abstractproperty + def name(self): + """The name of this check, usually derived from the class name.""" + return "openshift_check" + + @property + def tags(self): + """A list of tags that this check satisfy. + + Tags are used to reference multiple checks with a single '@tagname' + special check name. + """ + return [] + + @staticmethod + def is_active(): + """Returns true if this check applies to the ansible-playbook run.""" + return True + + def is_first_master(self): + """Determine if running on first master. Returns: bool""" + masters = self.get_var("groups", "oo_first_master", default=None) or [None] + return masters[0] == self.get_var("ansible_host") + + @abstractmethod + def run(self): + """Executes a check against a host and returns a result hash similar to Ansible modules. + + Actually the direction ahead is to record state in the attributes and + not bother building a result hash. Instead, return an empty hash and let + the action plugin fill it in. Or raise an OpenShiftCheckException. + Returning a hash may become deprecated if it does not prove necessary. + """ + return {} + + @classmethod + def subclasses(cls): + """Returns a generator of subclasses of this class and its subclasses.""" + # AUDIT: no-member makes sense due to this having a metaclass + for subclass in cls.__subclasses__(): # pylint: disable=no-member + yield subclass + for subclass in subclass.subclasses(): + yield subclass + + def register_failure(self, error): + """Record in the check that a failure occurred. + + Recorded failures are merged into the result hash for now. They are also saved to output directory + (if provided) <check>.failures.json and registered as a log entry for context <check>.log.json. + """ + # It should be an exception; make it one if not + if not isinstance(error, OpenShiftCheckException): + error = OpenShiftCheckException(str(error)) + self.failures.append(error) + # duplicate it in the logs so it can be seen in the context of any + # information that led to the failure + self.register_log("failure: " + error.name, str(error)) + + def register_log(self, context, msg): + """Record an entry for the check log. + + Notes are intended to serve as context of the whole sequence of what the check observed. + They are be saved as an ordered list in a local check log file. + They are not to included in the result or in the ansible log; it's just for the record. + """ + self.logs.append([context, msg]) + + def register_file(self, filename, contents=None, remote_filename=""): + """Record a file that a check makes available to be saved individually to output directory. + + Either file contents should be passed in, or a file to be copied from the remote host + should be specified. Contents that are not a string are to be serialized as JSON. + + NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning + you should avoid using this on huge files (more than say 10M). + """ + if contents is None and not remote_filename: + raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.") + self.files_to_save.append(FileToSave(filename, contents, remote_filename)) + + def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True): + """Invoke an Ansible module from a check. + + Invoke stored _execute_module, normally copied from the action + plugin, with its params and the task_vars and tmp given at + check initialization. No positional parameters beyond these + are specified. If it's necessary to specify any of the other + parameters to _execute_module then that should just be invoked + directly (with awareness of changes in method signature per + Ansible version). + + So e.g. check.execute_module("foo", dict(arg1=...)) + + save_as_name specifies a file name for saving the result to an output directory, + if needed, and is intended to uniquely identify the result of invoking execute_module. + If not provided, the module name will be used. + If register is set False, then the result won't be registered in logs or files to save. + + Return: result hash from module execution. + """ + if self._execute_module is None: + raise NotImplementedError( + self.__class__.__name__ + + " invoked execute_module without providing the method at initialization." + ) + result = self._execute_module(module_name, module_args, self.tmp, self.task_vars) + if result.get("changed"): + self.changed = True + for output in ["result", "stdout"]: + # output is often JSON; attempt to decode + try: + result[output + "_json"] = json.loads(result[output]) + except (KeyError, ValueError): + pass + + if register: + self.register_log("execute_module: " + module_name, result) + self.register_file(save_as_name or module_name + ".json", result) + return result + + def execute_module_with_retries(self, module_name, module_args): + """Run execute_module and retry on failure.""" + result = {} + tries = 0 + while True: + res = self.execute_module(module_name, module_args) + if tries > self._module_retries or not res.get("failed"): + result.update(res) + return result + result["last_failed"] = res + tries += 1 + time.sleep(self._module_retry_interval) + + def get_var(self, *keys, **kwargs): + """Get deeply nested values from task_vars. + + Ansible task_vars structures are Python dicts, often mapping strings to + other dicts. This helper makes it easier to get a nested value, raising + OpenShiftCheckException when a key is not found. + + Keyword args: + default: + On missing key, return this as default value instead of raising exception. + convert: + Supply a function to apply to normalize the value before returning it. + None is the default (return as-is). + This function should raise ValueError if the user has provided a value + that cannot be converted, or OpenShiftCheckException if some other + problem needs to be described to the user. + """ + if len(keys) == 1: + keys = keys[0].split(".") + + try: + value = reduce(operator.getitem, keys, self.task_vars) + except (KeyError, TypeError): + if "default" not in kwargs: + raise OpenShiftCheckException( + "This check expects the '{}' inventory variable to be defined\n" + "in order to proceed, but it is undefined. There may be a bug\n" + "in Ansible, the checks, or their dependencies." + "".format(".".join(map(str, keys))) + ) + value = kwargs["default"] + + convert = kwargs.get("convert", None) + try: + if convert is None: + return value + elif convert is bool: # interpret bool as Ansible does, instead of python truthiness + return ansible_to_bool(value) + else: + return convert(value) + + except ValueError as error: # user error in specifying value + raise OpenShiftCheckException( + 'Cannot convert inventory variable to expected type:\n' + ' "{var}={value}"\n' + '{error}'.format(var=".".join(keys), value=value, error=error) + ) + + except OpenShiftCheckException: # some other check-specific problem + raise + + except Exception as error: # probably a bug in the function + raise OpenShiftCheckException( + 'There is a bug in this check. While trying to convert variable \n' + ' "{var}={value}"\n' + 'the given converter cannot be used or failed unexpectedly:\n' + '{type}: {error}'.format( + var=".".join(keys), + value=value, + type=error.__class__.__name__, + error=error + )) + + @staticmethod + def normalize(name_list): + """Return a clean list of names. + + The input may be a comma-separated string or a sequence. Leading and + trailing whitespace characters are removed. Empty items are discarded. + """ + if isinstance(name_list, string_types): + name_list = name_list.split(',') + return [name.strip() for name in name_list if name.strip()] + + @staticmethod + def get_major_minor_version(openshift_image_tag): + """Parse and return the deployed version of OpenShift as a tuple.""" + if openshift_image_tag and openshift_image_tag[0] == 'v': + openshift_image_tag = openshift_image_tag[1:] + + # map major release versions across releases + # to a common major version + openshift_major_release_version = { + "1": "3", + } + + components = openshift_image_tag.split(".") + if not components or len(components) < 2: + msg = "An invalid version of OpenShift was found for this host: {}" + raise OpenShiftCheckException(msg.format(openshift_image_tag)) + + if components[0] in openshift_major_release_version: + components[0] = openshift_major_release_version[components[0]] + + components = tuple(int(x) for x in components[:2]) + return components + + def find_ansible_mount(self, path): + """Return the mount point for path from ansible_mounts.""" + + # reorganize list of mounts into dict by path + mount_for_path = { + mount['mount']: mount + for mount + in self.get_var('ansible_mounts') + } + + # NOTE: including base cases '/' and '' to ensure the loop ends + mount_targets = set(mount_for_path.keys()) | {'/', ''} + mount_point = path + while mount_point not in mount_targets: + mount_point = os.path.dirname(mount_point) + + try: + mount = mount_for_path[mount_point] + self.register_log("mount point for " + path, mount) + return mount + except KeyError: + known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) + raise OpenShiftCheckException( + 'Unable to determine mount point for path "{}".\n' + 'Known mount points: {}.'.format(path, known_mounts or 'none') + ) + + +LOADER_EXCLUDES = ( + "__init__.py", + "mixins.py", + "logging.py", +) + + +def load_checks(path=None, subpkg=""): + """Dynamically import all check modules for the side effect of registering checks.""" + if path is None: + path = os.path.dirname(__file__) + + modules = [] + + for name in os.listdir(path): + if os.path.isdir(os.path.join(path, name)): + modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name) + continue + + if name.endswith(".py") and name not in LOADER_EXCLUDES: + modules.append(import_module(__package__ + subpkg + "." + name[:-3])) + + return modules diff --git a/roles/openshift_health_checker/openshift_checks/diagnostics.py b/roles/openshift_health_checker/openshift_checks/diagnostics.py new file mode 100644 index 000000000..1cfdc1129 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/diagnostics.py @@ -0,0 +1,62 @@ +""" +A check to run relevant diagnostics via `oc adm diagnostics`. +""" + +import os + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +DIAGNOSTIC_LIST = ( + "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles " + "ClusterRouter DiagnosticPod NetworkCheck" +).split() + + +class DiagnosticCheck(OpenShiftCheck): + """A check to run relevant diagnostics via `oc adm diagnostics`.""" + + name = "diagnostics" + tags = ["health"] + + def is_active(self): + return super(DiagnosticCheck, self).is_active() and self.is_first_master() + + def run(self): + if self.exec_diagnostic("ConfigContexts"): + # only run the other diagnostics if that one succeeds (otherwise, all will fail) + diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST) + for diagnostic in self.normalize(diagnostics): + self.exec_diagnostic(diagnostic) + return {} + + def exec_diagnostic(self, diagnostic): + """ + Execute an 'oc adm diagnostics' command on the remote host. + Raises OcNotFound or registers OcDiagFailed. + Returns True on success or False on failure (non-zero rc). + """ + config_base = self.get_var("openshift.common.config_base") + args = { + "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), + "cmd": "adm diagnostics", + "extra_args": [diagnostic], + } + + result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json") + self.register_file(diagnostic + ".txt", result['result']) + if result.get("failed"): + if result['result'] == '[Errno 2] No such file or directory': + raise OpenShiftCheckException( + "OcNotFound", + "This host is supposed to be a master but does not have the `oc` command where expected.\n" + "Has an installation been run on this host yet?" + ) + + self.register_failure(OpenShiftCheckException( + 'OcDiagFailed', + 'The {diag} diagnostic reported an error:\n' + '{error}'.format(diag=diagnostic, error=result['result']) + )) + return False + return True diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py new file mode 100644 index 000000000..7956559c6 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py @@ -0,0 +1,137 @@ +"""Check that there is enough disk space in predefined paths.""" + +import tempfile + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +class DiskAvailability(OpenShiftCheck): + """Check that recommended disk space is available before a first-time install.""" + + name = "disk_availability" + tags = ["preflight"] + + # Values taken from the official installation documentation: + # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements + recommended_disk_space_bytes = { + '/var': { + 'oo_masters_to_config': 40 * 10**9, + 'oo_nodes_to_config': 15 * 10**9, + 'oo_etcd_to_config': 20 * 10**9, + }, + # Used to copy client binaries into, + # see roles/openshift_cli/library/openshift_container_binary_sync.py. + '/usr/local/bin': { + 'oo_masters_to_config': 1 * 10**9, + 'oo_nodes_to_config': 1 * 10**9, + 'oo_etcd_to_config': 1 * 10**9, + }, + # Used as temporary storage in several cases. + tempfile.gettempdir(): { + 'oo_masters_to_config': 1 * 10**9, + 'oo_nodes_to_config': 1 * 10**9, + 'oo_etcd_to_config': 1 * 10**9, + }, + } + + # recommended disk space for each location under an upgrade context + recommended_disk_upgrade_bytes = { + '/var': { + 'oo_masters_to_config': 10 * 10**9, + 'oo_nodes_to_config': 5 * 10 ** 9, + 'oo_etcd_to_config': 5 * 10 ** 9, + }, + } + + def is_active(self): + """Skip hosts that do not have recommended disk space requirements.""" + group_names = self.get_var("group_names", default=[]) + active_groups = set() + for recommendation in self.recommended_disk_space_bytes.values(): + active_groups.update(recommendation.keys()) + has_disk_space_recommendation = bool(active_groups.intersection(group_names)) + return super(DiskAvailability, self).is_active() and has_disk_space_recommendation + + def run(self): + group_names = self.get_var("group_names") + user_config = self.get_var("openshift_check_min_host_disk_gb", default={}) + try: + # For backwards-compatibility, if openshift_check_min_host_disk_gb + # is a number, then it overrides the required config for '/var'. + number = float(user_config) + user_config = { + '/var': { + 'oo_masters_to_config': number, + 'oo_nodes_to_config': number, + 'oo_etcd_to_config': number, + }, + } + except TypeError: + # If it is not a number, then it should be a nested dict. + pass + + self.register_log("recommended thresholds", self.recommended_disk_space_bytes) + if user_config: + self.register_log("user-configured thresholds", user_config) + + # TODO: as suggested in + # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021, + # maybe we could support checking disk availability in paths that are + # not part of the official recommendation but present in the user + # configuration. + for path, recommendation in self.recommended_disk_space_bytes.items(): + free_bytes = self.free_bytes(path) + recommended_bytes = max(recommendation.get(name, 0) for name in group_names) + + config = user_config.get(path, {}) + # NOTE: the user config is in GB, but we compare bytes, thus the + # conversion. + config_bytes = max(config.get(name, 0) for name in group_names) * 10**9 + recommended_bytes = config_bytes or recommended_bytes + + # if an "upgrade" context is set, update the minimum disk requirement + # as this signifies an in-place upgrade - the node might have the + # required total disk space, but some of that space may already be + # in use by the existing OpenShift deployment. + context = self.get_var("r_openshift_health_checker_playbook_context", default="") + if context == "upgrade": + recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {}) + if recommended_upgrade_paths: + recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0) + for name in group_names) + + if free_bytes < recommended_bytes: + free_gb = float(free_bytes) / 10**9 + recommended_gb = float(recommended_bytes) / 10**9 + msg = ( + 'Available disk space in "{}" ({:.1f} GB) ' + 'is below minimum recommended ({:.1f} GB)' + ).format(path, free_gb, recommended_gb) + + # warn if check failed under an "upgrade" context + # due to limits imposed by the user config + if config_bytes and context == "upgrade": + msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n' + 'due to an existing OpenShift deployment. Please check the value of\n' + ' openshift_check_min_host_disk_gb={}\n' + 'in your Ansible inventory, and lower the recommended disk space availability\n' + 'if necessary for this upgrade.').format(config_bytes) + + self.register_failure(msg) + + return {} + + def free_bytes(self, path): + """Return the size available in path based on ansible_mounts.""" + mount = self.find_ansible_mount(path) + try: + return mount['size_available'] + except KeyError: + raise OpenShiftCheckException( + 'Unable to retrieve disk availability for "{path}".\n' + 'Ansible facts included a matching mount point for this path:\n' + ' {mount}\n' + 'however it is missing the size_available field.\n' + 'To investigate, you can inspect the output of `ansible -m setup <host>`' + ''.format(path=path, mount=mount) + ) diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py new file mode 100644 index 000000000..7c8ac78fe --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -0,0 +1,240 @@ +"""Check that required Docker images are available.""" + +from pipes import quote +from ansible.module_utils import six +from openshift_checks import OpenShiftCheck +from openshift_checks.mixins import DockerHostMixin + + +NODE_IMAGE_SUFFIXES = ["haproxy-router", "docker-registry", "deployer", "pod"] +DEPLOYMENT_IMAGE_INFO = { + "origin": { + "namespace": "openshift", + "name": "origin", + "registry_console_image": "cockpit/kubernetes", + }, + "openshift-enterprise": { + "namespace": "openshift3", + "name": "ose", + "registry_console_image": "registry.access.redhat.com/openshift3/registry-console", + }, +} + + +class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): + """Check that required Docker images are available. + + Determine docker images that an install would require and check that they + are either present in the host's docker index, or available for the host to pull + with known registries as defined in our inventory file (or defaults). + """ + + name = "docker_image_availability" + tags = ["preflight"] + # we use python-docker-py to check local docker for images, and skopeo + # to look for images available remotely without waiting to pull them. + dependencies = ["python-docker-py", "skopeo"] + # command for checking if remote registries have an image, without docker pull + skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}" + skopeo_example_command = "skopeo inspect [--tls-verify=false] [--creds=<user>:<pass>] docker://<registry>/<image>" + + def __init__(self, *args, **kwargs): + super(DockerImageAvailability, self).__init__(*args, **kwargs) + + self.registries = dict( + # set of registries that need to be checked insecurely (note: not accounting for CIDR entries) + insecure=set(self.ensure_list("openshift_docker_insecure_registries")), + # set of registries that should never be queried even if given in the image + blocked=set(self.ensure_list("openshift_docker_blocked_registries")), + ) + + # ordered list of registries (according to inventory vars) that docker will try for unscoped images + regs = self.ensure_list("openshift_docker_additional_registries") + # currently one of these registries is added whether the user wants it or not. + deployment_type = self.get_var("openshift_deployment_type") + if deployment_type == "origin" and "docker.io" not in regs: + regs.append("docker.io") + elif deployment_type == 'openshift-enterprise' and "registry.access.redhat.com" not in regs: + regs.append("registry.access.redhat.com") + self.registries["configured"] = regs + + # for the oreg_url registry there may be credentials specified + components = self.get_var("oreg_url", default="").split('/') + self.registries["oreg"] = "" if len(components) < 3 else components[0] + self.skopeo_command_creds = "" + oreg_auth_user = self.get_var('oreg_auth_user', default='') + oreg_auth_password = self.get_var('oreg_auth_password', default='') + if oreg_auth_user != '' and oreg_auth_password != '': + self.skopeo_command_creds = "--creds={}:{}".format(quote(oreg_auth_user), quote(oreg_auth_password)) + + # record whether we could reach a registry or not (and remember results) + self.reachable_registries = {} + + def is_active(self): + """Skip hosts with unsupported deployment types.""" + deployment_type = self.get_var("openshift_deployment_type") + has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO + + return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type + + def run(self): + msg, failed = self.ensure_dependencies() + if failed: + return { + "failed": True, + "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg + } + + required_images = self.required_images() + missing_images = set(required_images) - set(self.local_images(required_images)) + + # exit early if all images were found locally + if not missing_images: + return {} + + available_images = self.available_images(missing_images) + unavailable_images = set(missing_images) - set(available_images) + + if unavailable_images: + unreachable = [reg for reg, reachable in self.reachable_registries.items() if not reachable] + unreachable_msg = "Failed connecting to: {}\n".format(", ".join(unreachable)) + blocked_msg = "Blocked registries: {}\n".format(", ".join(self.registries["blocked"])) + msg = ( + "One or more required container images are not available:\n {missing}\n" + "Checked with: {cmd}\n" + "Default registries searched: {registries}\n" + "{blocked}" + "{unreachable}" + ).format( + missing=",\n ".join(sorted(unavailable_images)), + cmd=self.skopeo_example_command, + registries=", ".join(self.registries["configured"]), + blocked=blocked_msg if self.registries["blocked"] else "", + unreachable=unreachable_msg if unreachable else "", + ) + + return dict(failed=True, msg=msg) + + return {} + + def required_images(self): + """ + Determine which images we expect to need for this host. + Returns: a set of required images like 'openshift/origin:v3.6' + + The thorny issue of determining the image names from the variables is under consideration + via https://github.com/openshift/openshift-ansible/issues/4415 + + For now we operate as follows: + * For containerized components (master, node, ...) we look at the deployment type and + use openshift/origin or openshift3/ose as the base for those component images. The + version is openshift_image_tag as determined by the openshift_version role. + * For OpenShift-managed infrastructure (router, registry...) we use oreg_url if + it is defined; otherwise we again use the base that depends on the deployment type. + Registry is not included in constructed images. It may be in oreg_url or etcd image. + """ + required = set() + deployment_type = self.get_var("openshift_deployment_type") + host_groups = self.get_var("group_names") + # containerized etcd may not have openshift_image_tag, see bz 1466622 + image_tag = self.get_var("openshift_image_tag", default="latest") + image_info = DEPLOYMENT_IMAGE_INFO[deployment_type] + + # template for images that run on top of OpenShift + image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}") + image_url = self.get_var("oreg_url", default="") or image_url + if 'oo_nodes_to_config' in host_groups: + for suffix in NODE_IMAGE_SUFFIXES: + required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag)) + # The registry-console is for some reason not prefixed with ose- like the other components. + # Nor is it versioned the same, so just look for latest. + # Also a completely different name is used for Origin. + required.add(image_info["registry_console_image"]) + + # images for containerized components + if self.get_var("openshift", "common", "is_containerized"): + components = set() + if 'oo_nodes_to_config' in host_groups: + components.update(["node", "openvswitch"]) + if 'oo_masters_to_config' in host_groups: # name is "origin" or "ose" + components.add(image_info["name"]) + for component in components: + required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag)) + if 'oo_etcd_to_config' in host_groups: # special case, note it is the same for origin/enterprise + required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag + + return required + + def local_images(self, images): + """Filter a list of images and return those available locally.""" + found_images = [] + for image in images: + # docker could have the image name as-is or prefixed with any registry + imglist = [image] + [reg + "/" + image for reg in self.registries["configured"]] + if self.is_image_local(imglist): + found_images.append(image) + return found_images + + def is_image_local(self, image): + """Check if image is already in local docker index.""" + result = self.execute_module("docker_image_facts", {"name": image}) + return bool(result.get("images")) and not result.get("failed") + + def ensure_list(self, registry_param): + """Return the task var as a list.""" + # https://bugzilla.redhat.com/show_bug.cgi?id=1497274 + # If the result was a string type, place it into a list. We must do this + # as using list() on a string will split the string into its characters. + # Otherwise cast to a list as was done previously. + registry = self.get_var(registry_param, default=[]) + if not isinstance(registry, six.string_types): + return list(registry) + return self.normalize(registry) + + def available_images(self, images): + """Search remotely for images. Returns: list of images found.""" + return [ + image for image in images + if self.is_available_skopeo_image(image) + ] + + def is_available_skopeo_image(self, image): + """Use Skopeo to determine if required image exists in known registry(s).""" + registries = self.registries["configured"] + # If image already includes a registry, only use that. + # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g. + # registry.access.redhat.com/rhel7 as if the registry were a namespace. + # It's not clear that there's any way to distinguish them, but fortunately + # the current set of images all look like [registry/]namespace/name[:version]. + if image.count("/") > 1: + registry, image = image.split("/", 1) + registries = [registry] + + for registry in registries: + if registry in self.registries["blocked"]: + continue # blocked will never be consulted + if registry not in self.reachable_registries: + self.reachable_registries[registry] = self.connect_to_registry(registry) + if not self.reachable_registries[registry]: + continue # do not keep trying unreachable registries + + args = dict(registry=registry, image=image) + args["tls"] = "false" if registry in self.registries["insecure"] else "true" + args["creds"] = self.skopeo_command_creds if registry == self.registries["oreg"] else "" + + result = self.execute_module_with_retries("command", {"_raw_params": self.skopeo_command.format(**args)}) + if result.get("rc", 0) == 0 and not result.get("failed"): + return True + if result.get("rc") == 124: # RC 124 == timed out; mark unreachable + self.reachable_registries[registry] = False + + return False + + def connect_to_registry(self, registry): + """Use ansible wait_for module to test connectivity from host to registry. Returns bool.""" + # test a simple TCP connection + host, _, port = registry.partition(":") + port = port or 443 + args = dict(host=host, port=port, state="started", timeout=30) + result = self.execute_module("wait_for", args) + return result.get("rc", 0) == 0 and not result.get("failed") diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py new file mode 100644 index 000000000..0558ddf14 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -0,0 +1,276 @@ +"""Check Docker storage driver and usage.""" +import json +import re +from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks.mixins import DockerHostMixin + + +class DockerStorage(DockerHostMixin, OpenShiftCheck): + """Check Docker storage driver compatibility. + + This check ensures that Docker is using a supported storage driver, + and that loopback is not being used (if using devicemapper). + Also that storage usage is not above threshold. + """ + + name = "docker_storage" + tags = ["pre-install", "health", "preflight"] + + dependencies = ["python-docker-py"] + storage_drivers = ["devicemapper", "overlay", "overlay2"] + max_thinpool_data_usage_percent = 90.0 + max_thinpool_meta_usage_percent = 90.0 + max_overlay_usage_percent = 90.0 + + # TODO(lmeyer): mention these in the output when check fails + configuration_variables = [ + ( + "max_thinpool_data_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for data. " + "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent), + ), + ( + "max_thinpool_meta_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for metadata. " + "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent), + ), + ( + "max_overlay_usage_percent", + "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. " + "Format: float. Default: {:.1f}".format(max_overlay_usage_percent), + ), + ] + + def run(self): + msg, failed = self.ensure_dependencies() + if failed: + return { + "failed": True, + "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg + } + + # attempt to get the docker info hash from the API + docker_info = self.execute_module("docker_info", {}) + if docker_info.get("failed"): + return {"failed": True, + "msg": "Failed to query Docker API. Is docker running on this host?"} + if not docker_info.get("info"): # this would be very strange + return {"failed": True, + "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))} + docker_info = docker_info["info"] + + # check if the storage driver we saw is valid + driver = docker_info.get("Driver", "[NONE]") + if driver not in self.storage_drivers: + msg = ( + "Detected unsupported Docker storage driver '{driver}'.\n" + "Supported storage drivers are: {drivers}" + ).format(driver=driver, drivers=', '.join(self.storage_drivers)) + return {"failed": True, "msg": msg} + + # driver status info is a list of tuples; convert to dict and validate based on driver + driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} + + result = {} + + if driver == "devicemapper": + result = self.check_devicemapper_support(driver_status) + + if driver in ['overlay', 'overlay2']: + result = self.check_overlay_support(docker_info, driver_status) + + return result + + def check_devicemapper_support(self, driver_status): + """Check if dm storage driver is supported as configured. Return: result dict.""" + if driver_status.get("Data loop file"): + msg = ( + "Use of loopback devices with the Docker devicemapper storage driver\n" + "(the default storage configuration) is unsupported in production.\n" + "Please use docker-storage-setup to configure a backing storage volume.\n" + "See http://red.ht/2rNperO for further information." + ) + return {"failed": True, "msg": msg} + result = self.check_dm_usage(driver_status) + return result + + def check_dm_usage(self, driver_status): + """Check usage thresholds for Docker dm storage driver. Return: result dict. + Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool + implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures + devicemapper storage. The LV is "thin" because it does not use all available storage + from its VG, instead expanding as needed; so to determine available space, we gather + current usage as the Docker API reports for the driver as well as space available for + expansion in the pool's VG. + Usage within the LV is divided into pools allocated to data and metadata, either of which + could run out of space first; so we check both. + """ + vals = dict( + vg_free=self.get_vg_free(driver_status.get("Pool Name")), + data_used=driver_status.get("Data Space Used"), + data_total=driver_status.get("Data Space Total"), + metadata_used=driver_status.get("Metadata Space Used"), + metadata_total=driver_status.get("Metadata Space Total"), + ) + + # convert all human-readable strings to bytes + for key, value in vals.copy().items(): + try: + vals[key + "_bytes"] = self.convert_to_bytes(value) + except ValueError as err: # unlikely to hit this from API info, but just to be safe + return { + "failed": True, + "values": vals, + "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err)) + } + + # determine the threshold percentages which usage should not exceed + for name, default in [("data", self.max_thinpool_data_usage_percent), + ("metadata", self.max_thinpool_meta_usage_percent)]: + percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default) + try: + vals[name + "_threshold"] = float(percent) + except ValueError: + return { + "failed": True, + "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent) + } + + # test whether the thresholds are exceeded + messages = [] + for name in ["data", "metadata"]: + vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / ( + vals[name + "_total_bytes"] + vals["vg_free_bytes"]) + if vals[name + "_pct_used"] > vals[name + "_threshold"]: + messages.append( + "Docker thinpool {name} usage percentage {pct:.1f} " + "is higher than threshold {thresh:.1f}.".format( + name=name, + pct=vals[name + "_pct_used"], + thresh=vals[name + "_threshold"], + )) + vals["failed"] = True + + vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."]) + return vals + + def get_vg_free(self, pool): + """Determine which VG to examine according to the pool name. Return: size vgs reports. + Pool name is the only indicator currently available from the Docker API driver info. + We assume a name that looks like "vg--name-docker--pool"; + vg and lv names with inner hyphens doubled, joined by a hyphen. + """ + match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen + if not match: # unlikely, but... be clear if we assumed wrong + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{}'.\n" + "However this name does not have the expected format of 'vgname-lvname'\n" + "so the available storage in the VG cannot be determined.".format(pool) + ) + vg_name = match.groups()[0].replace("--", "-") + vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name + # should return free space like " 12.00g" if the VG exists; empty if it does not + + ret = self.execute_module("command", {"_raw_params": vgs_cmd}) + if ret.get("failed") or ret.get("rc", 0) != 0: + raise OpenShiftCheckException( + "Is LVM installed? Failed to run /sbin/vgs " + "to determine docker storage usage:\n" + ret.get("msg", "") + ) + size = ret.get("stdout", "").strip() + if not size: + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{pool}'.\n" + "which we expect to come from local VG '{vg}'.\n" + "However, /sbin/vgs did not find this VG. Is Docker for this host" + "running and using the storage on the host?".format(pool=pool, vg=vg_name) + ) + return size + + @staticmethod + def convert_to_bytes(string): + """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes.""" + units = dict( + b=1, + k=1024, + m=1024**2, + g=1024**3, + t=1024**4, + p=1024**5, + ) + string = string or "" + match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit + if not match: + raise ValueError("Cannot convert to a byte size: " + string) + + number, unit = match.groups() + multiplier = 1 if not unit else units.get(unit.lower()) + if not multiplier: + raise ValueError("Cannot convert to a byte size: " + string) + + return float(number) * multiplier + + def check_overlay_support(self, docker_info, driver_status): + """Check if overlay storage driver is supported for this host. Return: result dict.""" + # check for xfs as backing store + backing_fs = driver_status.get("Backing Filesystem", "[NONE]") + if backing_fs != "xfs": + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n" + "'xfs' as the backing storage, but this host's storage is type '{fs}'." + ).format(fs=backing_fs) + return {"failed": True, "msg": msg} + + # check support for OS and kernel version + o_s = docker_info.get("OperatingSystem", "[NONE]") + if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s: + # keep it simple, only check enterprise kernel versions; assume everyone else is good + kernel = docker_info.get("KernelVersion", "[NONE]") + kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)] + if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3 + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n" + "kernel version 3.10.0-514; but Docker reports kernel version {version}." + ).format(version=kernel) + return {"failed": True, "msg": msg} + # NOTE: we could check for --selinux-enabled here but docker won't even start with + # that option until it's supported in the kernel so we don't need to. + + return self.check_overlay_usage(docker_info) + + def check_overlay_usage(self, docker_info): + """Check disk usage on OverlayFS backing store volume. Return: result dict.""" + path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"] + + threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent) + try: + threshold = float(threshold) + except ValueError: + return { + "failed": True, + "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold), + } + + mount = self.find_ansible_mount(path) + try: + free_bytes = mount['size_available'] + total_bytes = mount['size_total'] + usage = 100.0 * (total_bytes - free_bytes) / total_bytes + except (KeyError, ZeroDivisionError): + return { + "failed": True, + "msg": "The ansible_mount found for path {} is invalid.\n" + "This is likely to be an Ansible bug. The record was:\n" + "{}".format(path, json.dumps(mount, indent=2)), + } + + if usage > threshold: + return { + "failed": True, + "msg": ( + "For Docker OverlayFS mount point {path},\n" + "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}." + ).format(path=mount["mount"], pct=usage, thresh=threshold) + } + + return {} diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py new file mode 100644 index 000000000..f4296753a --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py @@ -0,0 +1,72 @@ +""" +Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster. +""" + +from openshift_checks import OpenShiftCheck + + +class EtcdImageDataSize(OpenShiftCheck): + """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster""" + + name = "etcd_imagedata_size" + tags = ["etcd"] + + def run(self): + etcd_mountpath = self.find_ansible_mount("/var/lib/etcd") + etcd_avail_diskspace = etcd_mountpath["size_available"] + etcd_total_diskspace = etcd_mountpath["size_total"] + + etcd_imagedata_size_limit = self.get_var( + "etcd_max_image_data_size_bytes", + default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)) + ) + + etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False) + etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379) + etcd_hosts = self.get_var("openshift", "master", "etcd_hosts") + + config_base = self.get_var("openshift", "common", "config_base") + + cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt") + key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key") + ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt") + + for etcd_host in list(etcd_hosts): + args = { + "size_limit_bytes": etcd_imagedata_size_limit, + "paths": ["/openshift.io/images", "/openshift.io/imagestreams"], + "host": etcd_host, + "port": etcd_port, + "protocol": "https" if etcd_is_ssl else "http", + "version_prefix": "/v2", + "allow_redirect": True, + "ca_cert": ca_cert, + "cert": { + "cert": cert, + "key": key, + }, + } + + etcdkeysize = self.execute_module("etcdkeysize", args) + + if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"): + msg = 'Failed to retrieve stats for etcd host "{host}": {reason}' + reason = etcdkeysize.get("msg") + if etcdkeysize.get("module_stderr"): + reason = etcdkeysize["module_stderr"] + + msg = msg.format(host=etcd_host, reason=reason) + return {"failed": True, "msg": msg} + + if etcdkeysize["size_limit_exceeded"]: + limit = self._to_gigabytes(etcd_imagedata_size_limit) + msg = ("The size of OpenShift image data stored in etcd host " + "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. " + "Use the `oadm prune images` command to cleanup unused Docker images.") + return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)} + + return {} + + @staticmethod + def _to_gigabytes(byte_size): + return float(byte_size) / 10.0**9 diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py new file mode 100644 index 000000000..8b20ccb49 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py @@ -0,0 +1,44 @@ +"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic.""" + +from openshift_checks import OpenShiftCheck + + +class EtcdTraffic(OpenShiftCheck): + """Check if host is being affected by an increase in etcd traffic.""" + + name = "etcd_traffic" + tags = ["health", "etcd"] + + def is_active(self): + """Skip hosts that do not have etcd in their group names.""" + group_names = self.get_var("group_names", default=[]) + valid_group_names = "oo_etcd_to_config" in group_names + + version = self.get_major_minor_version(self.get_var("openshift_image_tag")) + valid_version = version in ((3, 4), (3, 5)) + + return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version + + def run(self): + is_containerized = self.get_var("openshift", "common", "is_containerized") + unit = "etcd_container" if is_containerized else "etcd" + + log_matchers = [{ + "start_regexp": r"Starting Etcd Server", + "regexp": r"etcd: sync duration of [^,]+, expected less than 1s", + "unit": unit + }] + + match = self.execute_module("search_journalctl", {"log_matchers": log_matchers}) + + if match.get("matched"): + msg = ("Higher than normal etcd traffic detected.\n" + "OpenShift 3.4 introduced an increase in etcd traffic.\n" + "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n" + "Please refer to https://access.redhat.com/solutions/2916381 for more information.") + return {"failed": True, "msg": msg} + + if match.get("failed"): + return {"failed": True, "msg": "\n".join(match.get("errors"))} + + return {} diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py new file mode 100644 index 000000000..3d75da6f9 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -0,0 +1,47 @@ +"""A health check for OpenShift clusters.""" + +from openshift_checks import OpenShiftCheck + + +class EtcdVolume(OpenShiftCheck): + """Ensures etcd storage usage does not exceed a given threshold.""" + + name = "etcd_volume" + tags = ["etcd", "health"] + + # Default device usage threshold. Value should be in the range [0, 100]. + default_threshold_percent = 90 + # Where to find etcd data + etcd_mount_path = "/var/lib/etcd" + + def is_active(self): + etcd_hosts = ( + self.get_var("groups", "oo_etcd_to_config", default=[]) or + self.get_var("groups", "oo_masters_to_config", default=[]) or + [] + ) + is_etcd_host = self.get_var("ansible_host") in etcd_hosts + return super(EtcdVolume, self).is_active() and is_etcd_host + + def run(self): + mount_info = self.find_ansible_mount(self.etcd_mount_path) + available = mount_info["size_available"] + total = mount_info["size_total"] + used = total - available + + threshold = self.get_var( + "etcd_device_usage_threshold_percent", + default=self.default_threshold_percent + ) + + used_percent = 100.0 * used / total + + if used_percent > threshold: + device = mount_info.get("device", "unknown") + mount = mount_info.get("mount", "unknown") + msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format( + used_percent, threshold, device, mount + ) + return {"failed": True, "msg": msg} + + return {} diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py new file mode 100644 index 000000000..b27f97172 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -0,0 +1,43 @@ +"""Check for an aggregated logging Curator deployment""" + +from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck + + +class Curator(LoggingCheck): + """Check for an aggregated logging Curator deployment""" + + name = "curator" + tags = ["health", "logging"] + + def run(self): + """Check various things and gather errors. Returns: result as hash""" + + curator_pods = self.get_pods_for_component("curator") + self.check_curator(curator_pods) + # TODO(lmeyer): run it all again for the ops cluster + + return {} + + def check_curator(self, pods): + """Check to see if curator is up and working. Returns: error string""" + if not pods: + raise OpenShiftCheckException( + "MissingComponentPods", + "There are no Curator pods for the logging stack,\n" + "so nothing will prune Elasticsearch indexes.\n" + "Is Curator correctly deployed?" + ) + + not_running = self.not_running_pods(pods) + if len(not_running) == len(pods): + raise OpenShiftCheckException( + "CuratorNotRunning", + "The Curator pod is not currently in a running state,\n" + "so Elasticsearch indexes may increase without bound." + ) + if len(pods) - len(not_running) > 1: + raise OpenShiftCheckException( + "TooManyCurators", + "There is more than one Curator pod running. This should not normally happen.\n" + "Although this doesn't cause any problems, you may want to investigate." + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py new file mode 100644 index 000000000..986a01f38 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -0,0 +1,212 @@ +"""Check for an aggregated logging Elasticsearch deployment""" + +import json +import re + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList +from openshift_checks.logging.logging import LoggingCheck + + +class Elasticsearch(LoggingCheck): + """Check for an aggregated logging Elasticsearch deployment""" + + name = "elasticsearch" + tags = ["health", "logging"] + + def run(self): + """Check various things and gather errors. Returns: result as hash""" + + es_pods = self.get_pods_for_component("es") + self.check_elasticsearch(es_pods) + # TODO(lmeyer): run it all again for the ops cluster + + return {} + + def check_elasticsearch(self, es_pods): + """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors.""" + running_pods, errors = self.running_elasticsearch_pods(es_pods) + pods_by_name = { + pod['metadata']['name']: pod for pod in running_pods + # Filter out pods that are not members of a DC + if pod['metadata'].get('labels', {}).get('deploymentconfig') + } + if not pods_by_name: + # nothing running, cannot run the rest of the check + errors.append(OpenShiftCheckException( + 'NoRunningPods', + 'No logging Elasticsearch pods were found running, so no logs are being aggregated.' + )) + raise OpenShiftCheckExceptionList(errors) + + errors += self.check_elasticsearch_masters(pods_by_name) + errors += self.check_elasticsearch_node_list(pods_by_name) + errors += self.check_es_cluster_health(pods_by_name) + errors += self.check_elasticsearch_diskspace(pods_by_name) + if errors: + raise OpenShiftCheckExceptionList(errors) + + def running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = self.not_running_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running] + if not_running: + return running_pods, [OpenShiftCheckException( + 'PodNotRunning', + 'The following Elasticsearch pods are defined but not running:\n' + '{pods}'.format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + )] + return running_pods, [] + + @staticmethod + def _build_es_curl_cmd(pod_name, url): + base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" + return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) + + def check_elasticsearch_masters(self, pods_by_name): + """Check that Elasticsearch masters are sane. Returns: list of errors""" + es_master_names = set() + errors = [] + for pod_name in pods_by_name.keys(): + # Compare what each ES node reports as master and compare for split brain + get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") + master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json") + master_names = (master_name_str or '').split(' ') + if len(master_names) > 1: + es_master_names.add(master_names[1]) + else: + errors.append(OpenShiftCheckException( + 'NoMasterName', + 'Elasticsearch {pod} gave unexpected response when asked master name:\n' + ' {response}'.format(pod=pod_name, response=master_name_str) + )) + + if not es_master_names: + errors.append(OpenShiftCheckException( + 'NoMasterFound', + 'No logging Elasticsearch masters were found.' + )) + return errors + + if len(es_master_names) > 1: + errors.append(OpenShiftCheckException( + 'SplitBrainMasters', + 'Found multiple Elasticsearch masters according to the pods:\n' + '{master_list}\n' + 'This implies that the masters have "split brain" and are not correctly\n' + 'replicating data for the logging cluster. Log loss is likely to occur.' + .format(master_list='\n'.join(' ' + master for master in es_master_names)) + )) + + return errors + + def check_elasticsearch_node_list(self, pods_by_name): + """Check that reported ES masters are accounted for by pods. Returns: list of errors""" + + if not pods_by_name: + return [OpenShiftCheckException( + 'MissingComponentPods', + 'No logging Elasticsearch pods were found.' + )] + + # get ES cluster nodes + node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') + cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json") + try: + cluster_nodes = json.loads(cluster_node_data)['nodes'] + except (ValueError, KeyError): + return [OpenShiftCheckException( + 'MissingNodeList', + 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + + cluster_node_data + )] + + # Try to match all ES-reported node hosts to known pods. + errors = [] + for node in cluster_nodes.values(): + # Note that with 1.4/3.4 the pod IP may be used as the master name + if not any(node['host'] in (pod_name, pod['status'].get('podIP')) + for pod_name, pod in pods_by_name.items()): + errors.append(OpenShiftCheckException( + 'EsPodNodeMismatch', + 'The Elasticsearch cluster reports a member node "{node}"\n' + 'that does not correspond to any known ES pod.'.format(node=node['host']) + )) + + return errors + + def check_es_cluster_health(self, pods_by_name): + """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" + errors = [] + for pod_name in pods_by_name.keys(): + cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') + cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json') + try: + health_res = json.loads(cluster_health_data) + if not health_res or not health_res.get('status'): + raise ValueError() + except ValueError: + errors.append(OpenShiftCheckException( + 'BadEsResponse', + 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' + 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) + )) + continue + + if health_res['status'] not in ['green', 'yellow']: + errors.append(OpenShiftCheckException( + 'EsClusterHealthRed', + 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) + )) + + return errors + + def check_elasticsearch_diskspace(self, pods_by_name): + """ + Exec into an ES pod and query the diskspace on the persistent volume. + Returns: list of errors + """ + errors = [] + for pod_name in pods_by_name.keys(): + df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) + disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json') + lines = disk_output.splitlines() + # expecting one header looking like 'IUse% Use%' and one body line + body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' + if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): + errors.append(OpenShiftCheckException( + 'BadDfResponse', + 'Could not retrieve storage usage from logging ES pod "{pod}".\n' + 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) + )) + continue + inode_pct, disk_pct = re.match(body_re, lines[1]).groups() + + inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90') + if int(inode_pct) >= int(inode_pct_thresh): + errors.append(OpenShiftCheckException( + 'InodeUsageTooHigh', + 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' + ' is {pct}, greater than threshold {limit}.\n' + ' Note: threshold can be specified in inventory with {param}'.format( + pod=pod_name, + pct=str(inode_pct), + limit=str(inode_pct_thresh), + param='openshift_check_efk_es_inode_pct', + ))) + disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80') + if int(disk_pct) >= int(disk_pct_thresh): + errors.append(OpenShiftCheckException( + 'DiskUsageTooHigh', + 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' + ' is {pct}, greater than threshold {limit}.\n' + ' Note: threshold can be specified in inventory with {param}'.format( + pod=pod_name, + pct=str(disk_pct), + limit=str(disk_pct_thresh), + param='openshift_check_efk_es_storage_pct', + ))) + + return errors diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py new file mode 100644 index 000000000..3b192a281 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -0,0 +1,154 @@ +"""Check for an aggregated logging Fluentd deployment""" + +import json + + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList +from openshift_checks.logging.logging import LoggingCheck + + +class Fluentd(LoggingCheck): + """Check for an aggregated logging Fluentd deployment""" + + name = "fluentd" + tags = ["health", "logging"] + + def run(self): + """Check the Fluentd deployment and raise an error if any problems are found.""" + + fluentd_pods = self.get_pods_for_component("fluentd") + self.check_fluentd(fluentd_pods) + return {} + + def check_fluentd(self, pods): + """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found.""" + + node_selector = self.get_var( + 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true' + ) + + nodes_by_name = self.get_nodes_by_name() + fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector) + + errors = [] + errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) + errors += self.check_nodes_have_fluentd(pods, fluentd_nodes) + errors += self.check_fluentd_pods_running(pods) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + errors.append(OpenShiftCheckException( + 'TooManyFluentdPods', + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + )) + + if errors: + raise OpenShiftCheckExceptionList(errors) + + def get_nodes_by_name(self): + """Retrieve all the node definitions. Returns: dict(name: node)""" + nodes_json = self.exec_oc("get nodes -o json", []) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + raise OpenShiftCheckException( + "BadOcNodeList", + "Could not obtain a list of nodes to validate fluentd.\n" + "Output from oc get:\n" + nodes_json + ) + if not nodes or not nodes.get('items'): # also should not happen + raise OpenShiftCheckException( + "NoNodesDefined", + "No nodes appear to be defined according to the API." + ) + return { + node['metadata']['name']: node + for node in nodes['items'] + } + + @staticmethod + def filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node)""" + label, value = node_selector.split('=', 1) + fluentd_nodes = { + name: node for name, node in nodes_by_name.items() + if node['metadata']['labels'].get(label) == value + } + if not fluentd_nodes: + raise OpenShiftCheckException( + 'NoNodesLabeled', + 'There are no nodes with the fluentd label {label}.\n' + 'This means no logs will be aggregated from the nodes.'.format(label=node_selector) + ) + return fluentd_nodes + + def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): + """Note if nodes are not labeled as expected. Returns: error list""" + intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all']) + if not intended_nodes or '--all' in intended_nodes: + intended_nodes = nodes_by_name.keys() + nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) + if nodes_missing_labels: + return [OpenShiftCheckException( + 'NodesUnlabeled', + 'The following nodes are supposed to be labeled with {label} but are not:\n' + ' {nodes}\n' + 'Fluentd will not aggregate logs from these nodes.'.format( + label=node_selector, nodes=', '.join(nodes_missing_labels) + ))] + + return [] + + @staticmethod + def check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error list""" + unmatched_nodes = fluentd_nodes.copy() + node_names_by_label = { + node['metadata']['labels']['kubernetes.io/hostname']: name + for name, node in fluentd_nodes.items() + } + node_names_by_internal_ip = { + address['address']: name + for name, node in fluentd_nodes.items() + for address in node['status']['addresses'] + if address['type'] == "InternalIP" + } + for pod in pods: + for name in [ + pod['spec']['nodeName'], + node_names_by_internal_ip.get(pod['spec']['nodeName']), + node_names_by_label.get(pod.get('spec', {}).get('host')), + ]: + unmatched_nodes.pop(name, None) + if unmatched_nodes: + return [OpenShiftCheckException( + 'MissingFluentdPod', + 'The following nodes are supposed to have a Fluentd pod but do not:\n' + ' {nodes}\n' + 'These nodes will not have their logs aggregated.'.format( + nodes='\n '.join(unmatched_nodes.keys()) + ))] + + return [] + + def check_fluentd_pods_running(self, pods): + """Make sure all fluentd pods are running. Returns: error string""" + not_running = super(Fluentd, self).not_running_pods(pods) + if not_running: + return [OpenShiftCheckException( + 'FluentdNotRunning', + 'The following Fluentd pods are supposed to be running but are not:\n' + ' {pods}\n' + 'These pods will not aggregate logs from their nodes.'.format( + pods='\n'.join( + " {name} ({host})".format( + name=pod['metadata']['name'], + host=pod['spec'].get('host', 'None') + ) + for pod in not_running + ) + ))] + + return [] diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py new file mode 100644 index 000000000..e93cc9028 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -0,0 +1,131 @@ +""" +Module for performing checks on a Fluentd logging deployment configuration +""" + +from openshift_checks import OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck + + +class FluentdConfig(LoggingCheck): + """Module that checks logging configuration of an integrated logging Fluentd deployment""" + name = "fluentd_config" + tags = ["health"] + + def is_active(self): + logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) + + try: + version = self.get_major_minor_version(self.get_var("openshift_image_tag")) + except ValueError: + # if failed to parse OpenShift version, perform check anyway (if logging enabled) + return logging_deployed + + return logging_deployed and version < (3, 6) + + def run(self): + """Check that Fluentd has running pods, and that its logging config matches Docker's logging config.""" + config_error = self.check_logging_config() + if config_error: + msg = ("The following Fluentd logging configuration problem was found:" + "\n{}".format(config_error)) + return {"failed": True, "msg": msg} + + return {} + + def check_logging_config(self): + """Ensure that the configured Docker logging driver matches fluentd settings. + This means that, at least for now, if the following condition is met: + + openshift_logging_fluentd_use_journal == True + + then the value of the configured Docker logging driver should be "journald". + Otherwise, the value of the Docker logging driver should be "json-file". + Returns an error string if the above condition is not met, or None otherwise.""" + use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True) + + # if check is running on a master, retrieve all running pods + # and check any pod's container for the env var "USE_JOURNAL" + group_names = self.get_var("group_names") + if "oo_masters_to_config" in group_names: + use_journald = self.check_fluentd_env_var() + + docker_info = self.execute_module("docker_info", {}) + try: + logging_driver = docker_info["info"]["LoggingDriver"] + except KeyError: + return "Unable to determine Docker logging driver." + + logging_driver = docker_info["info"]["LoggingDriver"] + recommended_logging_driver = "journald" + error = None + + # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver` + # option as an inventory file variable, or adding the log driver value as part of the + # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that + # can be passed to the Docker binary; the only other recommendation that can be made, would be + # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running + # individual containers. + if use_journald and logging_driver != "journald": + error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n' + 'This differs from your Docker configuration, which has been set to use "{driver}" ' + 'as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + elif not use_journald and logging_driver != "json-file": + recommended_logging_driver = "json-file" + error = ('Your Fluentd configuration is set to aggregate Docker container logs from ' + 'individual json log files per container.\n ' + 'This differs from your Docker configuration, which has been set to use ' + '"{driver}" as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + + if error: + error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n' + ' openshift_docker_options="--log-driver={driver}"\n\n' + 'Alternatively, you can add the following option to your Docker configuration, located in' + '"/etc/docker/daemon.json":\n\n' + '{{ "log-driver": "{driver}" }}\n\n' + 'See https://docs.docker.com/engine/admin/logging/json-file ' + 'for more information.').format(driver=recommended_logging_driver) + + return error + + def check_fluentd_env_var(self): + """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod.""" + running_pods = self.running_fluentd_pods() + + try: + pod_containers = running_pods[0]["spec"]["containers"] + except KeyError: + return "Unable to detect running containers on selected Fluentd pod." + + if not pod_containers: + msg = ('There are no running containers on selected Fluentd pod "{}".\n' + 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", "")) + raise OpenShiftCheckException(msg) + + pod_env = pod_containers[0].get("env") + if not pod_env: + msg = ('There are no environment variables set on the Fluentd container "{}".\n' + 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name")) + raise OpenShiftCheckException(msg) + + for env in pod_env: + if env["name"] == "USE_JOURNAL": + return env.get("value", "false") != "false" + + return False + + def running_fluentd_pods(self): + """Return a list of running fluentd pods.""" + fluentd_pods = self.get_pods_for_component("fluentd") + + running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running'] + if not running_fluentd_pods: + raise OpenShiftCheckException( + 'No Fluentd pods were found to be in the "Running" state. ' + 'At least one Fluentd pod is required in order to perform this check.' + ) + + return running_fluentd_pods diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py new file mode 100644 index 000000000..3b1cf8baa --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -0,0 +1,226 @@ +""" +Module for performing checks on a Kibana logging deployment +""" + +import json +import ssl + +try: + from urllib2 import HTTPError, URLError + import urllib2 +except ImportError: + from urllib.error import HTTPError, URLError + import urllib.request as urllib2 + +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException + + +class Kibana(LoggingCheck): + """Module that checks an integrated logging Kibana deployment""" + + name = "kibana" + tags = ["health", "logging"] + + def run(self): + """Check various things and gather errors. Returns: result as hash""" + + kibana_pods = self.get_pods_for_component("kibana") + self.check_kibana(kibana_pods) + self.check_kibana_route() + # TODO(lmeyer): run it all again for the ops cluster + + return {} + + def _verify_url_internal(self, url): + """ + Try to reach a URL from the host. + Returns: success (bool), reason (for failure) + """ + args = dict( + url=url, + follow_redirects='none', + validate_certs='no', # likely to be signed with internal CA + # TODO(lmeyer): give users option to validate certs + status_code=302, + ) + result = self.execute_module('uri', args) + if result.get('failed'): + return result['msg'] + return None + + @staticmethod + def _verify_url_external(url): + """ + Try to reach a URL from ansible control host. + Raise an OpenShiftCheckException if anything goes wrong. + """ + # This actually checks from the ansible control host, which may or may not + # really be "external" to the cluster. + + # Disable SSL cert validation to work around internally signed certs + ctx = ssl.create_default_context() + ctx.check_hostname = False # or setting CERT_NONE is refused + ctx.verify_mode = ssl.CERT_NONE + + # Verify that the url is returning a valid response + try: + # We only care if the url connects and responds + return_code = urllib2.urlopen(url, context=ctx).getcode() + except HTTPError as httperr: + return httperr.reason + except URLError as urlerr: + return str(urlerr) + + # there appears to be no way to prevent urlopen from following redirects + if return_code != 200: + return 'Expected success (200) but got return code {}'.format(int(return_code)) + + return None + + def check_kibana(self, pods): + """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not.""" + + if not pods: + raise OpenShiftCheckException( + "MissingComponentPods", + "There are no Kibana pods deployed, so no access to the logging UI." + ) + + not_running = self.not_running_pods(pods) + if len(not_running) == len(pods): + raise OpenShiftCheckException( + "NoRunningPods", + "No Kibana pod is in a running state, so there is no access to the logging UI." + ) + elif not_running: + raise OpenShiftCheckException( + "PodNotRunning", + "The following Kibana pods are not currently in a running state:\n" + " {pods}\n" + "However at least one is, so service may not be impacted.".format( + pods="\n ".join(pod['metadata']['name'] for pod in not_running) + ) + ) + + def _get_kibana_url(self): + """ + Get kibana route or report error. + Returns: url + """ + + # Get logging url + get_route = self.exec_oc("get route logging-kibana -o json", []) + if not get_route: + raise OpenShiftCheckException( + 'no_route_exists', + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ) + + try: + route = json.loads(get_route) + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + except (ValueError, KeyError): + raise OpenShiftCheckException( + 'get_route_failed', + '"oc get route" returned an unexpected response:\n' + get_route + ) + + # ingress can be null if there is no router, or empty if not routed + if not ingress or not ingress[0]: + raise OpenShiftCheckException( + 'route_not_accepted', + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ) + + host = route.get("spec", {}).get("host") + if not host: + raise OpenShiftCheckException( + 'route_missing_host', + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ) + + return 'https://{}/'.format(host) + + def check_kibana_route(self): + """ + Check to see if kibana route is up and working. + Raises exception if not. + """ + + kibana_url = self._get_kibana_url() + + # first, check that kibana is reachable from the master. + error = self._verify_url_internal(kibana_url) + if error: + if 'urlopen error [Errno 111] Connection refused' in error: + raise OpenShiftCheckException( + 'FailedToConnectInternal', + 'Failed to connect from this master to Kibana URL {url}\n' + 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url) + ) + elif 'urlopen error [Errno -2] Name or service not known' in error: + raise OpenShiftCheckException( + 'FailedToResolveInternal', + 'Failed to connect from this master to Kibana URL {url}\n' + 'because the hostname does not resolve.\n' + 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url) + ) + elif 'Status code was not' in error: + raise OpenShiftCheckException( + 'WrongReturnCodeInternal', + 'A request from this master to the Kibana URL {url}\n' + 'did not return the correct status code (302).\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues. The output was:\n' + ' {error}'.format(url=kibana_url, error=error) + ) + raise OpenShiftCheckException( + 'MiscRouteErrorInternal', + 'Error validating the logging Kibana route internally:\n' + error + ) + + # in production we would like the kibana route to work from outside the + # cluster too; but that may not be the case, so allow disabling just this part. + if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true": + return + error = self._verify_url_external(kibana_url) + + if not error: + return + + error_fmt = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set the variable:\n' + ' openshift_check_efk_kibana_external=False' + ) + if 'urlopen error [Errno 111] Connection refused' in error: + msg = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg)) + elif 'urlopen error [Errno -2] Name or service not known' in error: + msg = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg)) + elif 'Expected success (200)' in error: + msg = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg)) + raise OpenShiftCheckException( + 'MiscRouteError', + 'Error validating the logging Kibana route externally:\n' + error + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py new file mode 100644 index 000000000..05ba73ca1 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -0,0 +1,101 @@ +""" +Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack +""" + +import json +import os + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +class MissingComponentPods(OpenShiftCheckException): + """Raised when a component has no pods in the namespace.""" + pass + + +class CouldNotUseOc(OpenShiftCheckException): + """Raised when ocutil has a failure running oc.""" + pass + + +class LoggingCheck(OpenShiftCheck): + """Base class for OpenShift aggregated logging component checks""" + + # FIXME: this should not be listed as a check, since it is not meant to be + # run by itself. + + name = "logging" + + def is_active(self): + logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False) + return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master() + + def run(self): + return {} + + def get_pods_for_component(self, logging_component): + """Get all pods for a given component. Returns: list of pods.""" + pod_output = self.exec_oc( + "get pods -l component={} -o json".format(logging_component), + [], + ) + try: + pods = json.loads(pod_output) # raises ValueError if deserialize fails + if not pods or not pods.get('items'): # also a broken response, treat the same + raise ValueError() + except ValueError: + # successful run but non-parsing data generally means there were no pods to be found + raise MissingComponentPods( + 'There are no "{}" component pods in the "{}" namespace.\n' + 'Is logging deployed?'.format(logging_component, self.logging_namespace()) + ) + + return pods['items'] + + @staticmethod + def not_running_pods(pods): + """Returns: list of pods not in a ready and running state""" + return [ + pod for pod in pods + if not pod.get("status", {}).get("containerStatuses") or any( + container['ready'] is False + for container in pod['status']['containerStatuses'] + ) or not any( + condition['type'] == 'Ready' and condition['status'] == 'True' + for condition in pod['status'].get('conditions', []) + ) + ] + + def logging_namespace(self): + """Returns the namespace in which logging is configured to deploy.""" + return self.get_var("openshift_logging_namespace", default="logging") + + def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None): + """ + Execute an 'oc' command in the remote host. + Returns: output of command and namespace, + or raises CouldNotUseOc on error + """ + config_base = self.get_var("openshift", "common", "config_base") + args = { + "namespace": self.logging_namespace(), + "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), + "cmd": cmd_str, + "extra_args": list(extra_args) if extra_args else [], + } + + result = self.execute_module("ocutil", args, save_as_name=save_as_name) + if result.get("failed"): + if result['result'] == '[Errno 2] No such file or directory': + raise CouldNotUseOc( + "This host is supposed to be a master but does not have the `oc` command where expected.\n" + "Has an installation been run on this host yet?" + ) + + raise CouldNotUseOc( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}'.format(cmd=args['cmd'], error=result['result']) + ) + + return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py new file mode 100644 index 000000000..cacdf4213 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -0,0 +1,129 @@ +""" +Check for ensuring logs from pods can be queried in a reasonable amount of time. +""" + +import json +import time + +from uuid import uuid4 + +from openshift_checks import OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck + + +ES_CMD_TIMEOUT_SECONDS = 30 + + +class LoggingIndexTime(LoggingCheck): + """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time.""" + name = "logging_index_time" + tags = ["health", "logging"] + + def run(self): + """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.""" + try: + log_index_timeout = int( + self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS) + ) + except ValueError: + raise OpenShiftCheckException( + 'InvalidTimeout', + 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' + 'Value must be an integer representing an amount in seconds.' + ) + + running_component_pods = dict() + + # get all component pods + for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): + pods = self.get_pods_for_component(component) + running_pods = self.running_pods(pods) + + if not running_pods: + raise OpenShiftCheckException( + component + 'NoRunningPods', + 'No {} pods in the "Running" state were found.' + 'At least one pod is required in order to perform this check.'.format(name) + ) + + running_component_pods[component] = running_pods + + uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0]) + self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout) + return {} + + def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs): + """Retry an Elasticsearch query every second until query success, or a defined + length of time has passed.""" + deadline = time.time() + timeout_secs + interval = 1 + while not self.query_es_from_es(es_pod, uuid): + if time.time() + interval > deadline: + raise OpenShiftCheckException( + "NoMatchFound", + "expecting match in Elasticsearch for message with uuid {}, " + "but no matches were found after {}s.".format(uuid, timeout_secs) + ) + time.sleep(interval) + + def curl_kibana_with_uuid(self, kibana_pod): + """curl Kibana with a unique uuid.""" + uuid = self.generate_uuid() + pod_name = kibana_pod["metadata"]["name"] + exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" + exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) + + error_str = self.exec_oc(exec_cmd, []) + + try: + error_code = json.loads(error_str)["statusCode"] + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'kibanaInvalidResponse', + 'invalid response returned from Kibana request:\n' + 'Command: {}\nResponse: {}'.format(exec_cmd, error_str) + ) + + if error_code != 404: + raise OpenShiftCheckException( + 'kibanaInvalidReturnCode', + 'invalid error code returned from Kibana request.\n' + 'Expecting error code "404", but got "{}" instead.'.format(error_code) + ) + + return uuid + + def query_es_from_es(self, es_pod, uuid): + """curl the Elasticsearch pod and look for a unique uuid in its logs.""" + pod_name = es_pod["metadata"]["name"] + exec_cmd = ( + "exec {pod_name} -- curl --max-time 30 -s -f " + "--cacert /etc/elasticsearch/secret/admin-ca " + "--cert /etc/elasticsearch/secret/admin-cert " + "--key /etc/elasticsearch/secret/admin-key " + "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" + ) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) + result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json") + + try: + count = json.loads(result)["count"] + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'esInvalidResponse', + 'Invalid response from Elasticsearch query:\n' + ' {}\n' + 'Response was:\n{}'.format(exec_cmd, result) + ) + + return count + + @staticmethod + def running_pods(pods): + """Filter pods that are running.""" + return [pod for pod in pods if pod['status']['phase'] == 'Running'] + + @staticmethod + def generate_uuid(): + """Wrap uuid generator. Allows for testing with expected values.""" + return str(uuid4()) diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py new file mode 100644 index 000000000..e7a8ec976 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py @@ -0,0 +1,50 @@ +"""Check that recommended memory is available.""" +from openshift_checks import OpenShiftCheck + +MIB = 2**20 +GIB = 2**30 + + +class MemoryAvailability(OpenShiftCheck): + """Check that recommended memory is available.""" + + name = "memory_availability" + tags = ["preflight"] + + # Values taken from the official installation documentation: + # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements + recommended_memory_bytes = { + "oo_masters_to_config": 16 * GIB, + "oo_nodes_to_config": 8 * GIB, + "oo_etcd_to_config": 8 * GIB, + } + # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal + memtotal_adjustment = 1 * GIB + + def is_active(self): + """Skip hosts that do not have recommended memory requirements.""" + group_names = self.get_var("group_names", default=[]) + has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes)) + return super(MemoryAvailability, self).is_active() and has_memory_recommendation + + def run(self): + group_names = self.get_var("group_names") + total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB + + recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names) + configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB + min_memory_bytes = configured_min or recommended_min + + if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes: + return { + 'failed': True, + 'msg': ( + 'Available memory ({available:.1f} GiB) is too far ' + 'below recommended value ({recommended:.1f} GiB)' + ).format( + available=float(total_memory_bytes) / GIB, + recommended=float(min_memory_bytes) / GIB, + ), + } + + return {} diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py new file mode 100644 index 000000000..cfbdea303 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -0,0 +1,54 @@ +""" +Mixin classes meant to be used with subclasses of OpenShiftCheck. +""" + + +class NotContainerizedMixin(object): + """Mixin for checks that are only active when not in containerized mode.""" + # permanent # pylint: disable=too-few-public-methods + # Reason: The mixin is not intended to stand on its own as a class. + + def is_active(self): + """Only run on non-containerized hosts.""" + is_containerized = self.get_var("openshift", "common", "is_containerized") + return super(NotContainerizedMixin, self).is_active() and not is_containerized + + +class DockerHostMixin(object): + """Mixin for checks that are only active on hosts that require Docker.""" + + dependencies = [] + + def is_active(self): + """Only run on hosts that depend on Docker.""" + group_names = set(self.get_var("group_names", default=[])) + needs_docker = set(["oo_nodes_to_config"]) + if self.get_var("openshift.common.is_containerized"): + needs_docker.update(["oo_masters_to_config", "oo_etcd_to_config"]) + return super(DockerHostMixin, self).is_active() and bool(group_names.intersection(needs_docker)) + + def ensure_dependencies(self): + """ + Ensure that docker-related packages exist, but not on atomic hosts + (which would not be able to install but should already have them). + Returns: msg, failed + """ + if self.get_var("openshift", "common", "is_atomic"): + return "", False + + # NOTE: we would use the "package" module but it's actually an action plugin + # and it's not clear how to invoke one of those. This is about the same anyway: + result = self.execute_module_with_retries( + self.get_var("ansible_pkg_mgr", default="yum"), + {"name": self.dependencies, "state": "present"}, + ) + msg = result.get("msg", "") + if result.get("failed"): + if "No package matching" in msg: + msg = "Ensure that all required dependencies can be installed via `yum`.\n" + msg = ( + "Unable to install required packages on this host:\n" + " {deps}\n{msg}" + ).format(deps=',\n '.join(self.dependencies), msg=msg) + failed = result.get("failed", False) or result.get("rc", 0) != 0 + return msg, failed diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py new file mode 100644 index 000000000..416805c4d --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py @@ -0,0 +1,54 @@ +""" +Ansible module for determining if an installed version of Open vSwitch is incompatible with the +currently installed version of OpenShift. +""" + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks.mixins import NotContainerizedMixin + + +class OvsVersion(NotContainerizedMixin, OpenShiftCheck): + """Check that packages in a package_list are installed on the host + and are the correct version as determined by an OpenShift installation. + """ + + name = "ovs_version" + tags = ["health"] + + openshift_to_ovs_version = { + "3.6": ["2.6", "2.7"], + "3.5": ["2.6", "2.7"], + "3.4": "2.4", + } + + def is_active(self): + """Skip hosts that do not have package requirements.""" + group_names = self.get_var("group_names", default=[]) + master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names + return super(OvsVersion, self).is_active() and master_or_node + + def run(self): + args = { + "package_list": [ + { + "name": "openvswitch", + "version": self.get_required_ovs_version(), + }, + ], + } + return self.execute_module("rpm_version", args) + + def get_required_ovs_version(self): + """Return the correct Open vSwitch version for the current OpenShift version""" + openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag")) + + if openshift_version_tuple < (3, 5): + return self.openshift_to_ovs_version["3.4"] + + openshift_version = ".".join(str(x) for x in openshift_version_tuple) + ovs_version = self.openshift_to_ovs_version.get(openshift_version) + if ovs_version: + return self.openshift_to_ovs_version[openshift_version] + + msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(openshift_version)) diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py new file mode 100644 index 000000000..090e438ff --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/package_availability.py @@ -0,0 +1,72 @@ +"""Check that required RPM packages are available.""" + +from openshift_checks import OpenShiftCheck +from openshift_checks.mixins import NotContainerizedMixin + + +class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): + """Check that required RPM packages are available.""" + + name = "package_availability" + tags = ["preflight"] + + def is_active(self): + """Run only when yum is the package manager as the code is specific to it.""" + return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum" + + def run(self): + rpm_prefix = self.get_var("openshift", "common", "service_type") + group_names = self.get_var("group_names", default=[]) + + packages = set() + + if "oo_masters_to_config" in group_names: + packages.update(self.master_packages(rpm_prefix)) + if "oo_nodes_to_config" in group_names: + packages.update(self.node_packages(rpm_prefix)) + + args = {"packages": sorted(set(packages))} + return self.execute_module_with_retries("check_yum_update", args) + + @staticmethod + def master_packages(rpm_prefix): + """Return a list of RPMs that we expect a master install to have available.""" + return [ + "{rpm_prefix}".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-master".format(rpm_prefix=rpm_prefix), + "bash-completion", + "cockpit-bridge", + "cockpit-docker", + "cockpit-system", + "cockpit-ws", + "etcd", + "httpd-tools", + ] + + @staticmethod + def node_packages(rpm_prefix): + """Return a list of RPMs that we expect a node install to have available.""" + return [ + "{rpm_prefix}".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-node".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-sdn-ovs".format(rpm_prefix=rpm_prefix), + "bind", + "ceph-common", + "dnsmasq", + "docker", + "firewalld", + "flannel", + "glusterfs-fuse", + "iptables-services", + "iptables", + "iscsi-initiator-utils", + "libselinux-python", + "nfs-utils", + "ntp", + "openssl", + "pyparted", + "python-httplib2", + "PyYAML", + "yum-utils", + ] diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py new file mode 100644 index 000000000..8464e8a5e --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/package_update.py @@ -0,0 +1,14 @@ +"""Check that a yum update would not run into conflicts with available packages.""" +from openshift_checks import OpenShiftCheck +from openshift_checks.mixins import NotContainerizedMixin + + +class PackageUpdate(NotContainerizedMixin, OpenShiftCheck): + """Check that a yum update would not run into conflicts with available packages.""" + + name = "package_update" + tags = ["preflight"] + + def run(self): + args = {"packages": []} + return self.execute_module_with_retries("check_yum_update", args) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py new file mode 100644 index 000000000..2f09b22fc --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -0,0 +1,127 @@ +"""Check that available RPM packages match the required versions.""" + +import re + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks.mixins import NotContainerizedMixin + + +class PackageVersion(NotContainerizedMixin, OpenShiftCheck): + """Check that available RPM packages match the required versions.""" + + name = "package_version" + tags = ["preflight"] + + # NOTE: versions outside those specified are mapped to least/greatest + openshift_to_ovs_version = { + (3, 4): "2.4", + (3, 5): ["2.6", "2.7"], + (3, 6): ["2.6", "2.7"], + } + + openshift_to_docker_version = { + (3, 1): "1.8", + (3, 2): "1.10", + (3, 3): "1.10", + (3, 4): "1.12", + (3, 5): "1.12", + (3, 6): "1.12", + } + + # map major OpenShift release versions across releases to a common major version + map_major_release_version = { + 1: 3, + } + + def is_active(self): + """Skip hosts that do not have package requirements.""" + group_names = self.get_var("group_names", default=[]) + master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names + return super(PackageVersion, self).is_active() and master_or_node + + def run(self): + rpm_prefix = self.get_var("openshift", "common", "service_type") + openshift_release = self.get_var("openshift_release", default='') + deployment_type = self.get_var("openshift_deployment_type") + check_multi_minor_release = deployment_type in ['openshift-enterprise'] + + args = { + "package_mgr": self.get_var("ansible_pkg_mgr"), + "package_list": [ + { + "name": "openvswitch", + "version": self.get_required_ovs_version(), + "check_multi": False, + }, + { + "name": "docker", + "version": self.get_required_docker_version(), + "check_multi": False, + }, + { + "name": "{}".format(rpm_prefix), + "version": openshift_release, + "check_multi": check_multi_minor_release, + }, + { + "name": "{}-master".format(rpm_prefix), + "version": openshift_release, + "check_multi": check_multi_minor_release, + }, + { + "name": "{}-node".format(rpm_prefix), + "version": openshift_release, + "check_multi": check_multi_minor_release, + }, + ], + } + + return self.execute_module_with_retries("aos_version", args) + + def get_required_ovs_version(self): + """Return the correct Open vSwitch version(s) for the current OpenShift version.""" + openshift_version = self.get_openshift_version_tuple() + + earliest = min(self.openshift_to_ovs_version) + latest = max(self.openshift_to_ovs_version) + if openshift_version < earliest: + return self.openshift_to_ovs_version[earliest] + if openshift_version > latest: + return self.openshift_to_ovs_version[latest] + + ovs_version = self.openshift_to_ovs_version.get(openshift_version) + if not ovs_version: + msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version))) + + return ovs_version + + def get_required_docker_version(self): + """Return the correct Docker version(s) for the current OpenShift version.""" + openshift_version = self.get_openshift_version_tuple() + + earliest = min(self.openshift_to_docker_version) + latest = max(self.openshift_to_docker_version) + if openshift_version < earliest: + return self.openshift_to_docker_version[earliest] + if openshift_version > latest: + return self.openshift_to_docker_version[latest] + + docker_version = self.openshift_to_docker_version.get(openshift_version) + if not docker_version: + msg = "There is no recommended version of Docker for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version))) + + return docker_version + + def get_openshift_version_tuple(self): + """Return received image tag as a normalized (X, Y) minor version tuple.""" + version = self.get_var("openshift_image_tag") + comps = [int(component) for component in re.findall(r'\d+', version)] + + if len(comps) < 2: + msg = "An invalid version of OpenShift was found for this host: {}" + raise OpenShiftCheckException(msg.format(version)) + + comps[0] = self.map_major_release_version.get(comps[0], comps[0]) + return tuple(comps[0:2]) diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py new file mode 100644 index 000000000..40ad27d5d --- /dev/null +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -0,0 +1,348 @@ +import pytest + +from ansible.playbook.play_context import PlayContext + +from openshift_health_check import ActionModule, resolve_checks +from openshift_health_check import copy_remote_file_to_dir, write_result_to_output_dir, write_to_output_file +from openshift_checks import OpenShiftCheckException, FileToSave + + +def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None, + run_logs=None, run_files=None, changed=False, get_var_return=None): + """Returns a new class that is compatible with OpenShiftCheck for testing.""" + + _name, _tags = name, tags + + class FakeCheck(object): + name = _name + tags = _tags or [] + + def __init__(self, **_): + self.changed = False + self.failures = [] + self.logs = run_logs or [] + self.files_to_save = run_files or [] + + def is_active(self): + if isinstance(is_active, Exception): + raise is_active + return is_active + + def run(self): + self.changed = changed + if run_exception is not None: + raise run_exception + return run_return + + def get_var(*args, **_): + return get_var_return + + def register_failure(self, exc): + self.failures.append(OpenShiftCheckException(str(exc))) + return + + return FakeCheck + + +# Fixtures + + +@pytest.fixture +def plugin(): + task = FakeTask('openshift_health_check', {'checks': ['fake_check']}) + plugin = ActionModule(task, None, PlayContext(), None, None, None) + return plugin + + +class FakeTask(object): + def __init__(self, action, args): + self.action = action + self.args = args + self.async = 0 + + +@pytest.fixture +def task_vars(): + return dict(openshift=dict(), ansible_host='unit-test-host') + + +# Assertion helpers + + +def failed(result, msg_has=None): + if msg_has is not None: + assert 'msg' in result + for term in msg_has: + assert term.lower() in result['msg'].lower() + return result.get('failed', False) + + +def changed(result): + return result.get('changed', False) + + +# tests whether task is skipped, not individual checks +def skipped(result): + return result.get('skipped', False) + + +# Tests + + +@pytest.mark.parametrize('task_vars', [ + None, + {}, +]) +def test_action_plugin_missing_openshift_facts(plugin, task_vars, monkeypatch): + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {}) + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) + result = plugin.run(tmp=None, task_vars=task_vars) + + assert failed(result, msg_has=['openshift_facts']) + + +def test_action_plugin_cannot_load_checks_with_the_same_name(plugin, task_vars, monkeypatch): + FakeCheck1 = fake_check('duplicate_name') + FakeCheck2 = fake_check('duplicate_name') + checks = [FakeCheck1, FakeCheck2] + monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks)) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert failed(result, msg_has=['duplicate', 'duplicate_name', 'FakeCheck']) + + +@pytest.mark.parametrize('is_active, skipped_reason', [ + (False, "Not active for this host"), + (Exception("borked"), "exception"), +]) +def test_action_plugin_skip_non_active_checks(is_active, skipped_reason, plugin, task_vars, monkeypatch): + checks = [fake_check(is_active=is_active)] + monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks)) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert result['checks']['fake_check'].get('skipped') + assert skipped_reason in result['checks']['fake_check'].get('skipped_reason') + assert not failed(result) + assert not changed(result) + assert not skipped(result) + + +@pytest.mark.parametrize('to_disable', [ + 'fake_check', + ['fake_check', 'spam'], + '*,spam,eggs', +]) +def test_action_plugin_skip_disabled_checks(to_disable, plugin, task_vars, monkeypatch): + checks = [fake_check('fake_check', is_active=True)] + monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks)) + + task_vars['openshift_disable_check'] = to_disable + result = plugin.run(tmp=None, task_vars=task_vars) + + assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Disabled by user request") + assert not failed(result) + assert not changed(result) + assert not skipped(result) + + +def test_action_plugin_run_list_checks(monkeypatch): + task = FakeTask('openshift_health_check', {'checks': []}) + plugin = ActionModule(task, None, PlayContext(), None, None, None) + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {}) + result = plugin.run() + + assert failed(result, msg_has="Available checks") + assert not changed(result) + assert not skipped(result) + + +def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch): + check_return_value = {'ok': 'test'} + check_class = fake_check(run_return=check_return_value, run_files=[None]) + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()}) + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert result['checks']['fake_check'] == check_return_value + assert not failed(result) + assert not changed(result) + assert not skipped(result) + + +def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch): + check_return_value = {'ok': 'test'} + check_class = fake_check(run_return=check_return_value, changed=True) + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()}) + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert result['checks']['fake_check'] == check_return_value + assert changed(result['checks']['fake_check']) + assert not failed(result) + assert changed(result) + assert not skipped(result) + + +def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch): + check_return_value = {'failed': True, 'msg': 'this is a failure'} + check_class = fake_check(run_return=check_return_value) + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()}) + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert result['checks']['fake_check'] == check_return_value + assert failed(result, msg_has=['failed']) + assert not changed(result) + assert not skipped(result) + + +@pytest.mark.parametrize('exc_class, expect_traceback', [ + (OpenShiftCheckException, False), + (Exception, True), +]) +def test_action_plugin_run_check_exception(plugin, task_vars, exc_class, expect_traceback, monkeypatch): + exception_msg = 'fake check has an exception' + run_exception = exc_class(exception_msg) + check_class = fake_check(run_exception=run_exception, changed=True) + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()}) + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert failed(result['checks']['fake_check'], msg_has=exception_msg) + assert expect_traceback == ("Traceback" in result['checks']['fake_check']['msg']) + assert failed(result, msg_has=['failed']) + assert changed(result['checks']['fake_check']) + assert changed(result) + assert not skipped(result) + + +def test_action_plugin_run_check_output_dir(plugin, task_vars, tmpdir, monkeypatch): + check_class = fake_check( + run_return={}, + run_logs=[('thing', 'note')], + run_files=[ + FileToSave('save.file', 'contents', None), + FileToSave('save.file', 'duplicate', None), + FileToSave('copy.file', None, 'foo'), # note: copy runs execute_module => exception + ], + ) + task_vars['openshift_checks_output_dir'] = str(tmpdir) + check_class.get_var = lambda self, name, **_: task_vars.get(name) + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()}) + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) + + plugin.run(tmp=None, task_vars=task_vars) + assert any(path.basename == task_vars['ansible_host'] for path in tmpdir.listdir()) + assert any(path.basename == 'fake_check.log.json' for path in tmpdir.visit()) + assert any(path.basename == 'save.file' for path in tmpdir.visit()) + assert any(path.basename == 'save.file.2' for path in tmpdir.visit()) + + +def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch): + monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {}) + + result = plugin.run(tmp=None, task_vars=task_vars) + + assert failed(result, msg_has=['unknown', 'name']) + assert not changed(result) + assert not skipped(result) + + +@pytest.mark.parametrize('names,all_checks,expected', [ + ([], [], set()), + ( + ['a', 'b'], + [ + fake_check('a'), + fake_check('b'), + ], + set(['a', 'b']), + ), + ( + ['a', 'b', '@group'], + [ + fake_check('from_group_1', ['group', 'another_group']), + fake_check('not_in_group', ['another_group']), + fake_check('from_group_2', ['preflight', 'group']), + fake_check('a'), + fake_check('b'), + ], + set(['a', 'b', 'from_group_1', 'from_group_2']), + ), +]) +def test_resolve_checks_ok(names, all_checks, expected): + assert resolve_checks(names, all_checks) == expected + + +@pytest.mark.parametrize('names,all_checks,words_in_exception', [ + ( + ['testA', 'testB'], + [], + ['check', 'name', 'testA', 'testB'], + ), + ( + ['@group'], + [], + ['tag', 'name', 'group'], + ), + ( + ['testA', 'testB', '@group'], + [], + ['check', 'name', 'testA', 'testB', 'tag', 'group'], + ), + ( + ['testA', 'testB', '@group'], + [ + fake_check('from_group_1', ['group', 'another_group']), + fake_check('not_in_group', ['another_group']), + fake_check('from_group_2', ['preflight', 'group']), + ], + ['check', 'name', 'testA', 'testB'], + ), +]) +def test_resolve_checks_failure(names, all_checks, words_in_exception): + with pytest.raises(Exception) as excinfo: + resolve_checks(names, all_checks) + for word in words_in_exception: + assert word in str(excinfo.value) + + +@pytest.mark.parametrize('give_output_dir, result, expect_file', [ + (False, None, False), + (True, dict(content="c3BhbQo=", encoding="base64"), True), + (True, dict(content="encoding error", encoding="base64"), False), + (True, dict(content="spam", no_encoding=None), True), + (True, dict(failed=True, msg="could not slurp"), False), +]) +def test_copy_remote_file_to_dir(give_output_dir, result, expect_file, tmpdir): + check = fake_check()() + check.execute_module = lambda *args, **_: result + copy_remote_file_to_dir(check, "remote_file", str(tmpdir) if give_output_dir else "", "local_file") + assert expect_file == any(path.basename == "local_file" for path in tmpdir.listdir()) + + +def test_write_to_output_exceptions(tmpdir, monkeypatch, capsys): + + class Spam(object): + def __str__(self): + raise Exception("break str") + + test = {1: object(), 2: Spam()} + test[3] = test + write_result_to_output_dir(str(tmpdir), test) + assert "Error writing" in test["output_files"] + + output_dir = tmpdir.join("eggs") + output_dir.write("spam") # so now it's not a dir + write_to_output_file(str(output_dir), "somefile", "somedata") + assert "Could not write" in capsys.readouterr()[1] + + monkeypatch.setattr("openshift_health_check.prepare_output_dir", lambda *_: False) + write_result_to_output_dir(str(tmpdir), test) + assert "Error creating" in test["output_files"] diff --git a/roles/openshift_health_checker/test/aos_version_test.py b/roles/openshift_health_checker/test/aos_version_test.py new file mode 100644 index 000000000..4100f6c70 --- /dev/null +++ b/roles/openshift_health_checker/test/aos_version_test.py @@ -0,0 +1,196 @@ +import pytest +import aos_version + +from collections import namedtuple +Package = namedtuple('Package', ['name', 'version']) + +expected_pkgs = { + "spam": { + "name": "spam", + "version": "3.2.1", + "check_multi": False, + }, + "eggs": { + "name": "eggs", + "version": "3.2.1", + "check_multi": False, + }, +} + + +@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [ + ( + # all found + [Package('spam', '3.2.1'), Package('eggs', '3.2.1')], + expected_pkgs, + ), + ( + # found with more specific version + [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')], + expected_pkgs, + ), + ( + [Package('ovs', '2.6'), Package('ovs', '2.4')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ), + ( + [Package('ovs', '2.7')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ), +]) +def test_check_precise_version_found(pkgs, expected_pkgs_dict): + aos_version._check_precise_version_found(pkgs, expected_pkgs_dict) + + +@pytest.mark.parametrize('pkgs,expect_not_found', [ + ( + [], + { + "spam": { + "name": "spam", + "version": "3.2.1", + "check_multi": False, + }, + "eggs": { + "name": "eggs", + "version": "3.2.1", + "check_multi": False, + } + }, # none found + ), + ( + [Package('spam', '3.2.1')], + { + "eggs": { + "name": "eggs", + "version": "3.2.1", + "check_multi": False, + } + }, # completely missing + ), + ( + [Package('spam', '3.2.1'), Package('eggs', '3.3.2')], + { + "eggs": { + "name": "eggs", + "version": "3.2.1", + "check_multi": False, + } + }, # not the right version + ), + ( + [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5')], + { + "spam": { + "name": "spam", + "version": "3.2.1", + "check_multi": False, + } + }, # eggs found with multiple versions + ), +]) +def test_check_precise_version_found_fail(pkgs, expect_not_found): + with pytest.raises(aos_version.PreciseVersionNotFound) as e: + aos_version._check_precise_version_found(pkgs, expected_pkgs) + assert list(expect_not_found.values()) == e.value.problem_pkgs + + +@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [ + ( + [], + expected_pkgs, + ), + ( + # more precise but not strictly higher + [Package('spam', '3.2.1.9')], + expected_pkgs, + ), + ( + [Package('ovs', '2.7')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ), +]) +def test_check_higher_version_found(pkgs, expected_pkgs_dict): + aos_version._check_higher_version_found(pkgs, expected_pkgs_dict) + + +@pytest.mark.parametrize('pkgs,expected_pkgs_dict,expect_higher', [ + ( + [Package('spam', '3.3')], + expected_pkgs, + ['spam-3.3'], # lower precision, but higher + ), + ( + [Package('spam', '3.2.1'), Package('eggs', '3.3.2')], + expected_pkgs, + ['eggs-3.3.2'], # one too high + ), + ( + [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')], + expected_pkgs, + ['eggs-3.4'], # multiple versions, one is higher + ), + ( + [Package('eggs', '3.2.1'), Package('eggs', '3.4'), Package('eggs', '3.3')], + expected_pkgs, + ['eggs-3.4'], # multiple versions, two are higher + ), + ( + [Package('ovs', '2.8')], + { + "ovs": { + "name": "ovs", + "version": ["2.6", "2.7"], + "check_multi": False, + } + }, + ['ovs-2.8'], + ), +]) +def test_check_higher_version_found_fail(pkgs, expected_pkgs_dict, expect_higher): + with pytest.raises(aos_version.FoundHigherVersion) as e: + aos_version._check_higher_version_found(pkgs, expected_pkgs_dict) + assert set(expect_higher) == set(e.value.problem_pkgs) + + +@pytest.mark.parametrize('pkgs', [ + [], + [Package('spam', '3.2.1')], + [Package('spam', '3.2.1'), Package('eggs', '3.2.2')], +]) +def test_check_multi_minor_release(pkgs): + aos_version._check_multi_minor_release(pkgs, expected_pkgs) + + +@pytest.mark.parametrize('pkgs,expect_to_flag_pkgs', [ + ( + [Package('spam', '3.2.1'), Package('spam', '3.3.2')], + ['spam'], + ), + ( + [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')], + ['eggs'], + ), +]) +def test_check_multi_minor_release_fail(pkgs, expect_to_flag_pkgs): + with pytest.raises(aos_version.FoundMultiRelease) as e: + aos_version._check_multi_minor_release(pkgs, expected_pkgs) + assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs) diff --git a/roles/openshift_health_checker/test/conftest.py b/roles/openshift_health_checker/test/conftest.py new file mode 100644 index 000000000..244a1f0fa --- /dev/null +++ b/roles/openshift_health_checker/test/conftest.py @@ -0,0 +1,12 @@ +import os +import sys + +# extend sys.path so that tests can import openshift_checks and action plugins +# from this role. +openshift_health_checker_path = os.path.dirname(os.path.dirname(__file__)) +sys.path[1:1] = [ + openshift_health_checker_path, + os.path.join(openshift_health_checker_path, 'action_plugins'), + os.path.join(openshift_health_checker_path, 'callback_plugins'), + os.path.join(openshift_health_checker_path, 'library'), +] diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py new file mode 100644 index 000000000..62c680b74 --- /dev/null +++ b/roles/openshift_health_checker/test/curator_test.py @@ -0,0 +1,57 @@ +import pytest + +from openshift_checks.logging.curator import Curator, OpenShiftCheckException + + +plain_curator_pod = { + "metadata": { + "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, + "name": "logging-curator-1", + }, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + "podIP": "10.10.10.10", + } +} + +not_running_curator_pod = { + "metadata": { + "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, + "name": "logging-curator-2", + }, + "status": { + "containerStatuses": [{"ready": False}], + "conditions": [{"status": "False", "type": "Ready"}], + "podIP": "10.10.10.10", + } +} + + +def test_get_curator_pods(): + check = Curator() + check.get_pods_for_component = lambda *_: [plain_curator_pod] + result = check.run() + assert "failed" not in result or not result["failed"] + + +@pytest.mark.parametrize('pods, expect_error', [ + ( + [], + 'MissingComponentPods', + ), + ( + [not_running_curator_pod], + 'CuratorNotRunning', + ), + ( + [plain_curator_pod, plain_curator_pod], + 'TooManyCurators', + ), +]) +def test_get_curator_pods_fail(pods, expect_error): + check = Curator() + check.get_pods_for_component = lambda *_: pods + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run() + assert excinfo.value.name == expect_error diff --git a/roles/openshift_health_checker/test/diagnostics_test.py b/roles/openshift_health_checker/test/diagnostics_test.py new file mode 100644 index 000000000..800889fa7 --- /dev/null +++ b/roles/openshift_health_checker/test/diagnostics_test.py @@ -0,0 +1,50 @@ +import pytest + +from openshift_checks.diagnostics import DiagnosticCheck, OpenShiftCheckException + + +@pytest.fixture() +def task_vars(): + return dict( + openshift=dict( + common=dict(config_base="/etc/origin/") + ) + ) + + +def test_module_succeeds(task_vars): + check = DiagnosticCheck(lambda *_: {"result": "success"}, task_vars) + check.is_first_master = lambda: True + assert check.is_active() + check.exec_diagnostic("spam") + assert not check.failures + + +def test_oc_not_there(task_vars): + def exec_module(*_): + return {"failed": True, "result": "[Errno 2] No such file or directory"} + + check = DiagnosticCheck(exec_module, task_vars) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.exec_diagnostic("spam") + assert excinfo.value.name == "OcNotFound" + + +def test_module_fails(task_vars): + def exec_module(*_): + return {"failed": True, "result": "something broke"} + + check = DiagnosticCheck(exec_module, task_vars) + check.exec_diagnostic("spam") + assert check.failures and check.failures[0].name == "OcDiagFailed" + + +def test_names_executed(task_vars): + task_vars["openshift_check_diagnostics"] = diagnostics = "ConfigContexts,spam,,eggs" + + def exec_module(module, args, *_): + assert "extra_args" in args + assert args["extra_args"][0] in diagnostics + return {"result": "success"} + + DiagnosticCheck(exec_module, task_vars).run() diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py new file mode 100644 index 000000000..29a325a17 --- /dev/null +++ b/roles/openshift_health_checker/test/disk_availability_test.py @@ -0,0 +1,249 @@ +import pytest + +from openshift_checks.disk_availability import DiskAvailability, OpenShiftCheckException + + +@pytest.mark.parametrize('group_names,is_active', [ + (['oo_masters_to_config'], True), + (['oo_nodes_to_config'], True), + (['oo_etcd_to_config'], True), + (['oo_masters_to_config', 'oo_nodes_to_config'], True), + (['oo_masters_to_config', 'oo_etcd_to_config'], True), + ([], False), + (['lb'], False), + (['nfs'], False), +]) +def test_is_active(group_names, is_active): + task_vars = dict( + group_names=group_names, + ) + assert DiskAvailability(None, task_vars).is_active() == is_active + + +@pytest.mark.parametrize('desc, ansible_mounts, expect_chunks', [ + ( + 'empty ansible_mounts', + [], + ['determine mount point', 'none'], + ), + ( + 'missing relevant mount paths', + [{'mount': '/mnt'}], + ['determine mount point', '/mnt'], + ), + ( + 'missing size_available', + [{'mount': '/var'}, {'mount': '/usr'}, {'mount': '/tmp'}], + ['missing', 'size_available'], + ), +]) +def test_cannot_determine_available_disk(desc, ansible_mounts, expect_chunks): + task_vars = dict( + group_names=['oo_masters_to_config'], + ansible_mounts=ansible_mounts, + ) + + with pytest.raises(OpenShiftCheckException) as excinfo: + DiskAvailability(fake_execute_module, task_vars).run() + + for chunk in expect_chunks: + assert chunk in str(excinfo.value) + + +@pytest.mark.parametrize('group_names,configured_min,ansible_mounts', [ + ( + ['oo_masters_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 40 * 10**9 + 1, + }], + ), + ( + ['oo_nodes_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 15 * 10**9 + 1, + }], + ), + ( + ['oo_etcd_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 20 * 10**9 + 1, + }], + ), + ( + ['oo_etcd_to_config'], + 1, # configure lower threshold + [{ + 'mount': '/', + 'size_available': 1 * 10**9 + 1, # way smaller than recommended + }], + ), + ( + ['oo_etcd_to_config'], + 0, + [{ + # not enough space on / ... + 'mount': '/', + 'size_available': 2 * 10**9, + }, { + # ... but enough on /var + 'mount': '/var', + 'size_available': 20 * 10**9 + 1, + }], + ), +]) +def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansible_mounts): + task_vars = dict( + group_names=group_names, + openshift_check_min_host_disk_gb=configured_min, + ansible_mounts=ansible_mounts, + ) + + result = DiskAvailability(fake_execute_module, task_vars).run() + + assert not result.get('failed', False) + + +@pytest.mark.parametrize('name,group_names,configured_min,ansible_mounts,expect_chunks', [ + ( + 'test with no space available', + ['oo_masters_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 1, + }], + ['0.0 GB'], + ), + ( + 'test with a higher configured required value', + ['oo_masters_to_config'], + 100, # set a higher threshold + [{ + 'mount': '/', + 'size_available': 50 * 10**9, # would normally be enough... + }], + ['100.0 GB'], + ), + ( + 'test with 1GB available, but "0" GB space requirement', + ['oo_nodes_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 1 * 10**9, + }], + ['1.0 GB'], + ), + ( + 'test with no space available, but "0" GB space requirement', + ['oo_etcd_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 1, + }], + ['0.0 GB'], + ), + ( + 'test with enough space for a node, but not for a master', + ['oo_nodes_to_config', 'oo_masters_to_config'], + 0, + [{ + 'mount': '/', + 'size_available': 15 * 10**9 + 1, + }], + ['15.0 GB'], + ), + ( + 'test failure with enough space on "/", but not enough on "/var"', + ['oo_etcd_to_config'], + 0, + [{ + # enough space on / ... + 'mount': '/', + 'size_available': 20 * 10**9 + 1, + }, { + # .. but not enough on /var + 'mount': '/var', + 'size_available': 0, + }], + ['0.0 GB'], + ), +], ids=lambda argval: argval[0]) +def test_fails_with_insufficient_disk_space(name, group_names, configured_min, ansible_mounts, expect_chunks): + task_vars = dict( + group_names=group_names, + openshift_check_min_host_disk_gb=configured_min, + ansible_mounts=ansible_mounts, + ) + + check = DiskAvailability(fake_execute_module, task_vars) + check.run() + + assert check.failures + for chunk in 'below recommended'.split() + expect_chunks: + assert chunk in str(check.failures[0]) + + +@pytest.mark.parametrize('name,group_names,context,ansible_mounts,failed,extra_words', [ + ( + 'test without enough space for master under "upgrade" context', + ['oo_nodes_to_config', 'oo_masters_to_config'], + "upgrade", + [{ + 'mount': '/', + 'size_available': 1 * 10**9 + 1, + 'size_total': 21 * 10**9 + 1, + }], + True, + ["1.0 GB"], + ), + ( + 'test with enough space for master under "upgrade" context', + ['oo_nodes_to_config', 'oo_masters_to_config'], + "upgrade", + [{ + 'mount': '/', + 'size_available': 10 * 10**9 + 1, + 'size_total': 21 * 10**9 + 1, + }], + False, + [], + ), + ( + 'test with not enough space for master, and non-upgrade context', + ['oo_nodes_to_config', 'oo_masters_to_config'], + "health", + [{ + 'mount': '/', + # not enough space for a master, + # "health" context should not lower requirement + 'size_available': 20 * 10**9 + 1, + }], + True, + ["20.0 GB", "below minimum"], + ), +], ids=lambda argval: argval[0]) +def test_min_required_space_changes_with_upgrade_context(name, group_names, context, ansible_mounts, failed, extra_words): + task_vars = dict( + r_openshift_health_checker_playbook_context=context, + group_names=group_names, + ansible_mounts=ansible_mounts, + ) + + check = DiskAvailability(fake_execute_module, task_vars) + check.run() + + assert bool(check.failures) == failed + for word in extra_words: + assert word in str(check.failures[0]) + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py new file mode 100644 index 000000000..dec99e5db --- /dev/null +++ b/roles/openshift_health_checker/test/docker_image_availability_test.py @@ -0,0 +1,263 @@ +import pytest + +from openshift_checks.docker_image_availability import DockerImageAvailability + + +@pytest.fixture() +def task_vars(): + return dict( + openshift=dict( + common=dict( + service_type='origin', + is_containerized=False, + is_atomic=False, + ), + docker=dict(), + ), + openshift_deployment_type='origin', + openshift_image_tag='', + group_names=['oo_nodes_to_config', 'oo_masters_to_config'], + ) + + +@pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [ + ("invalid", True, [], False), + ("", True, [], False), + ("origin", False, [], False), + ("openshift-enterprise", False, [], False), + ("origin", False, ["oo_nodes_to_config", "oo_masters_to_config"], True), + ("openshift-enterprise", False, ["oo_etcd_to_config"], False), + ("origin", True, ["nfs"], False), + ("openshift-enterprise", True, ["lb"], False), +]) +def test_is_active(task_vars, deployment_type, is_containerized, group_names, expect_active): + task_vars['openshift_deployment_type'] = deployment_type + task_vars['openshift']['common']['is_containerized'] = is_containerized + task_vars['group_names'] = group_names + assert DockerImageAvailability(None, task_vars).is_active() == expect_active + + +@pytest.mark.parametrize("is_containerized,is_atomic", [ + (True, True), + (False, False), + (True, False), + (False, True), +]) +def test_all_images_available_locally(task_vars, is_containerized, is_atomic): + def execute_module(module_name, module_args, *_): + if module_name == "yum": + return {} + + assert module_name == "docker_image_facts" + assert 'name' in module_args + assert module_args['name'] + return { + 'images': [module_args['name']], + } + + task_vars['openshift']['common']['is_containerized'] = is_containerized + task_vars['openshift']['common']['is_atomic'] = is_atomic + result = DockerImageAvailability(execute_module, task_vars).run() + + assert not result.get('failed', False) + + +@pytest.mark.parametrize("available_locally", [ + False, + True, +]) +def test_all_images_available_remotely(task_vars, available_locally): + def execute_module(module_name, *_): + if module_name == 'docker_image_facts': + return {'images': [], 'failed': available_locally} + return {} + + task_vars['openshift_docker_additional_registries'] = ["docker.io", "registry.access.redhat.com"] + task_vars['openshift_image_tag'] = 'v3.4' + check = DockerImageAvailability(execute_module, task_vars) + check._module_retry_interval = 0 + result = check.run() + + assert not result.get('failed', False) + + +def test_all_images_unavailable(task_vars): + def execute_module(module_name=None, *args): + if module_name == "wait_for": + return {} + elif module_name == "command": + return {'failed': True} + + return {} # docker_image_facts failure + + task_vars['openshift_docker_additional_registries'] = ["docker.io"] + task_vars['openshift_deployment_type'] = "openshift-enterprise" + task_vars['openshift_image_tag'] = 'latest' + check = DockerImageAvailability(execute_module, task_vars) + check._module_retry_interval = 0 + actual = check.run() + + assert actual['failed'] + assert "required container images are not available" in actual['msg'] + + +@pytest.mark.parametrize("message,extra_words", [ + ( + "docker image update failure", + ["docker image update failure"], + ), + ( + "No package matching 'skopeo' found available, installed or updated", + ["dependencies can be installed via `yum`"] + ), +]) +def test_skopeo_update_failure(task_vars, message, extra_words): + def execute_module(module_name=None, *_): + if module_name == "yum": + return { + "failed": True, + "msg": message, + } + + return {} + + task_vars['openshift_docker_additional_registries'] = ["unknown.io"] + task_vars['openshift_deployment_type'] = "openshift-enterprise" + check = DockerImageAvailability(execute_module, task_vars) + check._module_retry_interval = 0 + actual = check.run() + + assert actual["failed"] + for word in extra_words: + assert word in actual["msg"] + + +@pytest.mark.parametrize( + "image, registries, connection_test_failed, skopeo_failed, " + "expect_success, expect_registries_reached", [ + ( + "spam/eggs:v1", ["test.reg"], + True, True, + False, + {"test.reg": False, "docker.io": False}, + ), + ( + "spam/eggs:v1", ["test.reg"], + False, True, + False, + {"test.reg": True, "docker.io": True}, + ), + ( + "eggs.reg/spam/eggs:v1", ["test.reg"], + False, False, + True, + {"eggs.reg": True}, + ), + ]) +def test_registry_availability(image, registries, connection_test_failed, skopeo_failed, + expect_success, expect_registries_reached): + def execute_module(module_name=None, *_): + if module_name == "wait_for": + return dict(msg="msg", failed=connection_test_failed) + elif module_name == "command": + return dict(msg="msg", failed=skopeo_failed) + + tv = task_vars() + tv.update({"openshift_docker_additional_registries": registries}) + check = DockerImageAvailability(execute_module, tv) + check._module_retry_interval = 0 + + available = check.is_available_skopeo_image(image) + assert available == expect_success + assert expect_registries_reached == check.reachable_registries + + +@pytest.mark.parametrize("deployment_type, is_containerized, groups, oreg_url, expected", [ + ( # standard set of stuff required on nodes + "origin", False, ['oo_nodes_to_config'], "", + set([ + 'openshift/origin-pod:vtest', + 'openshift/origin-deployer:vtest', + 'openshift/origin-docker-registry:vtest', + 'openshift/origin-haproxy-router:vtest', + 'cockpit/kubernetes', # origin version of registry-console + ]) + ), + ( # set a different URL for images + "origin", False, ['oo_nodes_to_config'], 'foo.io/openshift/origin-${component}:${version}', + set([ + 'foo.io/openshift/origin-pod:vtest', + 'foo.io/openshift/origin-deployer:vtest', + 'foo.io/openshift/origin-docker-registry:vtest', + 'foo.io/openshift/origin-haproxy-router:vtest', + 'cockpit/kubernetes', # AFAICS this is not built from the URL + ]) + ), + ( + "origin", True, ['oo_nodes_to_config', 'oo_masters_to_config', 'oo_etcd_to_config'], "", + set([ + # images running on top of openshift + 'openshift/origin-pod:vtest', + 'openshift/origin-deployer:vtest', + 'openshift/origin-docker-registry:vtest', + 'openshift/origin-haproxy-router:vtest', + 'cockpit/kubernetes', + # containerized component images + 'openshift/origin:vtest', + 'openshift/node:vtest', + 'openshift/openvswitch:vtest', + 'registry.access.redhat.com/rhel7/etcd', + ]) + ), + ( # enterprise images + "openshift-enterprise", True, ['oo_nodes_to_config'], 'foo.io/openshift3/ose-${component}:f13ac45', + set([ + 'foo.io/openshift3/ose-pod:f13ac45', + 'foo.io/openshift3/ose-deployer:f13ac45', + 'foo.io/openshift3/ose-docker-registry:f13ac45', + 'foo.io/openshift3/ose-haproxy-router:f13ac45', + # registry-console is not constructed/versioned the same as the others. + 'registry.access.redhat.com/openshift3/registry-console', + # containerized images aren't built from oreg_url + 'openshift3/node:vtest', + 'openshift3/openvswitch:vtest', + ]) + ), + ( + "openshift-enterprise", True, ['oo_etcd_to_config', 'lb'], 'foo.io/openshift3/ose-${component}:f13ac45', + set([ + 'registry.access.redhat.com/rhel7/etcd', + # lb does not yet come in a containerized version + ]) + ), + +]) +def test_required_images(deployment_type, is_containerized, groups, oreg_url, expected): + task_vars = dict( + openshift=dict( + common=dict( + is_containerized=is_containerized, + is_atomic=False, + ), + ), + openshift_deployment_type=deployment_type, + group_names=groups, + oreg_url=oreg_url, + openshift_image_tag='vtest', + ) + + assert expected == DockerImageAvailability(task_vars=task_vars).required_images() + + +def test_containerized_etcd(): + task_vars = dict( + openshift=dict( + common=dict( + is_containerized=True, + ), + ), + openshift_deployment_type="origin", + group_names=['oo_etcd_to_config'], + ) + expected = set(['registry.access.redhat.com/rhel7/etcd']) + assert expected == DockerImageAvailability(task_vars=task_vars).required_images() diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py new file mode 100644 index 000000000..8fa68c378 --- /dev/null +++ b/roles/openshift_health_checker/test/docker_storage_test.py @@ -0,0 +1,305 @@ +import pytest + +from openshift_checks import OpenShiftCheckException +from openshift_checks.docker_storage import DockerStorage + + +@pytest.mark.parametrize('is_containerized, group_names, is_active', [ + (False, ["oo_masters_to_config", "oo_etcd_to_config"], False), + (False, ["oo_masters_to_config", "oo_nodes_to_config"], True), + (True, ["oo_etcd_to_config"], True), +]) +def test_is_active(is_containerized, group_names, is_active): + task_vars = dict( + openshift=dict(common=dict(is_containerized=is_containerized)), + group_names=group_names, + ) + assert DockerStorage(None, task_vars).is_active() == is_active + + +def non_atomic_task_vars(): + return {"openshift": {"common": {"is_atomic": False}}} + + +@pytest.mark.parametrize('docker_info, failed, expect_msg', [ + ( + dict(failed=True, msg="Error connecting: Error while fetching server API version"), + True, + ["Is docker running on this host?"], + ), + ( + dict(msg="I have no info"), + True, + ["missing info"], + ), + ( + dict(info={ + "Driver": "devicemapper", + "DriverStatus": [("Pool Name", "docker-docker--pool")], + }), + False, + [], + ), + ( + dict(info={ + "Driver": "devicemapper", + "DriverStatus": [("Data loop file", "true")], + }), + True, + ["loopback devices with the Docker devicemapper storage driver"], + ), + ( + dict(info={ + "Driver": "overlay2", + "DriverStatus": [("Backing Filesystem", "xfs")], + }), + False, + [], + ), + ( + dict(info={ + "Driver": "overlay", + "DriverStatus": [("Backing Filesystem", "btrfs")], + }), + True, + ["storage is type 'btrfs'", "only supported with\n'xfs'"], + ), + ( + dict(info={ + "Driver": "overlay2", + "DriverStatus": [("Backing Filesystem", "xfs")], + "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)", + "KernelVersion": "3.10.0-327.22.2.el7.x86_64", + }), + True, + ["Docker reports kernel version 3.10.0-327"], + ), + ( + dict(info={ + "Driver": "overlay", + "DriverStatus": [("Backing Filesystem", "xfs")], + "OperatingSystem": "CentOS", + "KernelVersion": "3.10.0-514", + }), + False, + [], + ), + ( + dict(info={ + "Driver": "unsupported", + }), + True, + ["unsupported Docker storage driver"], + ), +]) +def test_check_storage_driver(docker_info, failed, expect_msg): + def execute_module(module_name, *_): + if module_name == "yum": + return {} + if module_name != "docker_info": + raise ValueError("not expecting module " + module_name) + return docker_info + + check = DockerStorage(execute_module, non_atomic_task_vars()) + check.check_dm_usage = lambda status: dict() # stub out for this test + check.check_overlay_usage = lambda info: dict() # stub out for this test + result = check.run() + + if failed: + assert result["failed"] + else: + assert not result.get("failed", False) + + for word in expect_msg: + assert word in result["msg"] + + +enough_space = { + "Pool Name": "docker--vg-docker--pool", + "Data Space Used": "19.92 MB", + "Data Space Total": "8.535 GB", + "Metadata Space Used": "40.96 kB", + "Metadata Space Total": "25.17 MB", +} + +not_enough_space = { + "Pool Name": "docker--vg-docker--pool", + "Data Space Used": "10 GB", + "Data Space Total": "10 GB", + "Metadata Space Used": "42 kB", + "Metadata Space Total": "43 kB", +} + + +@pytest.mark.parametrize('task_vars, driver_status, vg_free, success, expect_msg', [ + ( + {"max_thinpool_data_usage_percent": "not a float"}, + enough_space, + "12g", + False, + ["is not a percentage"], + ), + ( + {}, + {}, # empty values from driver status + "bogus", # also does not parse as bytes + False, + ["Could not interpret", "as bytes"], + ), + ( + {}, + enough_space, + "12.00g", + True, + [], + ), + ( + {}, + not_enough_space, + "0.00", + False, + ["data usage", "metadata usage", "higher than threshold"], + ), +]) +def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg): + check = DockerStorage(None, task_vars) + check.get_vg_free = lambda pool: vg_free + result = check.check_dm_usage(driver_status) + result_success = not result.get("failed") + + assert result_success is success + for msg in expect_msg: + assert msg in result["msg"] + + +@pytest.mark.parametrize('pool, command_returns, raises, returns', [ + ( + "foo-bar", + { # vgs missing + "msg": "[Errno 2] No such file or directory", + "failed": True, + "cmd": "/sbin/vgs", + "rc": 2, + }, + "Failed to run /sbin/vgs", + None, + ), + ( + "foo", # no hyphen in name - should not happen + {}, + "name does not have the expected format", + None, + ), + ( + "foo-bar", + dict(stdout=" 4.00g\n"), + None, + "4.00g", + ), + ( + "foo-bar", + dict(stdout="\n"), # no matching VG + "vgs did not find this VG", + None, + ) +]) +def test_vg_free(pool, command_returns, raises, returns): + def execute_module(module_name, *_): + if module_name != "command": + raise ValueError("not expecting module " + module_name) + return command_returns + + check = DockerStorage(execute_module) + if raises: + with pytest.raises(OpenShiftCheckException) as err: + check.get_vg_free(pool) + assert raises in str(err.value) + else: + ret = check.get_vg_free(pool) + assert ret == returns + + +@pytest.mark.parametrize('string, expect_bytes', [ + ("12", 12.0), + ("12 k", 12.0 * 1024), + ("42.42 MB", 42.42 * 1024**2), + ("12g", 12.0 * 1024**3), +]) +def test_convert_to_bytes(string, expect_bytes): + got = DockerStorage.convert_to_bytes(string) + assert got == expect_bytes + + +@pytest.mark.parametrize('string', [ + "bork", + "42 Qs", +]) +def test_convert_to_bytes_error(string): + with pytest.raises(ValueError) as err: + DockerStorage.convert_to_bytes(string) + assert "Cannot convert" in str(err.value) + assert string in str(err.value) + + +ansible_mounts_enough = [{ + 'mount': '/var/lib/docker', + 'size_available': 50 * 10**9, + 'size_total': 50 * 10**9, +}] +ansible_mounts_not_enough = [{ + 'mount': '/var/lib/docker', + 'size_available': 0, + 'size_total': 50 * 10**9, +}] +ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')] +ansible_mounts_zero_size = [{ + 'mount': '/var/lib/docker', + 'size_available': 0, + 'size_total': 0, +}] + + +@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [ + ( + ansible_mounts_enough, + None, + False, + [], + ), + ( + ansible_mounts_not_enough, + None, + True, + ["usage percentage", "higher than threshold"], + ), + ( + ansible_mounts_not_enough, + "bogus percent", + True, + ["is not a percentage"], + ), + ( + ansible_mounts_missing_fields, + None, + True, + ["Ansible bug"], + ), + ( + ansible_mounts_zero_size, + None, + True, + ["Ansible bug"], + ), +]) +def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg): + task_vars = non_atomic_task_vars() + task_vars["ansible_mounts"] = ansible_mounts + if threshold is not None: + task_vars["max_overlay_usage_percent"] = threshold + check = DockerStorage(None, task_vars) + docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay") + result = check.check_overlay_usage(docker_info) + + assert expect_fail == bool(result.get("failed")) + for msg in expect_msg: + assert msg in result["msg"] diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py new file mode 100644 index 000000000..3fa5e8929 --- /dev/null +++ b/roles/openshift_health_checker/test/elasticsearch_test.py @@ -0,0 +1,203 @@ +import pytest +import json + +from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList + + +task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) + + +def canned_elasticsearch(task_vars=None, exec_oc=None): + """Create an Elasticsearch check object with stubbed exec_oc method""" + check = Elasticsearch(None, task_vars or {}) + if exec_oc: + check.exec_oc = exec_oc + return check + + +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) + + +def pods_by_name(pods): + return {pod['metadata']['name']: pod for pod in pods} + + +plain_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es"}, + "name": "logging-es", + }, + "spec": {}, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + "containerStatuses": [{"ready": True}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "name logging-es", +} + +split_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-2"}, + "name": "logging-es-2", + }, + "spec": {}, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + "containerStatuses": [{"ready": True}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "name logging-es-2", +} + +unready_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-3"}, + "name": "logging-es-3", + }, + "spec": {}, + "status": { + "conditions": [{"status": "False", "type": "Ready"}], + "containerStatuses": [{"ready": False}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "BAD_NAME_RESPONSE", +} + + +def test_check_elasticsearch(): + with pytest.raises(OpenShiftCheckExceptionList) as excinfo: + canned_elasticsearch().check_elasticsearch([]) + assert_error_in_list('NoRunningPods', excinfo.value) + + # canned oc responses to match so all the checks pass + def exec_oc(cmd, args, **_): + if '_cat/master' in cmd: + return 'name logging-es' + elif '/_nodes' in cmd: + return json.dumps(es_node_list) + elif '_cluster/health' in cmd: + return '{"status": "green"}' + elif ' df ' in cmd: + return 'IUse% Use%\n 3% 4%\n' + else: + raise Exception(cmd) + + check = canned_elasticsearch({}, exec_oc) + check.get_pods_for_component = lambda *_: [plain_es_pod] + assert {} == check.run() + + +def test_check_running_es_pods(): + pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod]) + assert plain_es_pod in pods + assert_error_in_list('PodNotRunning', errors) + + +def test_check_elasticsearch_masters(): + pods = [plain_es_pod] + check = canned_elasticsearch(task_vars_config_base, lambda *args, **_: plain_es_pod['_test_master_name_str']) + assert not check.check_elasticsearch_masters(pods_by_name(pods)) + + +@pytest.mark.parametrize('pods, expect_error', [ + ( + [], + 'NoMasterFound', + ), + ( + [unready_es_pod], + 'NoMasterName', + ), + ( + [plain_es_pod, split_es_pod], + 'SplitBrainMasters', + ), +]) +def test_check_elasticsearch_masters_error(pods, expect_error): + test_pods = list(pods) + check = canned_elasticsearch(task_vars_config_base, lambda *args, **_: test_pods.pop(0)['_test_master_name_str']) + assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods))) + + +es_node_list = { + 'nodes': { + 'random-es-name': { + 'host': 'logging-es', + }}} + + +def test_check_elasticsearch_node_list(): + check = canned_elasticsearch(task_vars_config_base, lambda *args, **_: json.dumps(es_node_list)) + assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod])) + + +@pytest.mark.parametrize('pods, node_list, expect_error', [ + ( + [], + {}, + 'MissingComponentPods', + ), + ( + [plain_es_pod], + {}, # empty list of nodes triggers KeyError + 'MissingNodeList', + ), + ( + [split_es_pod], + es_node_list, + 'EsPodNodeMismatch', + ), +]) +def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error): + check = canned_elasticsearch(task_vars_config_base, lambda cmd, args, **_: json.dumps(node_list)) + assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods))) + + +def test_check_elasticsearch_cluster_health(): + test_health_data = [{"status": "green"}] + check = canned_elasticsearch(exec_oc=lambda *args, **_: json.dumps(test_health_data.pop(0))) + assert not check.check_es_cluster_health(pods_by_name([plain_es_pod])) + + +@pytest.mark.parametrize('pods, health_data, expect_error', [ + ( + [plain_es_pod], + [{"no-status": "should bomb"}], + 'BadEsResponse', + ), + ( + [plain_es_pod, split_es_pod], + [{"status": "green"}, {"status": "red"}], + 'EsClusterHealthRed', + ), +]) +def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error): + test_health_data = list(health_data) + check = canned_elasticsearch(exec_oc=lambda *args, **_: json.dumps(test_health_data.pop(0))) + assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods))) + + +def test_check_elasticsearch_diskspace(): + check = canned_elasticsearch(exec_oc=lambda *args, **_: 'IUse% Use%\n 3% 4%\n') + assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) + + +@pytest.mark.parametrize('disk_data, expect_error', [ + ( + 'df: /elasticsearch/persistent: No such file or directory\n', + 'BadDfResponse', + ), + ( + 'IUse% Use%\n 95% 40%\n', + 'InodeUsageTooHigh', + ), + ( + 'IUse% Use%\n 3% 94%\n', + 'DiskUsageTooHigh', + ), +]) +def test_check_elasticsearch_diskspace_errors(disk_data, expect_error): + check = canned_elasticsearch(exec_oc=lambda *args, **_: disk_data) + assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))) diff --git a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py new file mode 100644 index 000000000..d3aae98f2 --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py @@ -0,0 +1,329 @@ +import pytest + +from collections import namedtuple +from openshift_checks.etcd_imagedata_size import EtcdImageDataSize +from openshift_checks import OpenShiftCheckException +from etcdkeysize import check_etcd_key_size + + +def fake_etcd_client(root): + fake_nodes = dict() + fake_etcd_node(root, fake_nodes) + + clientclass = namedtuple("client", ["read"]) + return clientclass(lambda key, recursive: fake_etcd_result(fake_nodes[key])) + + +def fake_etcd_result(fake_node): + resultclass = namedtuple("result", ["leaves"]) + if not fake_node.dir: + return resultclass([fake_node]) + + return resultclass(fake_node.leaves) + + +def fake_etcd_node(node, visited): + min_req_fields = ["dir", "key"] + fields = list(node) + leaves = [] + + if node["dir"] and node.get("leaves"): + for leaf in node["leaves"]: + leaves.append(fake_etcd_node(leaf, visited)) + + if len(set(min_req_fields) - set(fields)) > 0: + raise ValueError("fake etcd nodes require at least {} fields.".format(min_req_fields)) + + if node.get("leaves"): + node["leaves"] = leaves + + nodeclass = namedtuple("node", fields) + nodeinst = nodeclass(**node) + visited[nodeinst.key] = nodeinst + + return nodeinst + + +@pytest.mark.parametrize('ansible_mounts,extra_words', [ + ([], ['none']), # empty ansible_mounts + ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths +]) +def test_cannot_determine_available_mountpath(ansible_mounts, extra_words): + task_vars = dict( + ansible_mounts=ansible_mounts, + ) + check = EtcdImageDataSize(fake_execute_module, task_vars) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run() + + for word in ['Unable to determine mount point'] + extra_words: + assert word in str(excinfo.value) + + +@pytest.mark.parametrize('ansible_mounts,tree,size_limit,should_fail,extra_words', [ + ( + # test that default image size limit evals to 1/2 * (total size in use) + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9, + }], + {"dir": False, "key": "/", "value": "1234"}, + None, + False, + [], + ), + ( + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 48 * 10**9, + }], + {"dir": False, "key": "/", "value": "1234"}, + None, + False, + [], + ), + ( + # set max size limit for image data to be below total node value + # total node value is defined as the sum of the value field + # from every node + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 48 * 10**9, + }], + {"dir": False, "key": "/", "value": "12345678"}, + 7, + True, + ["exceeds the maximum recommended limit", "0.00 GB"], + ), + ( + [{ + 'mount': '/', + 'size_available': 48 * 10**9 - 1, + 'size_total': 48 * 10**9, + }], + {"dir": False, "key": "/", "value": "1234"}, + None, + True, + ["exceeds the maximum recommended limit", "0.00 GB"], + ) +]) +def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size_limit, should_fail, extra_words): + def execute_module(module_name, module_args, *_): + if module_name != "etcdkeysize": + return { + "changed": False, + } + + client = fake_etcd_client(tree) + s, limit_exceeded = check_etcd_key_size(client, tree["key"], module_args["size_limit_bytes"]) + + return {"size_limit_exceeded": limit_exceeded} + + task_vars = dict( + etcd_max_image_data_size_bytes=size_limit, + ansible_mounts=ansible_mounts, + openshift=dict( + master=dict(etcd_hosts=["localhost"]), + common=dict(config_base="/var/lib/origin") + ) + ) + if size_limit is None: + task_vars.pop("etcd_max_image_data_size_bytes") + + check = EtcdImageDataSize(execute_module, task_vars).run() + + if should_fail: + assert check["failed"] + + for word in extra_words: + assert word in check["msg"] + else: + assert not check.get("failed", False) + + +@pytest.mark.parametrize('ansible_mounts,tree,root_path,expected_size,extra_words', [ + ( + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9, + }], + # test recursive size check on tree with height > 1 + { + "dir": True, + "key": "/", + "leaves": [ + {"dir": False, "key": "/foo1", "value": "1234"}, + {"dir": False, "key": "/foo2", "value": "1234"}, + {"dir": False, "key": "/foo3", "value": "1234"}, + {"dir": False, "key": "/foo4", "value": "1234"}, + { + "dir": True, + "key": "/foo5", + "leaves": [ + {"dir": False, "key": "/foo/bar1", "value": "56789"}, + {"dir": False, "key": "/foo/bar2", "value": "56789"}, + {"dir": False, "key": "/foo/bar3", "value": "56789"}, + { + "dir": True, + "key": "/foo/bar4", + "leaves": [ + {"dir": False, "key": "/foo/bar/baz1", "value": "123"}, + {"dir": False, "key": "/foo/bar/baz2", "value": "123"}, + ] + }, + ] + }, + ] + }, + "/", + 37, + [], + ), + ( + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9, + }], + # test correct sub-tree size calculation + { + "dir": True, + "key": "/", + "leaves": [ + {"dir": False, "key": "/foo1", "value": "1234"}, + {"dir": False, "key": "/foo2", "value": "1234"}, + {"dir": False, "key": "/foo3", "value": "1234"}, + {"dir": False, "key": "/foo4", "value": "1234"}, + { + "dir": True, + "key": "/foo5", + "leaves": [ + {"dir": False, "key": "/foo/bar1", "value": "56789"}, + {"dir": False, "key": "/foo/bar2", "value": "56789"}, + {"dir": False, "key": "/foo/bar3", "value": "56789"}, + { + "dir": True, + "key": "/foo/bar4", + "leaves": [ + {"dir": False, "key": "/foo/bar/baz1", "value": "123"}, + {"dir": False, "key": "/foo/bar/baz2", "value": "123"}, + ] + }, + ] + }, + ] + }, + "/foo5", + 21, + [], + ), + ( + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9, + }], + # test that a non-existing key is handled correctly + { + "dir": False, + "key": "/", + "value": "1234", + }, + "/missing", + 0, + [], + ), + ( + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9, + }], + # test etcd cycle handling + { + "dir": True, + "key": "/", + "leaves": [ + {"dir": False, "key": "/foo1", "value": "1234"}, + {"dir": False, "key": "/foo2", "value": "1234"}, + {"dir": False, "key": "/foo3", "value": "1234"}, + {"dir": False, "key": "/foo4", "value": "1234"}, + { + "dir": True, + "key": "/", + "leaves": [ + {"dir": False, "key": "/foo1", "value": "1"}, + ], + }, + ] + }, + "/", + 16, + [], + ), +]) +def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_path, expected_size, extra_words): + def execute_module(module_name, module_args, *_): + if module_name != "etcdkeysize": + return { + "changed": False, + } + + client = fake_etcd_client(tree) + size, limit_exceeded = check_etcd_key_size(client, root_path, module_args["size_limit_bytes"]) + + assert size == expected_size + return { + "size_limit_exceeded": limit_exceeded, + } + + task_vars = dict( + ansible_mounts=ansible_mounts, + openshift=dict( + master=dict(etcd_hosts=["localhost"]), + common=dict(config_base="/var/lib/origin") + ) + ) + + check = EtcdImageDataSize(execute_module, task_vars).run() + assert not check.get("failed", False) + + +def test_etcdkeysize_module_failure(): + def execute_module(module_name, *_): + if module_name != "etcdkeysize": + return { + "changed": False, + } + + return { + "rc": 1, + "module_stderr": "failure", + } + + task_vars = dict( + ansible_mounts=[{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9, + }], + openshift=dict( + master=dict(etcd_hosts=["localhost"]), + common=dict(config_base="/var/lib/origin") + ) + ) + + check = EtcdImageDataSize(execute_module, task_vars).run() + + assert check["failed"] + for word in "Failed to retrieve stats": + assert word in check["msg"] + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py new file mode 100644 index 000000000..dd6f4ad81 --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_traffic_test.py @@ -0,0 +1,72 @@ +import pytest + +from openshift_checks.etcd_traffic import EtcdTraffic + + +@pytest.mark.parametrize('group_names,version,is_active', [ + (['oo_masters_to_config'], "3.5", False), + (['oo_masters_to_config'], "3.6", False), + (['oo_nodes_to_config'], "3.4", False), + (['oo_etcd_to_config'], "3.4", True), + (['oo_etcd_to_config'], "1.5", True), + (['oo_etcd_to_config'], "3.1", False), + (['oo_masters_to_config', 'oo_nodes_to_config'], "3.5", False), + (['oo_masters_to_config', 'oo_etcd_to_config'], "3.5", True), + ([], "3.4", False), +]) +def test_is_active(group_names, version, is_active): + task_vars = dict( + group_names=group_names, + openshift_image_tag=version, + ) + assert EtcdTraffic(task_vars=task_vars).is_active() == is_active + + +@pytest.mark.parametrize('group_names,matched,failed,extra_words', [ + (["oo_masters_to_config"], True, True, ["Higher than normal", "traffic"]), + (["oo_masters_to_config", "oo_etcd_to_config"], False, False, []), + (["oo_etcd_to_config"], False, False, []), +]) +def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words): + def execute_module(module_name, *_): + return { + "matched": matched, + "failed": failed, + } + + task_vars = dict( + group_names=group_names, + openshift=dict( + common=dict(service_type="origin", is_containerized=False), + ) + ) + + result = EtcdTraffic(execute_module, task_vars).run() + + for word in extra_words: + assert word in result.get("msg", "") + + assert result.get("failed", False) == failed + + +@pytest.mark.parametrize('is_containerized,expected_unit_value', [ + (False, "etcd"), + (True, "etcd_container"), +]) +def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value): + task_vars = dict( + openshift=dict( + common=dict(is_containerized=is_containerized), + ) + ) + + def execute_module(module_name, args, *_): + assert module_name == "search_journalctl" + matchers = args["log_matchers"] + + for matcher in matchers: + assert matcher["unit"] == expected_unit_value + + return {"failed": False} + + EtcdTraffic(execute_module, task_vars).run() diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py new file mode 100644 index 000000000..077cea3ea --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_volume_test.py @@ -0,0 +1,147 @@ +import pytest + +from openshift_checks.etcd_volume import EtcdVolume +from openshift_checks import OpenShiftCheckException + + +@pytest.mark.parametrize('ansible_mounts,extra_words', [ + ([], ['none']), # empty ansible_mounts + ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths +]) +def test_cannot_determine_available_disk(ansible_mounts, extra_words): + task_vars = dict( + ansible_mounts=ansible_mounts, + ) + + with pytest.raises(OpenShiftCheckException) as excinfo: + EtcdVolume(fake_execute_module, task_vars).run() + + for word in ['Unable to determine mount point'] + extra_words: + assert word in str(excinfo.value) + + +@pytest.mark.parametrize('size_limit,ansible_mounts', [ + ( + # if no size limit is specified, expect max usage + # limit to default to 90% of size_total + None, + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9 + }], + ), + ( + 1, + [{ + 'mount': '/', + 'size_available': 30 * 10**9, + 'size_total': 30 * 10**9, + }], + ), + ( + 20000000000, + [{ + 'mount': '/', + 'size_available': 20 * 10**9, + 'size_total': 40 * 10**9, + }], + ), + ( + 5000000000, + [{ + # not enough space on / ... + 'mount': '/', + 'size_available': 0, + 'size_total': 0, + }, { + # not enough space on /var/lib ... + 'mount': '/var/lib', + 'size_available': 2 * 10**9, + 'size_total': 21 * 10**9, + }, { + # ... but enough on /var/lib/etcd + 'mount': '/var/lib/etcd', + 'size_available': 36 * 10**9, + 'size_total': 40 * 10**9 + }], + ) +]) +def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): + task_vars = dict( + etcd_device_usage_threshold_percent=size_limit, + ansible_mounts=ansible_mounts, + ) + + if task_vars["etcd_device_usage_threshold_percent"] is None: + task_vars.pop("etcd_device_usage_threshold_percent") + + result = EtcdVolume(fake_execute_module, task_vars).run() + + assert not result.get('failed', False) + + +@pytest.mark.parametrize('size_limit_percent,ansible_mounts,extra_words', [ + ( + # if no size limit is specified, expect max usage + # limit to default to 90% of size_total + None, + [{ + 'mount': '/', + 'size_available': 1 * 10**9, + 'size_total': 100 * 10**9, + }], + ['99.0%'], + ), + ( + 70.0, + [{ + 'mount': '/', + 'size_available': 1 * 10**6, + 'size_total': 5 * 10**9, + }], + ['100.0%'], + ), + ( + 40.0, + [{ + 'mount': '/', + 'size_available': 2 * 10**9, + 'size_total': 6 * 10**9, + }], + ['66.7%'], + ), + ( + None, + [{ + # enough space on /var ... + 'mount': '/var', + 'size_available': 20 * 10**9, + 'size_total': 20 * 10**9, + }, { + # .. but not enough on /var/lib + 'mount': '/var/lib', + 'size_available': 1 * 10**9, + 'size_total': 20 * 10**9, + }], + ['95.0%'], + ), +]) +def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words): + task_vars = dict( + etcd_device_usage_threshold_percent=size_limit_percent, + ansible_mounts=ansible_mounts, + ) + + if task_vars["etcd_device_usage_threshold_percent"] is None: + task_vars.pop("etcd_device_usage_threshold_percent") + + result = EtcdVolume(fake_execute_module, task_vars).run() + + assert result['failed'] + for word in extra_words: + assert word in result['msg'] + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') diff --git a/roles/openshift_health_checker/test/fluentd_config_test.py b/roles/openshift_health_checker/test/fluentd_config_test.py new file mode 100644 index 000000000..b5b4858d6 --- /dev/null +++ b/roles/openshift_health_checker/test/fluentd_config_test.py @@ -0,0 +1,348 @@ +import pytest + +from openshift_checks.logging.fluentd_config import FluentdConfig, OpenShiftCheckException + + +def canned_fluentd_pod(containers): + return { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": { + "host": "node1", + "nodeName": "node1", + "containers": containers, + }, + "status": { + "phase": "Running", + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } + } + + +fluentd_pod = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": { + "host": "node1", + "nodeName": "node1", + "containers": [ + { + "name": "container1", + "env": [ + { + "name": "USE_JOURNAL", + "value": "true", + } + ], + } + ], + }, + "status": { + "phase": "Running", + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +not_running_fluentd_pod = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-2", + }, + "status": { + "phase": "Unknown", + "containerStatuses": [{"ready": True}, {"ready": False}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + + +@pytest.mark.parametrize('name, use_journald, logging_driver, extra_words', [ + ( + 'test success with use_journald=false, and docker config set to use "json-file"', + False, + "json-file", + [], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_non_master(name, use_journald, logging_driver, extra_words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["oo_nodes_to_config", "oo_etcd_to_config"], + openshift_logging_fluentd_use_journal=use_journald, + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + error = check.check_logging_config() + + assert error is None + + +@pytest.mark.parametrize('name, use_journald, logging_driver, words', [ + ( + 'test failure with use_journald=false, but docker config set to use "journald"', + False, + "journald", + ['json log files', 'has been set to use "journald"'], + ), + ( + 'test failure with use_journald=false, but docker config set to use an "unsupported" driver', + False, + "unsupported", + ["json log files", 'has been set to use "unsupported"'], + ), + ( + 'test failure with use_journald=true, but docker config set to use "json-file"', + True, + "json-file", + ['logs from "journald"', 'has been set to use "json-file"'], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_non_master_failed(name, use_journald, logging_driver, words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["oo_nodes_to_config", "oo_etcd_to_config"], + openshift_logging_fluentd_use_journal=use_journald, + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + error = check.check_logging_config() + + assert error is not None + for word in words: + assert word in error + + +@pytest.mark.parametrize('name, pods, logging_driver, extra_words', [ + # use_journald returns false (not using journald), but check succeeds + # since docker is set to use json-file + ( + 'test success with use_journald=false, and docker config set to use default driver "json-file"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "USE_JOURNAL", + "value": "false", + }], + }, + ] + )], + "json-file", + [], + ), + ( + 'test success with USE_JOURNAL env var missing and docker config set to use default driver "json-file"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "RANDOM", + "value": "value", + }], + }, + ] + )], + "json-file", + [], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_master(name, pods, logging_driver, extra_words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["oo_masters_to_config"], + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + check.get_pods_for_component = lambda _: pods + error = check.check_logging_config() + + assert error is None + + +@pytest.mark.parametrize('name, pods, logging_driver, words', [ + ( + 'test failure with use_journald=false, but docker config set to use "journald"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "USE_JOURNAL", + "value": "false", + }], + }, + ] + )], + "journald", + ['json log files', 'has been set to use "journald"'], + ), + ( + 'test failure with use_journald=true, but docker config set to use "json-file"', + [fluentd_pod], + "json-file", + ['logs from "journald"', 'has been set to use "json-file"'], + ), + ( + 'test failure with use_journald=false, but docker set to use an "unsupported" driver', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "USE_JOURNAL", + "value": "false", + }], + }, + ] + )], + "unsupported", + ["json log files", 'has been set to use "unsupported"'], + ), + ( + 'test failure with USE_JOURNAL env var missing and docker config set to use "journald"', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [{ + "name": "RANDOM", + "value": "value", + }], + }, + ] + )], + "journald", + ["configuration is set to", "json log files"], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_master_failed(name, pods, logging_driver, words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["oo_masters_to_config"], + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.execute_module = execute_module + check.get_pods_for_component = lambda _: pods + error = check.check_logging_config() + + assert error is not None + for word in words: + assert word in error + + +@pytest.mark.parametrize('name, pods, response, logging_driver, extra_words', [ + ( + 'test OpenShiftCheckException with no running containers', + [canned_fluentd_pod([])], + { + "failed": True, + "result": "unexpected", + }, + "json-file", + ['no running containers'], + ), + ( + 'test OpenShiftCheckException one container and no env vars set', + [canned_fluentd_pod( + [ + { + "name": "container1", + "env": [], + }, + ] + )], + { + "failed": True, + "result": "unexpected", + }, + "json-file", + ['no environment variables'], + ), +], ids=lambda argvals: argvals[0]) +def test_check_logging_config_master_fails_on_unscheduled_deployment(name, pods, response, logging_driver, extra_words): + def execute_module(module_name, args): + if module_name == "docker_info": + return { + "info": { + "LoggingDriver": logging_driver, + } + } + + return {} + + task_vars = dict( + group_names=["oo_masters_to_config"], + openshift=dict( + common=dict(config_base=""), + ), + ) + + check = FluentdConfig(execute_module, task_vars) + check.get_pods_for_component = lambda _: pods + + with pytest.raises(OpenShiftCheckException) as error: + check.check_logging_config() + + assert error is not None + for word in extra_words: + assert word in str(error) diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py new file mode 100644 index 000000000..e7bf9818b --- /dev/null +++ b/roles/openshift_health_checker/test/fluentd_test.py @@ -0,0 +1,112 @@ +import pytest +import json + +from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException + + +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) + + +fluentd_pod_node1 = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": {"host": "node1", "nodeName": "node1"}, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} +fluentd_pod_node2_down = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-2", + }, + "spec": {"host": "node2", "nodeName": "node2"}, + "status": { + "containerStatuses": [{"ready": False}], + "conditions": [{"status": "False", "type": "Ready"}], + } +} +fluentd_node1 = { + "metadata": { + "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "node1"}, + "name": "node1", + }, + "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.1"}]}, +} +fluentd_node2 = { + "metadata": { + "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "hostname"}, + "name": "node2", + }, + "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.2"}]}, +} +fluentd_node3_unlabeled = { + "metadata": { + "labels": {"kubernetes.io/hostname": "hostname"}, + "name": "node3", + }, + "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.3"}]}, +} + + +def test_get_fluentd_pods(): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1])) + check.get_pods_for_component = lambda *_: [fluentd_pod_node1] + assert not check.run() + + +@pytest.mark.parametrize('pods, nodes, expect_error', [ + ( + [], + [], + 'NoNodesDefined', + ), + ( + [], + [fluentd_node3_unlabeled], + 'NoNodesLabeled', + ), + ( + [], + [fluentd_node1, fluentd_node3_unlabeled], + 'NodesUnlabeled', + ), + ( + [], + [fluentd_node2], + 'MissingFluentdPod', + ), + ( + [fluentd_pod_node1, fluentd_pod_node1], + [fluentd_node1], + 'TooManyFluentdPods', + ), + ( + [fluentd_pod_node2_down], + [fluentd_node2], + 'FluentdNotRunning', + ), +]) +def test_get_fluentd_pods_errors(pods, nodes, expect_error): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=nodes)) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_fluentd(pods) + if isinstance(excinfo.value, OpenShiftCheckExceptionList): + assert_error_in_list(expect_error, excinfo.value) + else: + assert expect_error == excinfo.value.name + + +def test_bad_oc_node_list(): + check = Fluentd() + check.exec_oc = lambda *_: "this isn't even json" + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_nodes_by_name() + assert 'BadOcNodeList' == excinfo.value.name diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py new file mode 100644 index 000000000..04a5e89c4 --- /dev/null +++ b/roles/openshift_health_checker/test/kibana_test.py @@ -0,0 +1,244 @@ +import pytest +import json + +try: + import urllib2 + from urllib2 import HTTPError, URLError +except ImportError: + from urllib.error import HTTPError, URLError + import urllib.request as urllib2 + +from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException + + +plain_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} +not_running_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-2", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": False}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + + +def test_check_kibana(): + # should run without exception: + Kibana().check_kibana([plain_kibana_pod]) + + +@pytest.mark.parametrize('pods, expect_error', [ + ( + [], + "MissingComponentPods", + ), + ( + [not_running_kibana_pod], + "NoRunningPods", + ), + ( + [plain_kibana_pod, not_running_kibana_pod], + "PodNotRunning", + ), +]) +def test_check_kibana_error(pods, expect_error): + with pytest.raises(OpenShiftCheckException) as excinfo: + Kibana().check_kibana(pods) + assert expect_error == excinfo.value.name + + +@pytest.mark.parametrize('comment, route, expect_error', [ + ( + "No route returned", + None, + "no_route_exists", + ), + + ( + "broken route response", + {"status": {}}, + "get_route_failed", + ), + ( + "route with no ingress", + { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana", + }, + "status": { + "ingress": [], + }, + "spec": { + "host": "hostname", + } + }, + "route_not_accepted", + ), + + ( + "route with no host", + { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana", + }, + "status": { + "ingress": [{ + "status": True, + }], + }, + "spec": {}, + }, + "route_missing_host", + ), +]) +def test_get_kibana_url_error(comment, route, expect_error): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) if route else "" + + with pytest.raises(OpenShiftCheckException) as excinfo: + check._get_kibana_url() + assert excinfo.value.name == expect_error + + +@pytest.mark.parametrize('comment, route, expect_url', [ + ( + "test route that looks fine", + { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana", + }, + "status": { + "ingress": [{ + "status": True, + }], + }, + "spec": { + "host": "hostname", + }, + }, + "https://hostname/", + ), +]) +def test_get_kibana_url(comment, route, expect_url): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) + assert expect_url == check._get_kibana_url() + + +@pytest.mark.parametrize('exec_result, expect', [ + ( + 'urlopen error [Errno 111] Connection refused', + 'FailedToConnectInternal', + ), + ( + 'urlopen error [Errno -2] Name or service not known', + 'FailedToResolveInternal', + ), + ( + 'Status code was not [302]: HTTP Error 500: Server error', + 'WrongReturnCodeInternal', + ), + ( + 'bork bork bork', + 'MiscRouteErrorInternal', + ), +]) +def test_verify_url_internal_failure(exec_result, expect): + check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result)) + check._get_kibana_url = lambda: 'url' + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name + + +@pytest.mark.parametrize('lib_result, expect', [ + ( + HTTPError('url', 500, 'it broke', hdrs=None, fp=None), + 'MiscRouteError', + ), + ( + URLError('urlopen error [Errno 111] Connection refused'), + 'FailedToConnect', + ), + ( + URLError('urlopen error [Errno -2] Name or service not known'), + 'FailedToResolve', + ), + ( + 302, + 'WrongReturnCode', + ), + ( + 200, + None, + ), +]) +def test_verify_url_external_failure(lib_result, expect, monkeypatch): + + class _http_return: + + def __init__(self, code): + self.code = code + + def getcode(self): + return self.code + + def urlopen(url, context): + if type(lib_result) is int: + return _http_return(lib_result) + raise lib_result + monkeypatch.setattr(urllib2, 'urlopen', urlopen) + + check = Kibana() + check._get_kibana_url = lambda: 'url' + check._verify_url_internal = lambda url: None + + if not expect: + check.check_kibana_route() + return + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name + + +def test_verify_url_external_skip(): + check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false")) + check._get_kibana_url = lambda: 'url' + check.check_kibana_route() + + +# this is kind of silly but it adds coverage for the run() method... +def test_run(): + pods = ["foo"] + ran = dict(check_kibana=False, check_route=False) + + def check_kibana(pod_list): + ran["check_kibana"] = True + assert pod_list == pods + + def check_kibana_route(): + ran["check_route"] = True + + check = Kibana() + check.get_pods_for_component = lambda *_: pods + check.check_kibana = check_kibana + check.check_kibana_route = check_kibana_route + + check.run() + assert ran["check_kibana"] and ran["check_route"] diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py new file mode 100644 index 000000000..59c703214 --- /dev/null +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -0,0 +1,166 @@ +import pytest +import json + +from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc + +task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) + + +def canned_loggingcheck(exec_oc=None, execute_module=None): + """Create a LoggingCheck object with canned exec_oc method""" + check = LoggingCheck(execute_module) + if exec_oc: + check.exec_oc = exec_oc + return check + + +def assert_error(error, expect_error): + if expect_error: + assert error + assert expect_error in error + else: + assert not error + + +plain_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es"}, + "name": "logging-es", + }, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + "containerStatuses": [{"ready": True}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "name logging-es", +} + +plain_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +plain_kibana_pod_no_containerstatus = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +fluentd_pod_node1 = { + "metadata": { + "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, + "name": "logging-fluentd-1", + }, + "spec": {"host": "node1", "nodeName": "node1"}, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + } +} + +plain_curator_pod = { + "metadata": { + "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, + "name": "logging-curator-1", + }, + "status": { + "containerStatuses": [{"ready": True}], + "conditions": [{"status": "True", "type": "Ready"}], + "podIP": "10.10.10.10", + } +} + + +@pytest.mark.parametrize('problem, expect', [ + ("[Errno 2] No such file or directory", "supposed to be a master"), + ("Permission denied", "Unexpected error using `oc`"), +]) +def test_oc_failure(problem, expect): + def execute_module(module_name, *_): + if module_name == "ocutil": + return dict(failed=True, result=problem) + return dict(changed=False) + + check = LoggingCheck(execute_module, task_vars_config_base) + + with pytest.raises(CouldNotUseOc) as excinfo: + check.exec_oc('get foo', []) + assert expect in str(excinfo) + + +groups_with_first_master = dict(oo_first_master=['this-host']) +groups_not_a_master = dict(oo_first_master=['other-host'], oo_masters=['other-host']) + + +@pytest.mark.parametrize('groups, logging_deployed, is_active', [ + (groups_with_first_master, True, True), + (groups_with_first_master, False, False), + (groups_not_a_master, True, False), + (groups_not_a_master, True, False), +]) +def test_is_active(groups, logging_deployed, is_active): + task_vars = dict( + ansible_host='this-host', + groups=groups, + openshift_hosted_logging_deploy=logging_deployed, + ) + + assert LoggingCheck(None, task_vars).is_active() == is_active + + +@pytest.mark.parametrize('pod_output, expect_pods', [ + ( + json.dumps({'items': [plain_es_pod]}), + [plain_es_pod], + ), +]) +def test_get_pods_for_component(pod_output, expect_pods): + check = canned_loggingcheck(lambda *_: pod_output) + pods = check.get_pods_for_component("es") + assert pods == expect_pods + + +@pytest.mark.parametrize('exec_oc_output, expect_error', [ + ( + 'No resources found.', + MissingComponentPods, + ), + ( + '{"items": null}', + MissingComponentPods, + ), +]) +def test_get_pods_for_component_fail(exec_oc_output, expect_error): + check = canned_loggingcheck(lambda *_: exec_oc_output) + with pytest.raises(expect_error): + check.get_pods_for_component("es") + + +@pytest.mark.parametrize('name, pods, expected_pods', [ + ( + 'test single pod found, scheduled, but no containerStatuses field', + [plain_kibana_pod_no_containerstatus], + [plain_kibana_pod_no_containerstatus], + ), + ( + 'set of pods has at least one pod with containerStatuses (scheduled); should still fail', + [plain_kibana_pod_no_containerstatus, plain_kibana_pod], + [plain_kibana_pod_no_containerstatus], + ), + +], ids=lambda argvals: argvals[0]) +def test_get_not_running_pods_no_container_status(name, pods, expected_pods): + check = canned_loggingcheck(lambda *_: '') + result = check.not_running_pods(pods) + + assert result == expected_pods diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py new file mode 100644 index 000000000..c48ade9b8 --- /dev/null +++ b/roles/openshift_health_checker/test/logging_index_time_test.py @@ -0,0 +1,170 @@ +import json + +import pytest + +from openshift_checks.logging.logging_index_time import LoggingIndexTime, OpenShiftCheckException + + +SAMPLE_UUID = "unique-test-uuid" + + +def canned_loggingindextime(exec_oc=None): + """Create a check object with a canned exec_oc method""" + check = LoggingIndexTime() # fails if a module is actually invoked + if exec_oc: + check.exec_oc = exec_oc + return check + + +plain_running_elasticsearch_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-data-master"}, + "name": "logging-es-data-master-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "phase": "Running", + } +} +plain_running_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-1", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": True}], + "phase": "Running", + } +} +not_running_kibana_pod = { + "metadata": { + "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, + "name": "logging-kibana-2", + }, + "status": { + "containerStatuses": [{"ready": True}, {"ready": False}], + "conditions": [{"status": "True", "type": "Ready"}], + "phase": "pending", + } +} + + +@pytest.mark.parametrize('pods, expect_pods', [ + ( + [not_running_kibana_pod], + [], + ), + ( + [plain_running_kibana_pod], + [plain_running_kibana_pod], + ), + ( + [], + [], + ) +]) +def test_check_running_pods(pods, expect_pods): + check = canned_loggingindextime() + pods = check.running_pods(pods) + assert pods == expect_pods + + +def test_bad_config_param(): + with pytest.raises(OpenShiftCheckException) as error: + LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run() + assert 'InvalidTimeout' == error.value.name + + +def test_no_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [not_running_kibana_pod] + with pytest.raises(OpenShiftCheckException) as error: + check.run() + assert 'kibanaNoRunningPods' == error.value.name + + +def test_with_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod] + check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID + check.wait_until_cmd_or_err = lambda *_: None + assert not check.run().get("failed") + + +@pytest.mark.parametrize('name, json_response, uuid, timeout', [ + ( + 'valid count in response', + { + "count": 1, + }, + SAMPLE_UUID, + 0.001, + ), +], ids=lambda argval: argval[0]) +def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout): + check = canned_loggingindextime(lambda *args, **_: json.dumps(json_response)) + check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) + + +@pytest.mark.parametrize('name, json_response, timeout, expect_error', [ + ( + 'invalid json response', + { + "invalid_field": 1, + }, + 0.001, + 'esInvalidResponse', + ), + ( + 'empty response', + {}, + 0.001, + 'esInvalidResponse', + ), + ( + 'valid response but invalid match count', + { + "count": 0, + }, + 0.005, + 'NoMatchFound', + ) +], ids=lambda argval: argval[0]) +def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error): + check = canned_loggingindextime(lambda *args, **_: json.dumps(json_response)) + with pytest.raises(OpenShiftCheckException) as error: + check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout) + + assert expect_error == error.value.name + + +def test_curl_kibana_with_uuid(): + check = canned_loggingindextime(lambda *args, **_: json.dumps({"statusCode": 404})) + check.generate_uuid = lambda: SAMPLE_UUID + assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod) + + +@pytest.mark.parametrize('name, json_response, expect_error', [ + ( + 'invalid json response', + { + "invalid_field": "invalid", + }, + 'kibanaInvalidResponse', + ), + ( + 'wrong error code in response', + { + "statusCode": 500, + }, + 'kibanaInvalidReturnCode', + ), +], ids=lambda argval: argval[0]) +def test_failed_curl_kibana_with_uuid(name, json_response, expect_error): + check = canned_loggingindextime(lambda *args, **_: json.dumps(json_response)) + check.generate_uuid = lambda: SAMPLE_UUID + + with pytest.raises(OpenShiftCheckException) as error: + check.curl_kibana_with_uuid(plain_running_kibana_pod) + + assert expect_error == error.value.name diff --git a/roles/openshift_health_checker/test/memory_availability_test.py b/roles/openshift_health_checker/test/memory_availability_test.py new file mode 100644 index 000000000..5ec83dd79 --- /dev/null +++ b/roles/openshift_health_checker/test/memory_availability_test.py @@ -0,0 +1,127 @@ +import pytest + +from openshift_checks.memory_availability import MemoryAvailability + + +@pytest.mark.parametrize('group_names,is_active', [ + (['oo_masters_to_config'], True), + (['oo_nodes_to_config'], True), + (['oo_etcd_to_config'], True), + (['oo_masters_to_config', 'oo_nodes_to_config'], True), + (['oo_masters_to_config', 'oo_etcd_to_config'], True), + ([], False), + (['lb'], False), + (['nfs'], False), +]) +def test_is_active(group_names, is_active): + task_vars = dict( + group_names=group_names, + ) + assert MemoryAvailability(None, task_vars).is_active() == is_active + + +@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb', [ + ( + ['oo_masters_to_config'], + 0, + 17200, + ), + ( + ['oo_nodes_to_config'], + 0, + 8200, + ), + ( + ['oo_nodes_to_config'], + 1, # configure lower threshold + 2000, # too low for recommended but not for configured + ), + ( + ['oo_nodes_to_config'], + 2, # configure threshold where adjustment pushes it over + 1900, + ), + ( + ['oo_etcd_to_config'], + 0, + 8200, + ), + ( + ['oo_masters_to_config', 'oo_nodes_to_config'], + 0, + 17000, + ), +]) +def test_succeeds_with_recommended_memory(group_names, configured_min, ansible_memtotal_mb): + task_vars = dict( + group_names=group_names, + openshift_check_min_host_memory_gb=configured_min, + ansible_memtotal_mb=ansible_memtotal_mb, + ) + + result = MemoryAvailability(fake_execute_module, task_vars).run() + + assert not result.get('failed', False) + + +@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb,extra_words', [ + ( + ['oo_masters_to_config'], + 0, + 0, + ['0.0 GiB'], + ), + ( + ['oo_nodes_to_config'], + 0, + 100, + ['0.1 GiB'], + ), + ( + ['oo_nodes_to_config'], + 24, # configure higher threshold + 20 * 1024, # enough to meet recommended but not configured + ['20.0 GiB'], + ), + ( + ['oo_nodes_to_config'], + 24, # configure higher threshold + 22 * 1024, # not enough for adjustment to push over threshold + ['22.0 GiB'], + ), + ( + ['oo_etcd_to_config'], + 0, + 6 * 1024, + ['6.0 GiB'], + ), + ( + ['oo_etcd_to_config', 'oo_masters_to_config'], + 0, + 9 * 1024, # enough memory for etcd, not enough for a master + ['9.0 GiB'], + ), + ( + ['oo_nodes_to_config', 'oo_masters_to_config'], + 0, + # enough memory for a node, not enough for a master + 11 * 1024, + ['11.0 GiB'], + ), +]) +def test_fails_with_insufficient_memory(group_names, configured_min, ansible_memtotal_mb, extra_words): + task_vars = dict( + group_names=group_names, + openshift_check_min_host_memory_gb=configured_min, + ansible_memtotal_mb=ansible_memtotal_mb, + ) + + result = MemoryAvailability(fake_execute_module, task_vars).run() + + assert result.get('failed', False) + for word in 'below recommended'.split() + extra_words: + assert word in result['msg'] + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') diff --git a/roles/openshift_health_checker/test/mixins_test.py b/roles/openshift_health_checker/test/mixins_test.py new file mode 100644 index 000000000..b1a41ca3c --- /dev/null +++ b/roles/openshift_health_checker/test/mixins_test.py @@ -0,0 +1,23 @@ +import pytest + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks.mixins import NotContainerizedMixin + + +class NotContainerizedCheck(NotContainerizedMixin, OpenShiftCheck): + name = "not_containerized" + run = NotImplemented + + +@pytest.mark.parametrize('task_vars,expected', [ + (dict(openshift=dict(common=dict(is_containerized=False))), True), + (dict(openshift=dict(common=dict(is_containerized=True))), False), +]) +def test_is_active(task_vars, expected): + assert NotContainerizedCheck(None, task_vars).is_active() == expected + + +def test_is_active_missing_task_vars(): + with pytest.raises(OpenShiftCheckException) as excinfo: + NotContainerizedCheck().is_active() + assert 'is_containerized' in str(excinfo.value) diff --git a/roles/openshift_health_checker/test/openshift_check_test.py b/roles/openshift_health_checker/test/openshift_check_test.py new file mode 100644 index 000000000..bc0c3b26c --- /dev/null +++ b/roles/openshift_health_checker/test/openshift_check_test.py @@ -0,0 +1,145 @@ +import pytest + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks import load_checks + + +# Fixtures + + +@pytest.fixture() +def task_vars(): + return dict(foo=42, bar=dict(baz="openshift")) + + +@pytest.fixture(params=[ + ("notfound",), + ("multiple", "keys", "not", "in", "task_vars"), +]) +def missing_keys(request): + return request.param + + +# Tests + + +def test_OpenShiftCheck_init(): + class TestCheck(OpenShiftCheck): + name = "test_check" + run = NotImplemented + + # execute_module required at init if it will be used + with pytest.raises(RuntimeError) as excinfo: + TestCheck().execute_module("foo") + assert 'execute_module' in str(excinfo.value) + + execute_module = object() + + # initialize with positional argument + check = TestCheck(execute_module) + assert check._execute_module == execute_module + + # initialize with keyword argument + check = TestCheck(execute_module=execute_module) + assert check._execute_module == execute_module + + assert check.task_vars == {} + assert check.tmp is None + + +def test_subclasses(): + """OpenShiftCheck.subclasses should find all subclasses recursively.""" + class TestCheck1(OpenShiftCheck): + pass + + class TestCheck2(OpenShiftCheck): + pass + + class TestCheck1A(TestCheck1): + pass + + local_subclasses = set([TestCheck1, TestCheck1A, TestCheck2]) + known_subclasses = set(OpenShiftCheck.subclasses()) + + assert local_subclasses - known_subclasses == set(), "local_subclasses should be a subset of known_subclasses" + + +def test_load_checks(): + """Loading checks should load and return Python modules.""" + modules = load_checks() + assert modules + + +def dummy_check(task_vars): + class TestCheck(OpenShiftCheck): + name = "dummy" + run = NotImplemented + + return TestCheck(task_vars=task_vars) + + +@pytest.mark.parametrize("keys,expected", [ + (("foo",), 42), + (("bar", "baz"), "openshift"), + (("bar.baz",), "openshift"), +]) +def test_get_var_ok(task_vars, keys, expected): + assert dummy_check(task_vars).get_var(*keys) == expected + + +def test_get_var_error(task_vars, missing_keys): + with pytest.raises(OpenShiftCheckException): + dummy_check(task_vars).get_var(*missing_keys) + + +def test_get_var_default(task_vars, missing_keys): + default = object() + assert dummy_check(task_vars).get_var(*missing_keys, default=default) == default + + +@pytest.mark.parametrize("keys, convert, expected", [ + (("foo",), str, "42"), + (("foo",), float, 42.0), + (("bar", "baz"), bool, False), +]) +def test_get_var_convert(task_vars, keys, convert, expected): + assert dummy_check(task_vars).get_var(*keys, convert=convert) == expected + + +def convert_oscexc(_): + raise OpenShiftCheckException("known failure") + + +def convert_exc(_): + raise Exception("failure unknown") + + +@pytest.mark.parametrize("keys, convert, expect_text", [ + (("bar", "baz"), int, "Cannot convert"), + (("bar.baz",), float, "Cannot convert"), + (("foo",), "bogus", "TypeError"), + (("foo",), lambda a, b: 1, "TypeError"), + (("foo",), lambda a: 1 / 0, "ZeroDivisionError"), + (("foo",), convert_oscexc, "known failure"), + (("foo",), convert_exc, "failure unknown"), +]) +def test_get_var_convert_error(task_vars, keys, convert, expect_text): + with pytest.raises(OpenShiftCheckException) as excinfo: + dummy_check(task_vars).get_var(*keys, convert=convert) + assert expect_text in str(excinfo.value) + + +def test_register(task_vars): + check = dummy_check(task_vars) + + check.register_failure(OpenShiftCheckException("spam")) + assert "spam" in str(check.failures[0]) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.register_file("spam") # no file contents specified + assert "not specified" in str(excinfo.value) + + # normally execute_module registers the result file; test disabling that + check._execute_module = lambda *args, **_: dict() + check.execute_module("eggs", module_args={}, register=False) + assert not check.files_to_save diff --git a/roles/openshift_health_checker/test/ovs_version_test.py b/roles/openshift_health_checker/test/ovs_version_test.py new file mode 100644 index 000000000..5a82a43bf --- /dev/null +++ b/roles/openshift_health_checker/test/ovs_version_test.py @@ -0,0 +1,86 @@ +import pytest + +from openshift_checks.ovs_version import OvsVersion, OpenShiftCheckException + + +def test_openshift_version_not_supported(): + def execute_module(*_): + return {} + + openshift_release = '111.7.0' + + task_vars = dict( + openshift=dict(common=dict(service_type='origin')), + openshift_release=openshift_release, + openshift_image_tag='v' + openshift_release, + openshift_deployment_type='origin', + ) + + with pytest.raises(OpenShiftCheckException) as excinfo: + OvsVersion(execute_module, task_vars).run() + + assert "no recommended version of Open vSwitch" in str(excinfo.value) + + +def test_invalid_openshift_release_format(): + def execute_module(*_): + return {} + + task_vars = dict( + openshift=dict(common=dict(service_type='origin')), + openshift_image_tag='v0', + openshift_deployment_type='origin', + ) + + with pytest.raises(OpenShiftCheckException) as excinfo: + OvsVersion(execute_module, task_vars).run() + assert "invalid version" in str(excinfo.value) + + +@pytest.mark.parametrize('openshift_release,expected_ovs_version', [ + ("3.5", ["2.6", "2.7"]), + ("3.6", ["2.6", "2.7"]), + ("3.4", "2.4"), + ("3.3", "2.4"), + ("1.0", "2.4"), +]) +def test_ovs_package_version(openshift_release, expected_ovs_version): + task_vars = dict( + openshift=dict(common=dict(service_type='origin')), + openshift_release=openshift_release, + openshift_image_tag='v' + openshift_release, + ) + return_value = {} # note: check.execute_module modifies return hash contents + + def execute_module(module_name=None, module_args=None, *_): + assert module_name == 'rpm_version' + assert "package_list" in module_args + + for pkg in module_args["package_list"]: + if pkg["name"] == "openvswitch": + assert pkg["version"] == expected_ovs_version + + return return_value + + result = OvsVersion(execute_module, task_vars).run() + assert result is return_value + + +@pytest.mark.parametrize('group_names,is_containerized,is_active', [ + (['oo_masters_to_config'], False, True), + # ensure check is skipped on containerized installs + (['oo_masters_to_config'], True, False), + (['oo_nodes_to_config'], False, True), + (['oo_masters_to_config', 'oo_nodes_to_config'], False, True), + (['oo_masters_to_config', 'oo_etcd_to_config'], False, True), + ([], False, False), + (['oo_etcd_to_config'], False, False), + (['lb'], False, False), + (['nfs'], False, False), +]) +def test_ovs_version_skip_when_not_master_nor_node(group_names, is_containerized, is_active): + task_vars = dict( + group_names=group_names, + openshift=dict(common=dict(is_containerized=is_containerized)), + ) + assert OvsVersion(None, task_vars).is_active() == is_active diff --git a/roles/openshift_health_checker/test/package_availability_test.py b/roles/openshift_health_checker/test/package_availability_test.py new file mode 100644 index 000000000..9815acb38 --- /dev/null +++ b/roles/openshift_health_checker/test/package_availability_test.py @@ -0,0 +1,62 @@ +import pytest + +from openshift_checks.package_availability import PackageAvailability + + +@pytest.mark.parametrize('pkg_mgr,is_containerized,is_active', [ + ('yum', False, True), + ('yum', True, False), + ('dnf', True, False), + ('dnf', False, False), +]) +def test_is_active(pkg_mgr, is_containerized, is_active): + task_vars = dict( + ansible_pkg_mgr=pkg_mgr, + openshift=dict(common=dict(is_containerized=is_containerized)), + ) + assert PackageAvailability(None, task_vars).is_active() == is_active + + +@pytest.mark.parametrize('task_vars,must_have_packages,must_not_have_packages', [ + ( + dict(openshift=dict(common=dict(service_type='openshift'))), + set(), + set(['openshift-master', 'openshift-node']), + ), + ( + dict( + openshift=dict(common=dict(service_type='origin')), + group_names=['oo_masters_to_config'], + ), + set(['origin-master']), + set(['origin-node']), + ), + ( + dict( + openshift=dict(common=dict(service_type='atomic-openshift')), + group_names=['oo_nodes_to_config'], + ), + set(['atomic-openshift-node']), + set(['atomic-openshift-master']), + ), + ( + dict( + openshift=dict(common=dict(service_type='atomic-openshift')), + group_names=['oo_masters_to_config', 'oo_nodes_to_config'], + ), + set(['atomic-openshift-master', 'atomic-openshift-node']), + set(), + ), +]) +def test_package_availability(task_vars, must_have_packages, must_not_have_packages): + return_value = {} + + def execute_module(module_name=None, module_args=None, *_): + assert module_name == 'check_yum_update' + assert 'packages' in module_args + assert set(module_args['packages']).issuperset(must_have_packages) + assert not set(module_args['packages']).intersection(must_not_have_packages) + return {'foo': return_value} + + result = PackageAvailability(execute_module, task_vars).run() + assert result['foo'] is return_value diff --git a/roles/openshift_health_checker/test/package_update_test.py b/roles/openshift_health_checker/test/package_update_test.py new file mode 100644 index 000000000..85d3c9cab --- /dev/null +++ b/roles/openshift_health_checker/test/package_update_test.py @@ -0,0 +1,15 @@ +from openshift_checks.package_update import PackageUpdate + + +def test_package_update(): + return_value = {} + + def execute_module(module_name=None, module_args=None, *_): + assert module_name == 'check_yum_update' + assert 'packages' in module_args + # empty list of packages means "generic check if 'yum update' will work" + assert module_args['packages'] == [] + return {'foo': return_value} + + result = PackageUpdate(execute_module).run() + assert result['foo'] is return_value diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py new file mode 100644 index 000000000..3cf4ce033 --- /dev/null +++ b/roles/openshift_health_checker/test/package_version_test.py @@ -0,0 +1,116 @@ +import pytest + +from openshift_checks.package_version import PackageVersion, OpenShiftCheckException + + +def task_vars_for(openshift_release, deployment_type): + return dict( + ansible_pkg_mgr='yum', + openshift=dict(common=dict(service_type=deployment_type)), + openshift_release=openshift_release, + openshift_image_tag='v' + openshift_release, + openshift_deployment_type=deployment_type, + ) + + +def test_openshift_version_not_supported(): + check = PackageVersion(None, task_vars_for("1.2.3", 'origin')) + check.get_openshift_version_tuple = lambda: (3, 4, 1) # won't be in the dict + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_required_ovs_version() + assert "no recommended version of Open vSwitch" in str(excinfo.value) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_required_docker_version() + assert "no recommended version of Docker" in str(excinfo.value) + + +def test_invalid_openshift_release_format(): + task_vars = dict( + ansible_pkg_mgr='yum', + openshift=dict(common=dict(service_type='origin')), + openshift_image_tag='v0', + openshift_deployment_type='origin', + ) + + check = PackageVersion(lambda *_: {}, task_vars) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run() + assert "invalid version" in str(excinfo.value) + + +@pytest.mark.parametrize('openshift_release', [ + "111.7.0", + "3.7", + "3.6", + "3.5.1.2.3", + "3.5", + "3.4", + "3.3", + "2.1.0", +]) +def test_package_version(openshift_release): + + return_value = {"foo": object()} + + def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *_): + assert module_name == 'aos_version' + assert "package_list" in module_args + + for pkg in module_args["package_list"]: + if "-master" in pkg["name"] or "-node" in pkg["name"]: + assert pkg["version"] == task_vars["openshift_release"] + + return return_value + + check = PackageVersion(execute_module, task_vars_for(openshift_release, 'origin')) + result = check.run() + assert result == return_value + + +@pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [ + ("origin", "3.5", "1.12"), + ("origin", "1.3", "1.10"), + ("origin", "1.1", "1.8"), + ("openshift-enterprise", "3.4", "1.12"), + ("openshift-enterprise", "3.2", "1.10"), + ("openshift-enterprise", "3.1", "1.8"), +]) +def test_docker_package_version(deployment_type, openshift_release, expected_docker_version): + + return_value = {"foo": object()} + + def execute_module(module_name=None, module_args=None, *_): + assert module_name == 'aos_version' + assert "package_list" in module_args + + for pkg in module_args["package_list"]: + if pkg["name"] == "docker": + assert pkg["version"] == expected_docker_version + + return return_value + + check = PackageVersion(execute_module, task_vars_for(openshift_release, deployment_type)) + result = check.run() + assert result == return_value + + +@pytest.mark.parametrize('group_names,is_containerized,is_active', [ + (['oo_masters_to_config'], False, True), + # ensure check is skipped on containerized installs + (['oo_masters_to_config'], True, False), + (['oo_nodes_to_config'], False, True), + (['oo_masters_to_config', 'oo_nodes_to_config'], False, True), + (['oo_masters_to_config', 'oo_etcd_to_config'], False, True), + ([], False, False), + (['oo_etcd_to_config'], False, False), + (['lb'], False, False), + (['nfs'], False, False), +]) +def test_package_version_skip_when_not_master_nor_node(group_names, is_containerized, is_active): + task_vars = dict( + group_names=group_names, + openshift=dict(common=dict(is_containerized=is_containerized)), + ) + assert PackageVersion(None, task_vars).is_active() == is_active diff --git a/roles/openshift_health_checker/test/rpm_version_test.py b/roles/openshift_health_checker/test/rpm_version_test.py new file mode 100644 index 000000000..2c1bcf876 --- /dev/null +++ b/roles/openshift_health_checker/test/rpm_version_test.py @@ -0,0 +1,82 @@ +import pytest +import rpm_version + +expected_pkgs = { + "spam": { + "name": "spam", + "version": "3.2.1", + }, + "eggs": { + "name": "eggs", + "version": "3.2.1", + }, +} + + +@pytest.mark.parametrize('pkgs, expect_not_found', [ + ( + {}, + ["spam", "eggs"], # none found + ), + ( + {"spam": ["3.2.1", "4.5.1"]}, + ["eggs"], # completely missing + ), + ( + { + "spam": ["3.2.1", "4.5.1"], + "eggs": ["3.2.1"], + }, + [], # all found + ), +]) +def test_check_pkg_found(pkgs, expect_not_found): + if expect_not_found: + with pytest.raises(rpm_version.RpmVersionException) as e: + rpm_version._check_pkg_versions(pkgs, expected_pkgs) + + assert "not found to be installed" in str(e.value) + assert set(expect_not_found) == set(e.value.problem_pkgs) + else: + rpm_version._check_pkg_versions(pkgs, expected_pkgs) + + +@pytest.mark.parametrize('pkgs, expect_not_found', [ + ( + { + 'spam': ['3.2.1'], + 'eggs': ['3.3.2'], + }, + { + "eggs": { + "required_versions": ["3.2"], + "found_versions": ["3.3"], + } + }, # not the right version + ), + ( + { + 'spam': ['3.1.2', "3.3.2"], + 'eggs': ['3.3.2', "1.2.3"], + }, + { + "eggs": { + "required_versions": ["3.2"], + "found_versions": ["3.3", "1.2"], + }, + "spam": { + "required_versions": ["3.2"], + "found_versions": ["3.1", "3.3"], + } + }, # not the right version + ), +]) +def test_check_pkg_version_found(pkgs, expect_not_found): + if expect_not_found: + with pytest.raises(rpm_version.RpmVersionException) as e: + rpm_version._check_pkg_versions(pkgs, expected_pkgs) + + assert "found to be installed with an incorrect version" in str(e.value) + assert expect_not_found == e.value.problem_pkgs + else: + rpm_version._check_pkg_versions(pkgs, expected_pkgs) diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py new file mode 100644 index 000000000..724928aa1 --- /dev/null +++ b/roles/openshift_health_checker/test/search_journalctl_test.py @@ -0,0 +1,157 @@ +import pytest +import search_journalctl + + +def canned_search_journalctl(get_log_output=None): + """Create a search_journalctl object with canned get_log_output method""" + module = search_journalctl + if get_log_output: + module.get_log_output = get_log_output + return module + + +DEFAULT_TIMESTAMP = 1496341364 + + +def get_timestamp(modifier=0): + return DEFAULT_TIMESTAMP + modifier + + +def get_timestamp_microseconds(modifier=0): + return get_timestamp(modifier) * 1000000 + + +def create_test_log_object(stamp, msg): + return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg) + + +@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [ + ( + 'test with valid params', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test log message", + "unit": "test", + }, + ], + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + ["test log message"], + [], + ), + ( + 'test with invalid json in log input', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test log message", + "unit": "test-unit", + }, + ], + [ + '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}', + ], + [], + [ + ["invalid json", "test-unit", "test log message"], + ], + ), + ( + 'test with invalid regexp', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test [ log message", + "unit": "test", + }, + ], + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + [], + [ + ["invalid regular expression"], + ], + ), +], ids=lambda argval: argval[0]) +def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors): + def get_log_output(matcher): + return log_input + + module = canned_search_journalctl(get_log_output) + matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60) + + assert set(matched_regexp) == set(expected_matches) + assert len(expected_errors) == len(errors) + + for idx, partial_err_set in enumerate(expected_errors): + for partial_err_msg in partial_err_set: + assert partial_err_msg in errors[idx] + + +@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [ + ( + 'test with matching log message, but out of bounds of log_count_lim', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 3, + get_timestamp(-100 * 60 * 60), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + None, + ), + ( + 'test with matching log message, but with timestamp too old', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 100, + get_timestamp(-10), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), + ], + None, + ), + ( + 'test with matching log message, and timestamp within time limit', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 100, + get_timestamp(-1010), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), + ], + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + ), +], ids=lambda argval: argval[0]) +def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match): + match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds) + assert match == expected_match diff --git a/roles/openshift_health_checker/test/zz_failure_summary_test.py b/roles/openshift_health_checker/test/zz_failure_summary_test.py new file mode 100644 index 000000000..69f27653c --- /dev/null +++ b/roles/openshift_health_checker/test/zz_failure_summary_test.py @@ -0,0 +1,85 @@ +from zz_failure_summary import deduplicate_failures + +import pytest + + +@pytest.mark.parametrize('failures,deduplicated', [ + ( + [ + { + 'host': 'master1', + 'msg': 'One or more checks failed', + }, + ], + [ + { + 'host': ('master1',), + 'msg': 'One or more checks failed', + }, + ], + ), + ( + [ + { + 'host': 'master1', + 'msg': 'One or more checks failed', + }, + { + 'host': 'node1', + 'msg': 'One or more checks failed', + }, + ], + [ + { + 'host': ('master1', 'node1'), + 'msg': 'One or more checks failed', + }, + ], + ), + ( + [ + { + 'host': 'node1', + 'msg': 'One or more checks failed', + 'checks': (('test_check', 'error message'),), + }, + { + 'host': 'master2', + 'msg': 'Some error happened', + }, + { + 'host': 'master1', + 'msg': 'One or more checks failed', + 'checks': (('test_check', 'error message'),), + }, + ], + [ + { + 'host': ('master1', 'node1'), + 'msg': 'One or more checks failed', + 'checks': (('test_check', 'error message'),), + }, + { + 'host': ('master2',), + 'msg': 'Some error happened', + }, + ], + ), + # if a failure contain an unhashable value, it will not be deduplicated + ( + [ + { + 'host': 'master1', + 'msg': {'unhashable': 'value'}, + }, + ], + [ + { + 'host': 'master1', + 'msg': {'unhashable': 'value'}, + }, + ], + ), +]) +def test_deduplicate_failures(failures, deduplicated): + assert deduplicate_failures(failures) == deduplicated |