From ebf6ed8d3d467a95fe62c22f131c82882a34c3ad Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 3 Aug 2017 15:35:15 +0200 Subject: Update health check README --- playbooks/byo/openshift-checks/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/playbooks/byo/openshift-checks/README.md b/playbooks/byo/openshift-checks/README.md index f0f14b268..5bb1b2fbf 100644 --- a/playbooks/byo/openshift-checks/README.md +++ b/playbooks/byo/openshift-checks/README.md @@ -7,15 +7,14 @@ Ansible's default operation mode is to fail fast, on the first error. However, when performing checks, it is useful to gather as much information about problems as possible in a single run. -Thus, the playbooks run a battery of checks against the inventory hosts and have -Ansible gather intermediate errors, giving a more complete diagnostic of the -state of each host. If any check failed, the playbook run will be marked as -failed. +Thus, the playbooks run a battery of checks against the inventory hosts and +gather intermediate errors, giving a more complete diagnostic of the state of +each host. If any check failed, the playbook run will be marked as failed. To facilitate understanding the problems that were encountered, a custom callback plugin summarizes execution errors at the end of a playbook run. -# Available playbooks +## Available playbooks 1. Pre-install playbook ([pre-install.yml](pre-install.yml)) - verifies system requirements and look for common problems that can prevent a successful -- cgit v1.2.3 From f98c978bd49f2473ce271d4fc69be7e4eea78125 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Mon, 12 Jun 2017 11:46:48 +0200 Subject: Add playbook for running arbitrary health checks This is useful on its own, and also aids in developing/testing new checks that are not part of any playbook. Since the intent when running this playbook is to execute checks, opt for a less verbose explanation on the error summary. --- playbooks/byo/openshift-checks/README.md | 31 ++++++++++++++++++++++ playbooks/byo/openshift-checks/adhoc.yml | 6 +++++ playbooks/common/openshift-checks/adhoc.yml | 12 +++++++++ .../callback_plugins/zz_failure_summary.py | 2 +- 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 playbooks/byo/openshift-checks/adhoc.yml create mode 100644 playbooks/common/openshift-checks/adhoc.yml diff --git a/playbooks/byo/openshift-checks/README.md b/playbooks/byo/openshift-checks/README.md index 5bb1b2fbf..c17f5b6b7 100644 --- a/playbooks/byo/openshift-checks/README.md +++ b/playbooks/byo/openshift-checks/README.md @@ -26,6 +26,9 @@ callback plugin summarizes execution errors at the end of a playbook run. 3. Certificate expiry playbooks ([certificate_expiry](certificate_expiry)) - check that certificates in use are valid and not expiring soon. +4. Adhoc playbook ([adhoc.yml](adhoc.yml)) - use it to run adhoc checks. + See the [next section](#the-adhoc-playbook) for a usage example. + ## Running With a [recent installation of Ansible](../../../README.md#setup), run the playbook @@ -58,6 +61,34 @@ against your inventory file. Here is the step-by-step: $ ansible-playbook -i playbooks/byo/openshift-checks/certificate_expiry/default.yaml -v ``` +### The adhoc playbook + +The adhoc playbook gives flexibility to run any check or a custom group of +checks. What will be run is determined by the `openshift_checks` variable, +which, among other ways supported by Ansible, can be set on the command line +using the `-e` flag. + +For example, to run the `docker_storage` check: + +```console +$ ansible-playbook -i playbooks/byo/openshift-checks/adhoc.yml -e openshift_checks=docker_storage +``` + +To run more checks, use a comma-separated list of check names: + +```console +$ ansible-playbook -i playbooks/byo/openshift-checks/adhoc.yml -e openshift_checks=docker_storage,disk_availability +``` + +To run an entire class of checks, use the name of a check group tag, prefixed by `@`. This will run all checks tagged `preflight`: + +```console +$ ansible-playbook -i playbooks/byo/openshift-checks/adhoc.yml -e openshift_checks=@preflight +``` + +It is valid to specify multiple check tags and individual check names together +in a comma-separated list. + ## Running in a container This repository is built into a Docker image including Ansible so that it can diff --git a/playbooks/byo/openshift-checks/adhoc.yml b/playbooks/byo/openshift-checks/adhoc.yml new file mode 100644 index 000000000..2ece40e96 --- /dev/null +++ b/playbooks/byo/openshift-checks/adhoc.yml @@ -0,0 +1,6 @@ +--- +- include: ../openshift-cluster/initialize_groups.yml + +- include: ../../common/openshift-cluster/std_include.yml + +- include: ../../common/openshift-checks/adhoc.yml diff --git a/playbooks/common/openshift-checks/adhoc.yml b/playbooks/common/openshift-checks/adhoc.yml new file mode 100644 index 000000000..dfcef8435 --- /dev/null +++ b/playbooks/common/openshift-checks/adhoc.yml @@ -0,0 +1,12 @@ +--- +- name: OpenShift health checks + hosts: oo_all_hosts + roles: + - openshift_health_checker + vars: + - r_openshift_health_checker_playbook_context: adhoc + post_tasks: + - name: Run health checks + action: openshift_health_check + args: + checks: '{{ openshift_checks | default([]) }}' diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py index d10200719..9f9fe123a 100644 --- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py +++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py @@ -101,7 +101,7 @@ class CallbackModule(CallbackBase): 'Variables can be set in the inventory or passed on the\n' 'command line using the -e flag to ansible-playbook.\n\n' ).format(playbook=self._playbook_file, checks=checks) - if context in ['pre-install', 'health']: + if context in ['pre-install', 'health', 'adhoc']: summary = ( # user was expecting to run checks, less explanation needed '\n' 'You may choose to configure or disable failing checks by\n' -- cgit v1.2.3 From 25276bda8c002f4279e5c1748f64a9fd1ee999a4 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Fri, 23 Jun 2017 15:31:12 +0200 Subject: List existing health checks when none is requested This is a simple mechanism to learn what health checks are available. Note that we defer task_vars verification, so that we can compute requested_checks and resolved_checks earlier, allowing us to list checks even if openshift_facts has not run. --- playbooks/byo/openshift-checks/README.md | 10 ++++- playbooks/byo/openshift-checks/adhoc.yml | 21 ++++++++++ .../action_plugins/openshift_health_check.py | 48 ++++++++++++++++++---- .../test/action_plugin_test.py | 3 +- 4 files changed, 73 insertions(+), 9 deletions(-) diff --git a/playbooks/byo/openshift-checks/README.md b/playbooks/byo/openshift-checks/README.md index c17f5b6b7..b26e7d7ed 100644 --- a/playbooks/byo/openshift-checks/README.md +++ b/playbooks/byo/openshift-checks/README.md @@ -26,7 +26,8 @@ callback plugin summarizes execution errors at the end of a playbook run. 3. Certificate expiry playbooks ([certificate_expiry](certificate_expiry)) - check that certificates in use are valid and not expiring soon. -4. Adhoc playbook ([adhoc.yml](adhoc.yml)) - use it to run adhoc checks. +4. Adhoc playbook ([adhoc.yml](adhoc.yml)) - use it to run adhoc checks or to + list existing checks. See the [next section](#the-adhoc-playbook) for a usage example. ## Running @@ -89,6 +90,13 @@ $ ansible-playbook -i playbooks/byo/openshift-checks/adhoc.yml It is valid to specify multiple check tags and individual check names together in a comma-separated list. +To list all of the available checks and tags, run the adhoc playbook without +setting the `openshift_checks` variable: + +```console +$ ansible-playbook -i playbooks/byo/openshift-checks/adhoc.yml +``` + ## Running in a container This repository is built into a Docker image including Ansible so that it can diff --git a/playbooks/byo/openshift-checks/adhoc.yml b/playbooks/byo/openshift-checks/adhoc.yml index 2ece40e96..226bed732 100644 --- a/playbooks/byo/openshift-checks/adhoc.yml +++ b/playbooks/byo/openshift-checks/adhoc.yml @@ -1,4 +1,25 @@ --- +# NOTE: ideally this would be just part of a single play in +# common/openshift-checks/adhoc.yml that lists the existing checks when +# openshift_checks is not set or run the requested checks. However, to actually +# run the checks we need to have the included dependencies to run first and that +# takes time. To speed up listing checks, we use this separate play that runs +# before the include of dependencies to save time and improve the UX. +- name: OpenShift health checks + # NOTE: though the openshift_checks variable could be potentially defined on + # individual hosts while not defined for localhost, we do not support that + # usage. Running this play only in localhost speeds up execution. + hosts: localhost + connection: local + roles: + - openshift_health_checker + vars: + - r_openshift_health_checker_playbook_context: adhoc + pre_tasks: + - name: List known health checks + action: openshift_health_check + when: openshift_checks is undefined or not openshift_checks + - include: ../openshift-cluster/initialize_groups.yml - include: ../../common/openshift-cluster/std_include.yml diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index 05e53333d..898d158a4 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -28,25 +28,32 @@ class ActionModule(ActionBase): result = super(ActionModule, self).run(tmp, task_vars) task_vars = task_vars or {} - # vars are not supportably available in the callback plugin, - # so record any it will need in the result. + # callback plugins cannot read Ansible vars, but we would like + # zz_failure_summary to have access to certain values. We do so by + # storing the information we need in the result. result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context') - if "openshift" not in task_vars: - result["failed"] = True - result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?" - return result - try: known_checks = self.load_known_checks(tmp, task_vars) args = self._task.args requested_checks = normalize(args.get('checks', [])) + + if not requested_checks: + result['failed'] = True + result['msg'] = list_known_checks(known_checks) + return result + resolved_checks = resolve_checks(requested_checks, known_checks.values()) except OpenShiftCheckException as e: result["failed"] = True result["msg"] = str(e) return result + if "openshift" not in task_vars: + result["failed"] = True + result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?" + return result + result["checks"] = check_results = {} user_disabled_checks = normalize(task_vars.get('openshift_disable_check', [])) @@ -96,6 +103,33 @@ class ActionModule(ActionBase): return known_checks +def list_known_checks(known_checks): + """Return text listing the existing checks and tags.""" + # TODO: we could include a description of each check by taking it from a + # check class attribute (e.g., __doc__) when building the message below. + msg = ( + 'This playbook is meant to run health checks, but no checks were ' + 'requested. Set the `openshift_checks` variable to a comma-separated ' + 'list of check names or a YAML list. Available checks:\n {}' + ).format('\n '.join(sorted(known_checks))) + + tag_checks = defaultdict(list) + for cls in known_checks.values(): + for tag in cls.tags: + tag_checks[tag].append(cls.name) + tags = [ + '@{} = {}'.format(tag, ','.join(sorted(checks))) + for tag, checks in tag_checks.items() + ] + + msg += ( + '\n\nTags can be used as a shortcut to select multiple ' + 'checks. Available tags and the checks they select:\n {}' + ).format('\n '.join(sorted(tags))) + + return msg + + def resolve_checks(names, all_checks): """Returns a set of resolved check names. diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py index f5161d6f5..f73eb4f6e 100644 --- a/roles/openshift_health_checker/test/action_plugin_test.py +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -80,7 +80,8 @@ def skipped(result): None, {}, ]) -def test_action_plugin_missing_openshift_facts(plugin, task_vars): +def test_action_plugin_missing_openshift_facts(plugin, task_vars, monkeypatch): + monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) result = plugin.run(tmp=None, task_vars=task_vars) assert failed(result, msg_has=['openshift_facts']) -- cgit v1.2.3 From a28796fc669bd40f9384118f278b62001a15214d Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Mon, 21 Aug 2017 13:37:25 +0200 Subject: List known checks/tags when check name is invalid --- .../action_plugins/openshift_health_check.py | 28 +++++++++++++++------- .../test/action_plugin_test.py | 10 ++------ 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index 898d158a4..3e8962c3c 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -113,21 +113,27 @@ def list_known_checks(known_checks): 'list of check names or a YAML list. Available checks:\n {}' ).format('\n '.join(sorted(known_checks))) + tags = describe_tags(known_checks.values()) + + msg += ( + '\n\nTags can be used as a shortcut to select multiple ' + 'checks. Available tags and the checks they select:\n {}' + ).format('\n '.join(tags)) + + return msg + + +def describe_tags(check_classes): + """Return a sorted list of strings describing tags and the checks they include.""" tag_checks = defaultdict(list) - for cls in known_checks.values(): + for cls in check_classes: for tag in cls.tags: tag_checks[tag].append(cls.name) tags = [ '@{} = {}'.format(tag, ','.join(sorted(checks))) for tag, checks in tag_checks.items() ] - - msg += ( - '\n\nTags can be used as a shortcut to select multiple ' - 'checks. Available tags and the checks they select:\n {}' - ).format('\n '.join(sorted(tags))) - - return msg + return sorted(tags) def resolve_checks(names, all_checks): @@ -157,6 +163,12 @@ def resolve_checks(names, all_checks): if unknown_tag_names: msg.append('Unknown tag names: {}.'.format(', '.join(sorted(unknown_tag_names)))) msg.append('Make sure there is no typo in the playbook and no files are missing.') + # TODO: implement a "Did you mean ...?" when the input is similar to a + # valid check or tag. + msg.append('Known checks:') + msg.append(' {}'.format('\n '.join(sorted(known_check_names)))) + msg.append('Known tags:') + msg.append(' {}'.format('\n '.join(describe_tags(all_checks)))) raise OpenShiftCheckException('\n'.join(msg)) tag_to_checks = defaultdict(set) diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py index f73eb4f6e..2274cbd21 100644 --- a/roles/openshift_health_checker/test/action_plugin_test.py +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -218,24 +218,21 @@ def test_resolve_checks_ok(names, all_checks, expected): assert resolve_checks(names, all_checks) == expected -@pytest.mark.parametrize('names,all_checks,words_in_exception,words_not_in_exception', [ +@pytest.mark.parametrize('names,all_checks,words_in_exception', [ ( ['testA', 'testB'], [], ['check', 'name', 'testA', 'testB'], - ['tag', 'group', '@'], ), ( ['@group'], [], ['tag', 'name', 'group'], - ['check', '@'], ), ( ['testA', 'testB', '@group'], [], ['check', 'name', 'testA', 'testB', 'tag', 'group'], - ['@'], ), ( ['testA', 'testB', '@group'], @@ -245,13 +242,10 @@ def test_resolve_checks_ok(names, all_checks, expected): fake_check('from_group_2', ['preflight', 'group']), ], ['check', 'name', 'testA', 'testB'], - ['tag', 'group', '@'], ), ]) -def test_resolve_checks_failure(names, all_checks, words_in_exception, words_not_in_exception): +def test_resolve_checks_failure(names, all_checks, words_in_exception): with pytest.raises(Exception) as excinfo: resolve_checks(names, all_checks) for word in words_in_exception: assert word in str(excinfo.value) - for word in words_not_in_exception: - assert word not in str(excinfo.value) -- cgit v1.2.3 From 75b1ef8fa6e80e7645a60cef2d4e7640a6c87955 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 3 Aug 2017 10:40:08 +0200 Subject: Handle more exceptions when running checks This prevents an exception in one check from interfering with other checks. Skips checks that raise an exception in their is_active method. Whenever capturing a broad exception in the `is_action` or `run` methods, include traceback information that can be useful in bug reports. --- .../action_plugins/openshift_health_check.py | 49 +++++++++++++--------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index 3e8962c3c..623c3eb8f 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -4,6 +4,7 @@ Ansible action plugin to execute health checks in OpenShift clusters. # pylint: disable=wrong-import-position,missing-docstring,invalid-name import sys import os +import traceback from collections import defaultdict try: @@ -58,26 +59,12 @@ class ActionModule(ActionBase): user_disabled_checks = normalize(task_vars.get('openshift_disable_check', [])) - for check_name in resolved_checks: - display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"])) - check = known_checks[check_name] - - if not check.is_active(): - r = dict(skipped=True, skipped_reason="Not active for this host") - elif check_name in user_disabled_checks: - r = dict(skipped=True, skipped_reason="Disabled by user request") - else: - try: - r = check.run() - except OpenShiftCheckException as e: - r = dict( - failed=True, - msg=str(e), - ) - + for name in resolved_checks: + display.banner("CHECK [{} : {}]".format(name, task_vars["ansible_host"])) + check = known_checks[name] + check_results[name] = run_check(name, check, user_disabled_checks) if check.changed: - r["changed"] = True - check_results[check_name] = r + check_results[name]["changed"] = True result["changed"] = any(r.get("changed") for r in check_results.values()) if any(r.get("failed") for r in check_results.values()): @@ -192,3 +179,27 @@ def normalize(checks): if isinstance(checks, string_types): checks = checks.split(',') return [name.strip() for name in checks if name.strip()] + + +def run_check(name, check, user_disabled_checks): + """Run a single check if enabled and return a result dict.""" + if name in user_disabled_checks: + return dict(skipped=True, skipped_reason="Disabled by user request") + + # pylint: disable=broad-except; capturing exceptions broadly is intentional, + # to isolate arbitrary failures in one check from others. + try: + is_active = check.is_active() + except Exception as exc: + reason = "Could not determine if check should be run, exception: {}".format(exc) + return dict(skipped=True, skipped_reason=reason, exception=traceback.format_exc()) + + if not is_active: + return dict(skipped=True, skipped_reason="Not active for this host") + + try: + return check.run() + except OpenShiftCheckException as exc: + return dict(failed=True, msg=str(exc)) + except Exception as exc: + return dict(failed=True, msg=str(exc), exception=traceback.format_exc()) -- cgit v1.2.3 From 80476c7dea2b3f6f42ebce5a7947af1d3b9dbedb Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Fri, 23 Jun 2017 15:32:19 +0200 Subject: Rewrite failure summary callback plugin The intent is to deduplicate similar errors that happened in many hosts, making the summary more concise. --- .../callback_plugins/zz_failure_summary.py | 291 ++++++++++++--------- roles/openshift_health_checker/test/conftest.py | 1 + .../test/zz_failure_summary_test.py | 70 +++++ 3 files changed, 243 insertions(+), 119 deletions(-) create mode 100644 roles/openshift_health_checker/test/zz_failure_summary_test.py diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py index 9f9fe123a..46cc3b577 100644 --- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py +++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py @@ -1,161 +1,214 @@ -""" -Ansible callback plugin to give a nicely formatted summary of failures. -""" +"""Ansible callback plugin to print a nicely formatted summary of failures. -# Reason: In several locations below we disable pylint protected-access -# for Ansible objects that do not give us any public way -# to access the full details we need to report check failures. -# Status: disabled permanently or until Ansible object has a public API. -# This does leave the code more likely to be broken by future Ansible changes. +The file / module name is prefixed with `zz_` to make this plugin be loaded last +by Ansible, thus making its output the last thing that users see. +""" -from pprint import pformat +from collections import defaultdict from ansible.plugins.callback import CallbackBase from ansible import constants as C from ansible.utils.color import stringc +FAILED_NO_MSG = u'Failed without returning a message.' + + class CallbackModule(CallbackBase): - """ - This callback plugin stores task results and summarizes failures. - The file name is prefixed with `zz_` to make this plugin be loaded last by - Ansible, thus making its output the last thing that users see. - """ + """This callback plugin stores task results and summarizes failures.""" CALLBACK_VERSION = 2.0 CALLBACK_TYPE = 'aggregate' CALLBACK_NAME = 'failure_summary' CALLBACK_NEEDS_WHITELIST = False - _playbook_file = None def __init__(self): super(CallbackModule, self).__init__() self.__failures = [] + self.__playbook_file = '' def v2_playbook_on_start(self, playbook): super(CallbackModule, self).v2_playbook_on_start(playbook) - # re: playbook attrs see top comment # pylint: disable=protected-access - self._playbook_file = playbook._file_name + # pylint: disable=protected-access; Ansible gives us no public API to + # get the file name of the current playbook from a callback plugin. + self.__playbook_file = playbook._file_name def v2_runner_on_failed(self, result, ignore_errors=False): super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors) if not ignore_errors: - self.__failures.append(dict(result=result, ignore_errors=ignore_errors)) + self.__failures.append(result) def v2_playbook_on_stats(self, stats): super(CallbackModule, self).v2_playbook_on_stats(stats) if self.__failures: - self._print_failure_details(self.__failures) - - def _print_failure_details(self, failures): - """Print a summary of failed tasks or checks.""" - self._display.display(u'\nFailure summary:\n') - - width = len(str(len(failures))) - initial_indent_format = u' {{:>{width}}}. '.format(width=width) - initial_indent_len = len(initial_indent_format.format(0)) - subsequent_indent = u' ' * initial_indent_len - subsequent_extra_indent = u' ' * (initial_indent_len + 10) - - for i, failure in enumerate(failures, 1): - entries = _format_failure(failure) - self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), entries[0])) - for entry in entries[1:]: - entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent) - indented = u'{}{}'.format(subsequent_indent, entry) - self._display.display(indented) - - failed_checks = set() - playbook_context = None - # re: result attrs see top comment # pylint: disable=protected-access - for failure in failures: - # Get context from check task result since callback plugins cannot access task vars. - # NOTE: thus context is not known unless checks run. Failures prior to checks running - # don't have playbook_context in the results. But we only use it now when checks fail. - playbook_context = playbook_context or failure['result']._result.get('playbook_context') - failed_checks.update( - name - for name, result in failure['result']._result.get('checks', {}).items() - if result.get('failed') - ) - if failed_checks: - self._print_check_failure_summary(failed_checks, playbook_context) - - def _print_check_failure_summary(self, failed_checks, context): - checks = ','.join(sorted(failed_checks)) - # The purpose of specifying context is to vary the output depending on what the user was - # expecting to happen (based on which playbook they ran). The only use currently is to - # vary the message depending on whether the user was deliberately running checks or was - # trying to install/upgrade and checks are just included. Other use cases may arise. - summary = ( # default to explaining what checks are in the first place - '\n' - 'The execution of "{playbook}"\n' - 'includes checks designed to fail early if the requirements\n' - 'of the playbook are not met. One or more of these checks\n' - 'failed. To disregard these results, you may choose to\n' - 'disable failing checks by setting an Ansible variable:\n\n' - ' openshift_disable_check={checks}\n\n' - 'Failing check names are shown in the failure details above.\n' - 'Some checks may be configurable by variables if your requirements\n' - 'are different from the defaults; consult check documentation.\n' - 'Variables can be set in the inventory or passed on the\n' - 'command line using the -e flag to ansible-playbook.\n\n' - ).format(playbook=self._playbook_file, checks=checks) - if context in ['pre-install', 'health', 'adhoc']: - summary = ( # user was expecting to run checks, less explanation needed - '\n' - 'You may choose to configure or disable failing checks by\n' - 'setting Ansible variables. To disable those above:\n\n' - ' openshift_disable_check={checks}\n\n' - 'Consult check documentation for configurable variables.\n' - 'Variables can be set in the inventory or passed on the\n' - 'command line using the -e flag to ansible-playbook.\n\n' - ).format(checks=checks) - self._display.display(summary) - - -# re: result attrs see top comment # pylint: disable=protected-access -def _format_failure(failure): + self._display.display(failure_summary(self.__failures, self.__playbook_file)) + + +def failure_summary(failures, playbook): + """Return a summary of failed tasks, including details on health checks.""" + if not failures: + return u'' + + # NOTE: because we don't have access to task_vars from callback plugins, we + # store the playbook context in the task result when the + # openshift_health_check action plugin is used, and we use this context to + # customize the error message. + # pylint: disable=protected-access; Ansible gives us no sufficient public + # API on TaskResult objects. + context = next(( + context for context in + (failure._result.get('playbook_context') for failure in failures) + if context + ), None) + + failures = [failure_to_dict(failure) for failure in failures] + failures = deduplicate_failures(failures) + + summary = [u'', u'', u'Failure summary:', u''] + + width = len(str(len(failures))) + initial_indent_format = u' {{:>{width}}}. '.format(width=width) + initial_indent_len = len(initial_indent_format.format(0)) + subsequent_indent = u' ' * initial_indent_len + subsequent_extra_indent = u' ' * (initial_indent_len + 10) + + for i, failure in enumerate(failures, 1): + entries = format_failure(failure) + summary.append(u'\n{}{}'.format(initial_indent_format.format(i), entries[0])) + for entry in entries[1:]: + entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent) + indented = u'{}{}'.format(subsequent_indent, entry) + summary.append(indented) + + failed_checks = set() + for failure in failures: + failed_checks.update(name for name, message in failure['checks']) + if failed_checks: + summary.append(check_failure_footer(failed_checks, context, playbook)) + + return u'\n'.join(summary) + + +def failure_to_dict(failed_task_result): + """Extract information out of a failed TaskResult into a dict. + + The intent is to transform a TaskResult object into something easier to + manipulate. TaskResult is ansible.executor.task_result.TaskResult. + """ + # pylint: disable=protected-access; Ansible gives us no sufficient public + # API on TaskResult objects. + _result = failed_task_result._result + return { + 'host': failed_task_result._host.get_name(), + 'play': play_name(failed_task_result._task), + 'task': failed_task_result.task_name, + 'msg': _result.get('msg', FAILED_NO_MSG), + 'checks': tuple( + (name, result.get('msg', FAILED_NO_MSG)) + for name, result in sorted(_result.get('checks', {}).items()) + if result.get('failed') + ), + } + + +def play_name(obj): + """Given a task or block, return the name of its parent play. + + This is loosely inspired by ansible.playbook.base.Base.dump_me. + """ + # pylint: disable=protected-access; Ansible gives us no sufficient public + # API to implement this. + if not obj: + return '' + if hasattr(obj, '_play'): + return obj._play.get_name() + return play_name(getattr(obj, '_parent')) + + +def deduplicate_failures(failures): + """Group together similar failures from different hosts. + + Returns a new list of failures such that identical failures from different + hosts are grouped together in a single entry. The relative order of failures + is preserved. + """ + groups = defaultdict(list) + for failure in failures: + group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host')) + groups[group_key].append(failure) + result = [] + for failure in failures: + group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host')) + if group_key not in groups: + continue + failure['host'] = tuple(sorted(g_failure['host'] for g_failure in groups.pop(group_key))) + result.append(failure) + return result + + +def format_failure(failure): """Return a list of pretty-formatted text entries describing a failure, including relevant information about it. Expect that the list of text entries will be joined by a newline separator when output to the user.""" - result = failure['result'] - host = result._host.get_name() - play = _get_play(result._task) - if play: - play = play.get_name() - task = result._task.get_name() - msg = result._result.get('msg', u'???') + host = u', '.join(failure['host']) + play = failure['play'] + task = failure['task'] + msg = failure['msg'] + checks = failure['checks'] fields = ( - (u'Host', host), + (u'Hosts', host), (u'Play', play), (u'Task', task), (u'Message', stringc(msg, C.COLOR_ERROR)), ) - if 'checks' in result._result: - fields += ((u'Details', _format_failed_checks(result._result['checks'])),) + if checks: + fields += ((u'Details', format_failed_checks(checks)),) row_format = '{:10}{}' return [row_format.format(header + u':', body) for header, body in fields] -def _format_failed_checks(checks): +def format_failed_checks(checks): """Return pretty-formatted text describing checks that failed.""" - failed_check_msgs = [] - for check, body in checks.items(): - if body.get('failed', False): # only show the failed checks - msg = body.get('msg', u"Failed without returning a message") - failed_check_msgs.append('check "%s":\n%s' % (check, msg)) - if failed_check_msgs: - return stringc("\n\n".join(failed_check_msgs), C.COLOR_ERROR) - else: # something failed but no checks will admit to it, so dump everything - return stringc(pformat(checks), C.COLOR_ERROR) - - -# This is inspired by ansible.playbook.base.Base.dump_me. -# re: play/task/block attrs see top comment # pylint: disable=protected-access -def _get_play(obj): - """Given a task or block, recursively try to find its parent play.""" - if hasattr(obj, '_play'): - return obj._play - if getattr(obj, '_parent'): - return _get_play(obj._parent) + messages = [] + for name, message in checks: + messages.append(u'check "{}":\n{}'.format(name, message)) + return stringc(u'\n\n'.join(messages), C.COLOR_ERROR) + + +def check_failure_footer(failed_checks, context, playbook): + """Return a textual explanation about checks depending on context. + + The purpose of specifying context is to vary the output depending on what + the user was expecting to happen (based on which playbook they ran). The + only use currently is to vary the message depending on whether the user was + deliberately running checks or was trying to install/upgrade and checks are + just included. Other use cases may arise. + """ + checks = ','.join(sorted(failed_checks)) + summary = [u''] + if context in ['pre-install', 'health', 'adhoc']: + # User was expecting to run checks, less explanation needed. + summary.extend([ + u'You may configure or disable checks by setting Ansible ' + u'variables. To disable those above, set:', + u' openshift_disable_check={checks}'.format(checks=checks), + u'Consult check documentation for configurable variables.', + ]) + else: + # User may not be familiar with the checks, explain what checks are in + # the first place. + summary.extend([ + u'The execution of "{playbook}" includes checks designed to fail ' + u'early if the requirements of the playbook are not met. One or ' + u'more of these checks failed. To disregard these results,' + u'explicitly disable checks by setting an Ansible variable:'.format(playbook=playbook), + u' openshift_disable_check={checks}'.format(checks=checks), + u'Failing check names are shown in the failure details above. ' + u'Some checks may be configurable by variables if your requirements ' + u'are different from the defaults; consult check documentation.', + ]) + summary.append( + u'Variables can be set in the inventory or passed on the command line ' + u'using the -e flag to ansible-playbook.' + ) + return u'\n'.join(summary) diff --git a/roles/openshift_health_checker/test/conftest.py b/roles/openshift_health_checker/test/conftest.py index 3cbd65507..244a1f0fa 100644 --- a/roles/openshift_health_checker/test/conftest.py +++ b/roles/openshift_health_checker/test/conftest.py @@ -7,5 +7,6 @@ openshift_health_checker_path = os.path.dirname(os.path.dirname(__file__)) sys.path[1:1] = [ openshift_health_checker_path, os.path.join(openshift_health_checker_path, 'action_plugins'), + os.path.join(openshift_health_checker_path, 'callback_plugins'), os.path.join(openshift_health_checker_path, 'library'), ] diff --git a/roles/openshift_health_checker/test/zz_failure_summary_test.py b/roles/openshift_health_checker/test/zz_failure_summary_test.py new file mode 100644 index 000000000..0fc258133 --- /dev/null +++ b/roles/openshift_health_checker/test/zz_failure_summary_test.py @@ -0,0 +1,70 @@ +from zz_failure_summary import deduplicate_failures + +import pytest + + +@pytest.mark.parametrize('failures,deduplicated', [ + ( + [ + { + 'host': 'master1', + 'msg': 'One or more checks failed', + }, + ], + [ + { + 'host': ('master1',), + 'msg': 'One or more checks failed', + }, + ], + ), + ( + [ + { + 'host': 'master1', + 'msg': 'One or more checks failed', + }, + { + 'host': 'node1', + 'msg': 'One or more checks failed', + }, + ], + [ + { + 'host': ('master1', 'node1'), + 'msg': 'One or more checks failed', + }, + ], + ), + ( + [ + { + 'host': 'node1', + 'msg': 'One or more checks failed', + 'checks': (('test_check', 'error message'),), + }, + { + 'host': 'master2', + 'msg': 'Some error happened', + }, + { + 'host': 'master1', + 'msg': 'One or more checks failed', + 'checks': (('test_check', 'error message'),), + }, + ], + [ + { + 'host': ('master1', 'node1'), + 'msg': 'One or more checks failed', + 'checks': (('test_check', 'error message'),), + }, + { + 'host': ('master2',), + 'msg': 'Some error happened', + }, + ], + ), +]) +def test_deduplicate_failures(failures, deduplicated): + assert deduplicate_failures(failures) == deduplicated -- cgit v1.2.3 From 6ba5603ea8ef2664210970f7c7901b86ebe984f8 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 3 Aug 2017 10:52:51 +0200 Subject: Handle exceptions in failure summary cb plugin This serves two purposes: - Gracefully omit the summary if there was an error computing it, no confusion to the regular end user. - Provide a stacktrace of the error when running verbose, giving developers or users reporting bugs a better insight of what went wrong, as opposed to Ansible's opaque handling of errors in callback plugins. --- .../callback_plugins/zz_failure_summary.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py index 46cc3b577..349655966 100644 --- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py +++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py @@ -5,6 +5,7 @@ by Ansible, thus making its output the last thing that users see. """ from collections import defaultdict +import traceback from ansible.plugins.callback import CallbackBase from ansible import constants as C @@ -40,8 +41,16 @@ class CallbackModule(CallbackBase): def v2_playbook_on_stats(self, stats): super(CallbackModule, self).v2_playbook_on_stats(stats) - if self.__failures: - self._display.display(failure_summary(self.__failures, self.__playbook_file)) + # pylint: disable=broad-except; capturing exceptions broadly is + # intentional, to isolate arbitrary failures in this callback plugin. + try: + if self.__failures: + self._display.display(failure_summary(self.__failures, self.__playbook_file)) + except Exception: + msg = stringc( + u'An error happened while generating a summary of failures:\n' + u'{}'.format(traceback.format_exc()), C.COLOR_WARN) + self._display.v(msg) def failure_summary(failures, playbook): -- cgit v1.2.3 From 8216ea6e6eff5f60ad3aeddd602f2ab3979117df Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 3 Aug 2017 15:13:01 +0200 Subject: Make pylint disables more specific And beautify the code a bit. --- .../action_plugins/openshift_health_check.py | 41 ++++++++++++++-------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index 623c3eb8f..f80a53030 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -1,29 +1,33 @@ """ Ansible action plugin to execute health checks in OpenShift clusters. """ -# pylint: disable=wrong-import-position,missing-docstring,invalid-name import sys import os import traceback from collections import defaultdict +from ansible.plugins.action import ActionBase +from ansible.module_utils.six import string_types + try: from __main__ import display except ImportError: + # pylint: disable=ungrouped-imports; this is the standard way how to import + # the default display object in Ansible action plugins. from ansible.utils.display import Display display = Display() -from ansible.plugins.action import ActionBase -from ansible.module_utils.six import string_types - # Augment sys.path so that we can import checks from a directory relative to # this callback plugin. sys.path.insert(1, os.path.dirname(os.path.dirname(__file__))) +# pylint: disable=wrong-import-position; the import statement must come after +# the manipulation of sys.path. from openshift_checks import OpenShiftCheck, OpenShiftCheckException, load_checks # noqa: E402 class ActionModule(ActionBase): + """Action plugin to execute health checks.""" def run(self, tmp=None, task_vars=None): result = super(ActionModule, self).run(tmp, task_vars) @@ -45,9 +49,9 @@ class ActionModule(ActionBase): return result resolved_checks = resolve_checks(requested_checks, known_checks.values()) - except OpenShiftCheckException as e: + except OpenShiftCheckException as exc: result["failed"] = True - result["msg"] = str(e) + result["msg"] = str(exc) return result if "openshift" not in task_vars: @@ -74,19 +78,21 @@ class ActionModule(ActionBase): return result def load_known_checks(self, tmp, task_vars): + """Find all existing checks and return a mapping of names to instances.""" load_checks() known_checks = {} for cls in OpenShiftCheck.subclasses(): - check_name = cls.name - if check_name in known_checks: - other_cls = known_checks[check_name].__class__ - raise OpenShiftCheckException( - "non-unique check name '{}' in: '{}.{}' and '{}.{}'".format( - check_name, - cls.__module__, cls.__name__, - other_cls.__module__, other_cls.__name__)) - known_checks[check_name] = cls(execute_module=self._execute_module, tmp=tmp, task_vars=task_vars) + name = cls.name + if name in known_checks: + other_cls = known_checks[name].__class__ + msg = "non-unique check name '{}' in: '{}' and '{}'".format( + name, + full_class_name(cls), + full_class_name(other_cls), + ) + raise OpenShiftCheckException(msg) + known_checks[name] = cls(execute_module=self._execute_module, tmp=tmp, task_vars=task_vars) return known_checks @@ -203,3 +209,8 @@ def run_check(name, check, user_disabled_checks): return dict(failed=True, msg=str(exc)) except Exception as exc: return dict(failed=True, msg=str(exc), exception=traceback.format_exc()) + + +def full_class_name(cls): + """Return the name of a class prefixed with its module name.""" + return '{}.{}'.format(cls.__module__, cls.__name__) -- cgit v1.2.3 From 51645eab599b0b23c582e22a9e84ce9423362ab1 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Mon, 21 Aug 2017 14:27:19 +0200 Subject: Update error message: s/non-unique/duplicate --- .../action_plugins/openshift_health_check.py | 8 +++----- roles/openshift_health_checker/test/action_plugin_test.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py index f80a53030..8d35db6b5 100644 --- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -86,12 +86,10 @@ class ActionModule(ActionBase): name = cls.name if name in known_checks: other_cls = known_checks[name].__class__ - msg = "non-unique check name '{}' in: '{}' and '{}'".format( - name, - full_class_name(cls), - full_class_name(other_cls), + raise OpenShiftCheckException( + "duplicate check name '{}' in: '{}' and '{}'" + "".format(name, full_class_name(cls), full_class_name(other_cls)) ) - raise OpenShiftCheckException(msg) known_checks[name] = cls(execute_module=self._execute_module, tmp=tmp, task_vars=task_vars) return known_checks diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py index 2274cbd21..c109ebd24 100644 --- a/roles/openshift_health_checker/test/action_plugin_test.py +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -95,7 +95,7 @@ def test_action_plugin_cannot_load_checks_with_the_same_name(plugin, task_vars, result = plugin.run(tmp=None, task_vars=task_vars) - assert failed(result, msg_has=['unique', 'duplicate_name', 'FakeCheck']) + assert failed(result, msg_has=['duplicate', 'duplicate_name', 'FakeCheck']) def test_action_plugin_skip_non_active_checks(plugin, task_vars, monkeypatch): -- cgit v1.2.3