13 files changed, 168 insertions, 68 deletions
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
index a62e4331e..0390dc82e 100644
--- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -38,14 +38,13 @@ class ActionModule(ActionBase):
 
         try:
             known_checks = self.load_known_checks()
+            args = self._task.args
+            resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
         except OpenShiftCheckException as e:
             result["failed"] = True
             result["msg"] = str(e)
             return result
 
-        args = self._task.args
-        resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
-
         result["checks"] = check_results = {}
 
         user_disabled_checks = [
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
index 64c29a8d9..443b76ea1 100644
--- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -39,7 +39,8 @@ class CallbackModule(CallbackBase):
 
     def v2_runner_on_failed(self, result, ignore_errors=False):
         super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
-        self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
+        if not ignore_errors:
+            self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
 
     def v2_playbook_on_stats(self, stats):
         super(CallbackModule, self).v2_playbook_on_stats(stats)
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index 962148cb8..e93e81efa 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -1,9 +1,12 @@
-# pylint: disable=missing-docstring
+"""Check that there is enough disk space in predefined paths."""
+
+import os.path
+import tempfile
+
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
-from openshift_checks.mixins import NotContainerizedMixin
 
 
-class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
+class DiskAvailability(OpenShiftCheck):
     """Check that recommended disk space is available before a first-time install."""
 
     name = "disk_availability"
@@ -12,56 +15,101 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
     # Values taken from the official installation documentation:
     # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
     recommended_disk_space_bytes = {
-        "masters": 40 * 10**9,
-        "nodes": 15 * 10**9,
-        "etcd": 20 * 10**9,
+        '/var': {
+            'masters': 40 * 10**9,
+            'nodes': 15 * 10**9,
+            'etcd': 20 * 10**9,
+        },
+        # Client binaries are copied into this path,
+        # see roles/openshift_cli/library/openshift_container_binary_sync.py.
+        '/usr/local/bin': {
+            'masters': 1 * 10**9,
+            'nodes': 1 * 10**9,
+            'etcd': 1 * 10**9,
+        },
+        # Used as temporary storage in several cases.
+        tempfile.gettempdir(): {
+            'masters': 1 * 10**9,
+            'nodes': 1 * 10**9,
+            'etcd': 1 * 10**9,
+        },
     }
 
     @classmethod
     def is_active(cls, task_vars):
         """Skip hosts that do not have recommended disk space requirements."""
         group_names = get_var(task_vars, "group_names", default=[])
-        has_disk_space_recommendation = bool(set(group_names).intersection(cls.recommended_disk_space_bytes))
+        active_groups = set()
+        for recommendation in cls.recommended_disk_space_bytes.values():
+            active_groups.update(recommendation.keys())
+        has_disk_space_recommendation = bool(active_groups.intersection(group_names))
         return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
 
     def run(self, tmp, task_vars):
         group_names = get_var(task_vars, "group_names")
         ansible_mounts = get_var(task_vars, "ansible_mounts")
-        free_bytes = self.openshift_available_disk(ansible_mounts)
-
-        recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
-        configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9
-        min_free_bytes = configured_min or recommended_min
-
-        if free_bytes < min_free_bytes:
-            return {
-                'failed': True,
-                'msg': (
-                    'Available disk space ({:.1f} GB) for the volume containing '
-                    '"/var" is below minimum recommended space ({:.1f} GB)'
-                ).format(float(free_bytes) / 10**9, float(min_free_bytes) / 10**9)
+        ansible_mounts = {mount['mount']: mount for mount in ansible_mounts}
+
+        user_config = get_var(task_vars, "openshift_check_min_host_disk_gb", default={})
+        try:
+            # For backwards-compatibility, if openshift_check_min_host_disk_gb
+            # is a number, then it overrides the required config for '/var'.
+            number = float(user_config)
+            user_config = {
+                '/var': {
+                    'masters': number,
+                    'nodes': number,
+                    'etcd': number,
+                },
             }
+        except TypeError:
+            # If it is not a number, then it should be a nested dict.
+            pass
+
+        # TODO: as suggested in
+        # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
+        # maybe we could support checking disk availability in paths that are
+        # not part of the official recommendation but present in the user
+        # configuration.
+        for path, recommendation in self.recommended_disk_space_bytes.items():
+            free_bytes = self.free_bytes(path, ansible_mounts)
+            recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
+
+            config = user_config.get(path, {})
+            # NOTE: the user config is in GB, but we compare bytes, thus the
+            # conversion.
+            config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
+            recommended_bytes = config_bytes or recommended_bytes
+
+            if free_bytes < recommended_bytes:
+                free_gb = float(free_bytes) / 10**9
+                recommended_gb = float(recommended_bytes) / 10**9
+                return {
+                    'failed': True,
+                    'msg': (
+                        'Available disk space in "{}" ({:.1f} GB) '
+                        'is below minimum recommended ({:.1f} GB)'
+                    ).format(path, free_gb, recommended_gb)
+                }
 
         return {}
 
     @staticmethod
-    def openshift_available_disk(ansible_mounts):
-        """Determine the available disk space for an OpenShift installation.
-
-        ansible_mounts should be a list of dicts like the 'setup' Ansible module
-        returns.
-        """
-        # priority list in descending order
-        supported_mnt_paths = ["/var", "/"]
-        available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+    def free_bytes(path, ansible_mounts):
+        """Return the size available in path based on ansible_mounts."""
+        mount_point = path
+        # arbitrary value to prevent an infinite loop, in the unlikely case that '/'
+        # is not in ansible_mounts.
+        max_depth = 32
+        while mount_point not in ansible_mounts and max_depth > 0:
+            mount_point = os.path.dirname(mount_point)
+            max_depth -= 1
 
         try:
-            for path in supported_mnt_paths:
-                if path in available_mnts:
-                    return available_mnts[path]["size_available"]
+            free_bytes = ansible_mounts[mount_point]['size_available']
         except KeyError:
-            pass
+            known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(ansible_mounts)) or 'none'
+            msg = 'Unable to determine disk availability for "{}". Known mount points: {}.'
+            raise OpenShiftCheckException(msg.format(path, known_mounts))
 
-        paths = ''.join(sorted(available_mnts)) or 'none'
-        msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths)
-        raise OpenShiftCheckException(msg)
+        return free_bytes
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index 60aacf715..bde81ad2c 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -94,7 +94,8 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         required = set()
         deployment_type = get_var(task_vars, "openshift_deployment_type")
         host_groups = get_var(task_vars, "group_names")
-        image_tag = get_var(task_vars, "openshift_image_tag")
+        # containerized etcd may not have openshift_image_tag, see bz 1466622
+        image_tag = get_var(task_vars, "openshift_image_tag", default="latest")
         image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
         if not image_info:
             return required
@@ -169,7 +170,7 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
             registries = [registry]
 
         for registry in registries:
-            args = {"_raw_params": "skopeo inspect docker://{}/{}".format(registry, image)}
+            args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)}
             result = self.execute_module("command", args, task_vars=task_vars)
             if result.get("rc", 0) == 0 and not result.get("failed"):
                 return True
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index 2bd615457..e80691ef3 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -17,7 +17,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
     tags = ["pre-install", "health", "preflight"]
 
     dependencies = ["python-docker-py"]
-    storage_drivers = ["devicemapper", "overlay2"]
+    storage_drivers = ["devicemapper", "overlay", "overlay2"]
     max_thinpool_data_usage_percent = 90.0
     max_thinpool_meta_usage_percent = 90.0
 
@@ -143,7 +143,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
                 "so the available storage in the VG cannot be determined.".format(pool)
             )
         vg_name = match.groups()[0].replace("--", "-")
-        vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
+        vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
         # should return free space like "  12.00g" if the VG exists; empty if it does not
 
         ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars=task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 442f407b1..551e8dfa0 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -62,7 +62,7 @@ class Kibana(LoggingCheck):
             # TODO(lmeyer): give users option to validate certs
             status_code=302,
         )
-        result = self.execute_module('uri', args, task_vars)
+        result = self.execute_module('uri', args, None, task_vars)
         if result.get('failed'):
             return result['msg']
         return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 05b4d300c..6e951e82c 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -54,12 +54,12 @@ class LoggingCheck(OpenShiftCheck):
         """Returns: list of pods not in a ready and running state"""
         return [
             pod for pod in pods
-            if any(
+            if not pod.get("status", {}).get("containerStatuses") or any(
                 container['ready'] is False
                 for container in pod['status']['containerStatuses']
             ) or not any(
                 condition['type'] == 'Ready' and condition['status'] == 'True'
-                for condition in pod['status']['conditions']
+                for condition in pod['status'].get('conditions', [])
             )
         ]
 
@@ -78,7 +78,7 @@ class LoggingCheck(OpenShiftCheck):
             "extra_args": list(extra_args) if extra_args else [],
         }
 
-        result = execute_module("ocutil", args, task_vars)
+        result = execute_module("ocutil", args, None, task_vars)
         if result.get("failed"):
             msg = (
                 'Unexpected error using `oc` to validate the logging stack components.\n'
diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py
index 6ebf0ebb2..9383b233c 100644
--- a/roles/openshift_health_checker/test/action_plugin_test.py
+++ b/roles/openshift_health_checker/test/action_plugin_test.py
@@ -59,7 +59,7 @@ def failed(result, msg_has=None):
     if msg_has is not None:
         assert 'msg' in result
         for term in msg_has:
-            assert term in result['msg']
+            assert term.lower() in result['msg'].lower()
     return result.get('failed', False)
 
 
@@ -178,6 +178,16 @@ def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):
     assert not skipped(result)
 
 
+def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch):
+    monkeypatch.setattr(plugin, 'load_known_checks', lambda: {})
+
+    result = plugin.run(tmp=None, task_vars=task_vars)
+
+    assert failed(result, msg_has=['unknown', 'name'])
+    assert not changed(result)
+    assert not skipped(result)
+
+
 @pytest.mark.parametrize('names,all_checks,expected', [
     ([], [], set()),
     (
diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py
index b353fa610..945b9eafc 100644
--- a/roles/openshift_health_checker/test/disk_availability_test.py
+++ b/roles/openshift_health_checker/test/disk_availability_test.py
@@ -3,22 +3,19 @@ import pytest
 from openshift_checks.disk_availability import DiskAvailability, OpenShiftCheckException
 
 
-@pytest.mark.parametrize('group_names,is_containerized,is_active', [
-    (['masters'], False, True),
-    # ensure check is skipped on containerized installs
-    (['masters'], True, False),
-    (['nodes'], False, True),
-    (['etcd'], False, True),
-    (['masters', 'nodes'], False, True),
-    (['masters', 'etcd'], False, True),
-    ([], False, False),
-    (['lb'], False, False),
-    (['nfs'], False, False),
+@pytest.mark.parametrize('group_names,is_active', [
+    (['masters'], True),
+    (['nodes'], True),
+    (['etcd'], True),
+    (['masters', 'nodes'], True),
+    (['masters', 'etcd'], True),
+    ([], False),
+    (['lb'], False),
+    (['nfs'], False),
 ])
-def test_is_active(group_names, is_containerized, is_active):
+def test_is_active(group_names, is_active):
     task_vars = dict(
         group_names=group_names,
-        openshift=dict(common=dict(is_containerized=is_containerized)),
     )
     assert DiskAvailability.is_active(task_vars=task_vars) == is_active
 
@@ -38,7 +35,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
     with pytest.raises(OpenShiftCheckException) as excinfo:
         check.run(tmp=None, task_vars=task_vars)
 
-    for word in 'determine available disk'.split() + extra_words:
+    for word in 'determine disk availability'.split() + extra_words:
         assert word in str(excinfo.value)
 
 
@@ -81,7 +78,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
         [{
             # not enough space on / ...
             'mount': '/',
-            'size_available': 0,
+            'size_available': 2 * 10**9,
         }, {
             # ... but enough on /var
             'mount': '/var',
diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py
index 0a7c0f8d3..3b9e097fb 100644
--- a/roles/openshift_health_checker/test/docker_image_availability_test.py
+++ b/roles/openshift_health_checker/test/docker_image_availability_test.py
@@ -259,3 +259,17 @@ def test_required_images(deployment_type, is_containerized, groups, oreg_url, ex
     )
 
     assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
+
+
+def test_containerized_etcd():
+    task_vars = dict(
+        openshift=dict(
+            common=dict(
+                is_containerized=True,
+            ),
+        ),
+        openshift_deployment_type="origin",
+        group_names=['etcd'],
+    )
+    expected = set(['registry.access.redhat.com/rhel7/etcd'])
+    assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py
index 876614b1d..bb25e3f66 100644
--- a/roles/openshift_health_checker/test/docker_storage_test.py
+++ b/roles/openshift_health_checker/test/docker_storage_test.py
@@ -65,8 +65,8 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
         dict(info={
             "Driver": "overlay",
         }),
-        True,
-        ["unsupported Docker storage driver"],
+        False,
+        [],
     ),
     (
         dict(info={
diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py
index 19140a1b6..40a5d19d8 100644
--- a/roles/openshift_health_checker/test/kibana_test.py
+++ b/roles/openshift_health_checker/test/kibana_test.py
@@ -169,7 +169,7 @@ def test_get_kibana_url(route, expect_url, expect_error):
     ),
 ])
 def test_verify_url_internal_failure(exec_result, expect):
-    check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result))
+    check = Kibana(execute_module=lambda module_name, args, tmp, task_vars: dict(failed=True, msg=exec_result))
     check._get_kibana_url = lambda task_vars: ('url', None)
 
     error = check._check_kibana_route({})
diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py
index b6db34fe3..128b76b12 100644
--- a/roles/openshift_health_checker/test/logging_check_test.py
+++ b/roles/openshift_health_checker/test/logging_check_test.py
@@ -50,6 +50,16 @@ plain_kibana_pod = {
     }
 }
 
+plain_kibana_pod_no_containerstatus = {
+    "metadata": {
+        "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+        "name": "logging-kibana-1",
+    },
+    "status": {
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+
 fluentd_pod_node1 = {
     "metadata": {
         "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
@@ -80,7 +90,7 @@ plain_curator_pod = {
     ("Permission denied", "Unexpected error using `oc`"),
 ])
 def test_oc_failure(problem, expect):
-    def execute_module(module_name, args, task_vars):
+    def execute_module(module_name, args, tmp, task_vars):
         if module_name == "ocutil":
             return dict(failed=True, result=problem)
         return dict(changed=False)
@@ -135,3 +145,23 @@ def test_get_pods_for_component(pod_output, expect_pods, expect_error):
         {}
     )
     assert_error(error, expect_error)
+
+
+@pytest.mark.parametrize('name, pods, expected_pods', [
+    (
+        'test single pod found, scheduled, but no containerStatuses field',
+        [plain_kibana_pod_no_containerstatus],
+        [plain_kibana_pod_no_containerstatus],
+    ),
+    (
+        'set of pods has at least one pod with containerStatuses (scheduled); should still fail',
+        [plain_kibana_pod_no_containerstatus, plain_kibana_pod],
+        [plain_kibana_pod_no_containerstatus],
+    ),
+
+], ids=lambda argvals: argvals[0])
+def test_get_not_running_pods_no_container_status(name, pods, expected_pods):
+    check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: '')
+    result = check.not_running_pods(pods)
+
+    assert result == expected_pods