17 files changed, 386 insertions, 118 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 07ec6f7ef..b7b16e0ea 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -2,14 +2,18 @@
 Health checks for OpenShift clusters.
 """
 
+import json
 import operator
 import os
+import time
+import collections
 
 from abc import ABCMeta, abstractmethod, abstractproperty
 from importlib import import_module
 
 from ansible.module_utils import six
 from ansible.module_utils.six.moves import reduce  # pylint: disable=import-error,redefined-builtin
+from ansible.module_utils.six import string_types
 from ansible.plugins.filter.core import to_bool as ansible_to_bool
 
 
@@ -27,7 +31,7 @@ class OpenShiftCheckException(Exception):
 
 
 class OpenShiftCheckExceptionList(OpenShiftCheckException):
-    """A container for multiple logging errors that may be detected in one check."""
+    """A container for multiple errors that may be detected in one check."""
     def __init__(self, errors):
         self.errors = errors
         super(OpenShiftCheckExceptionList, self).__init__(
@@ -40,26 +44,56 @@ class OpenShiftCheckExceptionList(OpenShiftCheckException):
         return self.errors[index]
 
 
+FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename")
+
+
+# pylint: disable=too-many-instance-attributes; all represent significantly different state.
+# Arguably they could be separated into two hashes, one for storing parameters, and one for
+# storing result state; but that smells more like clutter than clarity.
 @six.add_metaclass(ABCMeta)
 class OpenShiftCheck(object):
-    """
-    A base class for defining checks for an OpenShift cluster environment.
+    """A base class for defining checks for an OpenShift cluster environment.
 
-    Expect optional params: method execute_module, dict task_vars, and string tmp.
+    Optional init params: method execute_module, dict task_vars, and string tmp.
     execute_module is expected to have a signature compatible with _execute_module
     from ansible plugins/action/__init__.py, e.g.:
     def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
     This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
     which provides the check's stored task_vars and tmp.
-    """
 
-    def __init__(self, execute_module=None, task_vars=None, tmp=None):
+    Optional init param: want_full_results
+    If the check can gather logs, tarballs, etc., do so when True; but no need to spend
+    the time if they're not wanted (won't be written to output directory).
+    """
+    # pylint: disable=too-many-arguments
+    def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False,
+                 templar=None):
+        # store a method for executing ansible modules from the check
         self._execute_module = execute_module
+        # the task variables and tmpdir passed into the health checker task
         self.task_vars = task_vars or {}
+        # We may need to template some task_vars
+        self._templar = templar
         self.tmp = tmp
+        # a boolean for disabling the gathering of results (files, computations) that won't
+        # actually be recorded/used
+        self.want_full_results = want_full_results
+
+        # mainly for testing purposes; see execute_module_with_retries
+        self._module_retries = 3
+        self._module_retry_interval = 5  # seconds
 
+        # state to be recorded for inspection after the check runs:
+        #
         # set to True when the check changes the host, for accurate total "changed" count
         self.changed = False
+        # list of OpenShiftCheckException for check to report (alternative to returning a failed result)
+        self.failures = []
+        # list of FileToSave - files the check specifies to be written locally if so configured
+        self.files_to_save = []
+        # log messages for the check - [context, msg] pairs where msg is serializable.
+        # These are intended to be a sequential record of what the check observed and determined.
+        self.logs = []
 
     @abstractproperty
     def name(self):
@@ -80,9 +114,20 @@ class OpenShiftCheck(object):
         """Returns true if this check applies to the ansible-playbook run."""
         return True
 
+    def is_first_master(self):
+        """Determine if running on first master. Returns: bool"""
+        masters = self.get_var("groups", "oo_first_master", default=None) or [None]
+        return masters[0] == self.get_var("ansible_host")
+
     @abstractmethod
     def run(self):
-        """Executes a check, normally implemented as a module."""
+        """Executes a check against a host and returns a result hash similar to Ansible modules.
+
+        Actually the direction ahead is to record state in the attributes and
+        not bother building a result hash. Instead, return an empty hash and let
+        the action plugin fill it in. Or raise an OpenShiftCheckException.
+        Returning a hash may become deprecated if it does not prove necessary.
+        """
         return {}
 
     @classmethod
@@ -94,7 +139,43 @@ class OpenShiftCheck(object):
             for subclass in subclass.subclasses():
                 yield subclass
 
-    def execute_module(self, module_name=None, module_args=None):
+    def register_failure(self, error):
+        """Record in the check that a failure occurred.
+
+        Recorded failures are merged into the result hash for now. They are also saved to the output directory
+        (if provided) as <check>.failures.json and registered as a log entry for context in <check>.log.json.
+        """
+        # It should be an exception; make it one if not
+        if not isinstance(error, OpenShiftCheckException):
+            error = OpenShiftCheckException(str(error))
+        self.failures.append(error)
+        # duplicate it in the logs so it can be seen in the context of any
+        # information that led to the failure
+        self.register_log("failure: " + error.name, str(error))
+
+    def register_log(self, context, msg):
+        """Record an entry for the check log.
+
+        Notes are intended to serve as context for the whole sequence of what the check observed.
+        They are saved as an ordered list in a local check log file.
+        They are not to be included in the result or in the ansible log; they are just for the record.
+        """
+        self.logs.append([context, msg])
+
+    def register_file(self, filename, contents=None, remote_filename=""):
+        """Record a file that a check makes available to be saved individually to output directory.
+
+        Either file contents should be passed in, or a file to be copied from the remote host
+        should be specified. Contents that are not a string are to be serialized as JSON.
+
+        NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning
+        you should avoid using this on huge files (more than say 10M).
+        """
+        if contents is None and not remote_filename:
+            raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.")
+        self.files_to_save.append(FileToSave(filename, contents, remote_filename))
+
+    def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True):
         """Invoke an Ansible module from a check.
 
         Invoke stored _execute_module, normally copied from the action
@@ -106,6 +187,12 @@ class OpenShiftCheck(object):
         Ansible version).
 
         So e.g. check.execute_module("foo", dict(arg1=...))
+
+        save_as_name specifies a file name for saving the result to an output directory,
+        if needed, and is intended to uniquely identify the result of invoking execute_module.
+        If not provided, the module name will be used.
+        If register is set False, then the result won't be registered in logs or files to save.
+
         Return: result hash from module execution.
         """
         if self._execute_module is None:
@@ -113,7 +200,33 @@ class OpenShiftCheck(object):
                 self.__class__.__name__ +
                 " invoked execute_module without providing the method at initialization."
             )
-        return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+        result = self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+        if result.get("changed"):
+            self.changed = True
+        for output in ["result", "stdout"]:
+            # output is often JSON; attempt to decode
+            try:
+                result[output + "_json"] = json.loads(result[output])
+            except (KeyError, ValueError):
+                pass
+
+        if register:
+            self.register_log("execute_module: " + module_name, result)
+            self.register_file(save_as_name or module_name + ".json", result)
+        return result
+
+    def execute_module_with_retries(self, module_name, module_args):
+        """Run execute_module and retry on failure."""
+        result = {}
+        tries = 0
+        while True:
+            res = self.execute_module(module_name, module_args)
+            if tries > self._module_retries or not res.get("failed"):
+                result.update(res)
+                return result
+            result["last_failed"] = res
+            tries += 1
+            time.sleep(self._module_retry_interval)
 
     def get_var(self, *keys, **kwargs):
         """Get deeply nested values from task_vars.
@@ -171,8 +284,23 @@ class OpenShiftCheck(object):
                 'There is a bug in this check. While trying to convert variable \n'
                 '  "{var}={value}"\n'
                 'the given converter cannot be used or failed unexpectedly:\n'
-                '{error}'.format(var=".".join(keys), value=value, error=error)
-            )
+                '{type}: {error}'.format(
+                    var=".".join(keys),
+                    value=value,
+                    type=error.__class__.__name__,
+                    error=error
+                ))
+
+    @staticmethod
+    def normalize(name_list):
+        """Return a clean list of names.
+
+        The input may be a comma-separated string or a sequence. Leading and
+        trailing whitespace characters are removed. Empty items are discarded.
+        """
+        if isinstance(name_list, string_types):
+            name_list = name_list.split(',')
+        return [name.strip() for name in name_list if name.strip()]
 
     @staticmethod
     def get_major_minor_version(openshift_image_tag):
@@ -214,7 +342,9 @@ class OpenShiftCheck(object):
             mount_point = os.path.dirname(mount_point)
 
         try:
-            return mount_for_path[mount_point]
+            mount = mount_for_path[mount_point]
+            self.register_log("mount point for " + path, mount)
+            return mount
         except KeyError:
             known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
             raise OpenShiftCheckException(
diff --git a/roles/openshift_health_checker/openshift_checks/diagnostics.py b/roles/openshift_health_checker/openshift_checks/diagnostics.py
new file mode 100644
index 000000000..1cfdc1129
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/diagnostics.py
@@ -0,0 +1,62 @@
+"""
+A check to run relevant diagnostics via `oc adm diagnostics`.
+"""
+
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+DIAGNOSTIC_LIST = (
+    "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles "
+    "ClusterRouter DiagnosticPod NetworkCheck"
+).split()
+
+
+class DiagnosticCheck(OpenShiftCheck):
+    """A check to run relevant diagnostics via `oc adm diagnostics`."""
+
+    name = "diagnostics"
+    tags = ["health"]
+
+    def is_active(self):
+        return super(DiagnosticCheck, self).is_active() and self.is_first_master()
+
+    def run(self):
+        if self.exec_diagnostic("ConfigContexts"):
+            # only run the other diagnostics if that one succeeds (otherwise, all will fail)
+            diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST)
+            for diagnostic in self.normalize(diagnostics):
+                self.exec_diagnostic(diagnostic)
+        return {}
+
+    def exec_diagnostic(self, diagnostic):
+        """
+        Execute an 'oc adm diagnostics' command on the remote host.
+        Raises OcNotFound or registers OcDiagFailed.
+        Returns True on success or False on failure (non-zero rc).
+        """
+        config_base = self.get_var("openshift.common.config_base")
+        args = {
+            "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+            "cmd": "adm diagnostics",
+            "extra_args": [diagnostic],
+        }
+
+        result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json")
+        self.register_file(diagnostic + ".txt", result['result'])
+        if result.get("failed"):
+            if result['result'] == '[Errno 2] No such file or directory':
+                raise OpenShiftCheckException(
+                    "OcNotFound",
+                    "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+                    "Has an installation been run on this host yet?"
+                )
+
+            self.register_failure(OpenShiftCheckException(
+                'OcDiagFailed',
+                'The {diag} diagnostic reported an error:\n'
+                '{error}'.format(diag=diagnostic, error=result['result'])
+            ))
+            return False
+        return True
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index 6d1dea9ce..87e6146d4 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -1,6 +1,7 @@
 """Check that there is enough disk space in predefined paths."""
 
 import tempfile
+import os.path
 
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException
 
@@ -15,31 +16,31 @@ class DiskAvailability(OpenShiftCheck):
     # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
     recommended_disk_space_bytes = {
         '/var': {
-            'masters': 40 * 10**9,
-            'nodes': 15 * 10**9,
-            'etcd': 20 * 10**9,
+            'oo_masters_to_config': 40 * 10**9,
+            'oo_nodes_to_config': 15 * 10**9,
+            'oo_etcd_to_config': 20 * 10**9,
         },
         # Used to copy client binaries into,
         # see roles/openshift_cli/library/openshift_container_binary_sync.py.
         '/usr/local/bin': {
-            'masters': 1 * 10**9,
-            'nodes': 1 * 10**9,
-            'etcd': 1 * 10**9,
+            'oo_masters_to_config': 1 * 10**9,
+            'oo_nodes_to_config': 1 * 10**9,
+            'oo_etcd_to_config': 1 * 10**9,
         },
         # Used as temporary storage in several cases.
         tempfile.gettempdir(): {
-            'masters': 1 * 10**9,
-            'nodes': 1 * 10**9,
-            'etcd': 1 * 10**9,
+            'oo_masters_to_config': 1 * 10**9,
+            'oo_nodes_to_config': 1 * 10**9,
+            'oo_etcd_to_config': 1 * 10**9,
         },
     }
 
     # recommended disk space for each location under an upgrade context
     recommended_disk_upgrade_bytes = {
         '/var': {
-            'masters': 10 * 10**9,
-            'nodes': 5 * 10 ** 9,
-            'etcd': 5 * 10 ** 9,
+            'oo_masters_to_config': 10 * 10**9,
+            'oo_nodes_to_config': 5 * 10 ** 9,
+            'oo_etcd_to_config': 5 * 10 ** 9,
         },
     }
 
@@ -61,15 +62,19 @@ class DiskAvailability(OpenShiftCheck):
             number = float(user_config)
             user_config = {
                 '/var': {
-                    'masters': number,
-                    'nodes': number,
-                    'etcd': number,
+                    'oo_masters_to_config': number,
+                    'oo_nodes_to_config': number,
+                    'oo_etcd_to_config': number,
                 },
             }
         except TypeError:
             # If it is not a number, then it should be a nested dict.
             pass
 
+        self.register_log("recommended thresholds", self.recommended_disk_space_bytes)
+        if user_config:
+            self.register_log("user-configured thresholds", user_config)
+
         # TODO: as suggested in
         # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
         # maybe we could support checking disk availability in paths that are
@@ -113,21 +118,25 @@ class DiskAvailability(OpenShiftCheck):
                             'in your Ansible inventory, and lower the recommended disk space availability\n'
                             'if necessary for this upgrade.').format(config_bytes)
 
-                return {
-                    'failed': True,
-                    'msg': (
-                        'Available disk space in "{}" ({:.1f} GB) '
-                        'is below minimum recommended ({:.1f} GB)'
-                    ).format(path, free_gb, recommended_gb)
-                }
+                self.register_failure(msg)
 
         return {}
 
+    def find_ansible_submounts(self, path):
+        """Return a list of ansible_mounts that are below the given path."""
+        base = os.path.join(path, "")
+        return [
+            mount
+            for mount in self.get_var("ansible_mounts")
+            if mount["mount"].startswith(base)
+        ]
+
     def free_bytes(self, path):
         """Return the size available in path based on ansible_mounts."""
+        submounts = sum(mnt.get('size_available', 0) for mnt in self.find_ansible_submounts(path))
         mount = self.find_ansible_mount(path)
         try:
-            return mount['size_available']
+            return mount['size_available'] + submounts
         except KeyError:
             raise OpenShiftCheckException(
                 'Unable to retrieve disk availability for "{path}".\n'
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index 85a922f86..5beb20503 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,5 +1,7 @@
 """Check that required Docker images are available."""
 
+from pipes import quote
+from ansible.module_utils import six
 from openshift_checks import OpenShiftCheck
 from openshift_checks.mixins import DockerHostMixin
 
@@ -32,6 +34,46 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
     # we use python-docker-py to check local docker for images, and skopeo
     # to look for images available remotely without waiting to pull them.
     dependencies = ["python-docker-py", "skopeo"]
+    # command for checking if remote registries have an image, without docker pull
+    skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}"
+    skopeo_example_command = "skopeo inspect [--tls-verify=false] [--creds=<user>:<pass>] docker://<registry>/<image>"
+
+    def __init__(self, *args, **kwargs):
+        super(DockerImageAvailability, self).__init__(*args, **kwargs)
+
+        self.registries = dict(
+            # set of registries that need to be checked insecurely (note: not accounting for CIDR entries)
+            insecure=set(self.ensure_list("openshift_docker_insecure_registries")),
+            # set of registries that should never be queried even if given in the image
+            blocked=set(self.ensure_list("openshift_docker_blocked_registries")),
+        )
+
+        # ordered list of registries (according to inventory vars) that docker will try for unscoped images
+        regs = self.ensure_list("openshift_docker_additional_registries")
+        # currently one of these registries is added whether the user wants it or not.
+        deployment_type = self.get_var("openshift_deployment_type")
+        if deployment_type == "origin" and "docker.io" not in regs:
+            regs.append("docker.io")
+        elif deployment_type == 'openshift-enterprise' and "registry.access.redhat.com" not in regs:
+            regs.append("registry.access.redhat.com")
+        self.registries["configured"] = regs
+
+        # for the oreg_url registry there may be credentials specified
+        components = self.get_var("oreg_url", default="").split('/')
+        self.registries["oreg"] = "" if len(components) < 3 else components[0]
+
+        # Retrieve and template registry credentials, if provided
+        self.skopeo_command_creds = ""
+        oreg_auth_user = self.get_var('oreg_auth_user', default='')
+        oreg_auth_password = self.get_var('oreg_auth_password', default='')
+        if oreg_auth_user != '' and oreg_auth_password != '':
+            if self._templar is not None:
+                oreg_auth_user = self._templar.template(oreg_auth_user)
+                oreg_auth_password = self._templar.template(oreg_auth_password)
+            self.skopeo_command_creds = "--creds={}:{}".format(quote(oreg_auth_user), quote(oreg_auth_password))
+
+        # record whether we could reach a registry or not (and remember results)
+        self.reachable_registries = {}
 
     def is_active(self):
         """Skip hosts with unsupported deployment types."""
@@ -55,21 +97,28 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         if not missing_images:
             return {}
 
-        registries = self.known_docker_registries()
-        if not registries:
-            return {"failed": True, "msg": "Unable to retrieve any docker registries."}
-
-        available_images = self.available_images(missing_images, registries)
+        available_images = self.available_images(missing_images)
         unavailable_images = set(missing_images) - set(available_images)
 
         if unavailable_images:
-            return {
-                "failed": True,
-                "msg": (
-                    "One or more required Docker images are not available:\n    {}\n"
-                    "Configured registries: {}"
-                ).format(",\n    ".join(sorted(unavailable_images)), ", ".join(registries)),
-            }
+            unreachable = [reg for reg, reachable in self.reachable_registries.items() if not reachable]
+            unreachable_msg = "Failed connecting to: {}\n".format(", ".join(unreachable))
+            blocked_msg = "Blocked registries: {}\n".format(", ".join(self.registries["blocked"]))
+            msg = (
+                "One or more required container images are not available:\n    {missing}\n"
+                "Checked with: {cmd}\n"
+                "Default registries searched: {registries}\n"
+                "{blocked}"
+                "{unreachable}"
+            ).format(
+                missing=",\n    ".join(sorted(unavailable_images)),
+                cmd=self.skopeo_example_command,
+                registries=", ".join(self.registries["configured"]),
+                blocked=blocked_msg if self.registries["blocked"] else "",
+                unreachable=unreachable_msg if unreachable else "",
+            )
+
+            return dict(failed=True, msg=msg)
 
         return {}
 
@@ -95,13 +144,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         # containerized etcd may not have openshift_image_tag, see bz 1466622
         image_tag = self.get_var("openshift_image_tag", default="latest")
         image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
-        if not image_info:
-            return required
 
         # template for images that run on top of OpenShift
         image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
         image_url = self.get_var("oreg_url", default="") or image_url
-        if 'nodes' in host_groups:
+        if 'oo_nodes_to_config' in host_groups:
             for suffix in NODE_IMAGE_SUFFIXES:
                 required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
             # The registry-console is for some reason not prefixed with ose- like the other components.
@@ -112,65 +159,87 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         # images for containerized components
         if self.get_var("openshift", "common", "is_containerized"):
             components = set()
-            if 'nodes' in host_groups:
+            if 'oo_nodes_to_config' in host_groups:
                 components.update(["node", "openvswitch"])
-            if 'masters' in host_groups:  # name is "origin" or "ose"
+            if 'oo_masters_to_config' in host_groups:  # name is "origin" or "ose"
                 components.add(image_info["name"])
             for component in components:
                 required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag))
-            if 'etcd' in host_groups:  # special case, note it is the same for origin/enterprise
+            if 'oo_etcd_to_config' in host_groups:  # special case, note it is the same for origin/enterprise
                 required.add("registry.access.redhat.com/rhel7/etcd")  # and no image tag
 
         return required
 
     def local_images(self, images):
         """Filter a list of images and return those available locally."""
-        return [
-            image for image in images
-            if self.is_image_local(image)
-        ]
+        found_images = []
+        for image in images:
+            # docker could have the image name as-is or prefixed with any registry
+            imglist = [image] + [reg + "/" + image for reg in self.registries["configured"]]
+            if self.is_image_local(imglist):
+                found_images.append(image)
+        return found_images
 
     def is_image_local(self, image):
         """Check if image is already in local docker index."""
         result = self.execute_module("docker_image_facts", {"name": image})
-        if result.get("failed", False):
-            return False
-
-        return bool(result.get("images", []))
-
-    def known_docker_registries(self):
-        """Build a list of docker registries available according to inventory vars."""
-        docker_facts = self.get_var("openshift", "docker")
-        regs = set(docker_facts["additional_registries"])
-
-        deployment_type = self.get_var("openshift_deployment_type")
-        if deployment_type == "origin":
-            regs.update(["docker.io"])
-        elif "enterprise" in deployment_type:
-            regs.update(["registry.access.redhat.com"])
-
-        return list(regs)
-
-    def available_images(self, images, default_registries):
+        return bool(result.get("images")) and not result.get("failed")
+
+    def ensure_list(self, registry_param):
+        """Return the task var as a list."""
+        # https://bugzilla.redhat.com/show_bug.cgi?id=1497274
+        # If the result is a string type, split it on commas into a list of names.
+        # We must do this as using list() on a string will split the string into
+        # its characters. Otherwise cast to a list as was done previously.
+        registry = self.get_var(registry_param, default=[])
+        if not isinstance(registry, six.string_types):
+            return list(registry)
+        return self.normalize(registry)
+
+    def available_images(self, images):
         """Search remotely for images. Returns: list of images found."""
         return [
             image for image in images
-            if self.is_available_skopeo_image(image, default_registries)
+            if self.is_available_skopeo_image(image)
         ]
 
-    def is_available_skopeo_image(self, image, default_registries):
+    def is_available_skopeo_image(self, image):
         """Use Skopeo to determine if required image exists in known registry(s)."""
-        registries = default_registries
-
-        # if image already includes a registry, only use that
+        registries = self.registries["configured"]
+        # If image already includes a registry, only use that.
+        # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g.
+        # registry.access.redhat.com/rhel7 as if the registry were a namespace.
+        # It's not clear that there's any way to distinguish them, but fortunately
+        # the current set of images all look like [registry/]namespace/name[:version].
         if image.count("/") > 1:
             registry, image = image.split("/", 1)
             registries = [registry]
 
         for registry in registries:
-            args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)}
-            result = self.execute_module("command", args)
+            if registry in self.registries["blocked"]:
+                continue  # blocked will never be consulted
+            if registry not in self.reachable_registries:
+                self.reachable_registries[registry] = self.connect_to_registry(registry)
+            if not self.reachable_registries[registry]:
+                continue  # do not keep trying unreachable registries
+
+            args = dict(registry=registry, image=image)
+            args["tls"] = "false" if registry in self.registries["insecure"] else "true"
+            args["creds"] = self.skopeo_command_creds if registry == self.registries["oreg"] else ""
+
+            result = self.execute_module_with_retries("command", {"_raw_params": self.skopeo_command.format(**args)})
             if result.get("rc", 0) == 0 and not result.get("failed"):
                 return True
+            if result.get("rc") == 124:  # RC 124 == timed out; mark unreachable
+                self.reachable_registries[registry] = False
 
         return False
+
+    def connect_to_registry(self, registry):
+        """Use ansible wait_for module to test connectivity from host to registry. Returns bool."""
+        # test a simple TCP connection
+        host, _, port = registry.partition(":")
+        port = port or 443
+        args = dict(host=host, port=port, state="started", timeout=30)
+        result = self.execute_module("wait_for", args)
+        return result.get("rc", 0) == 0 and not result.get("failed")
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index 0558ddf14..6808d8b2f 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -14,7 +14,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
     """
 
     name = "docker_storage"
-    tags = ["pre-install", "health", "preflight"]
+    tags = ["health", "preflight"]
 
     dependencies = ["python-docker-py"]
     storage_drivers = ["devicemapper", "overlay", "overlay2"]
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
index b4c8957e9..8b20ccb49 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -12,7 +12,7 @@ class EtcdTraffic(OpenShiftCheck):
     def is_active(self):
         """Skip hosts that do not have etcd in their group names."""
         group_names = self.get_var("group_names", default=[])
-        valid_group_names = "etcd" in group_names
+        valid_group_names = "oo_etcd_to_config" in group_names
 
         version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
         valid_version = version in ((3, 4), (3, 5))
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
index e5d93ff3f..3d75da6f9 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -15,8 +15,12 @@ class EtcdVolume(OpenShiftCheck):
     etcd_mount_path = "/var/lib/etcd"
 
     def is_active(self):
-        etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or []
-        is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts
+        etcd_hosts = (
+            self.get_var("groups", "oo_etcd_to_config", default=[]) or
+            self.get_var("groups", "oo_masters_to_config", default=[]) or
+            []
+        )
+        is_etcd_host = self.get_var("ansible_host") in etcd_hosts
         return super(EtcdVolume, self).is_active() and is_etcd_host
 
     def run(self):
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 7fc843fd7..986a01f38 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -72,7 +72,7 @@ class Elasticsearch(LoggingCheck):
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self.exec_oc(get_master_cmd, [])
+            master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
@@ -113,7 +113,7 @@ class Elasticsearch(LoggingCheck):
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self.exec_oc(node_cmd, [])
+        cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
@@ -142,7 +142,7 @@ class Elasticsearch(LoggingCheck):
         errors = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self.exec_oc(cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
@@ -171,7 +171,7 @@ class Elasticsearch(LoggingCheck):
         errors = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self.exec_oc(df_cmd, [])
+            disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
index d783e6760..e93cc9028 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -46,7 +46,7 @@ class FluentdConfig(LoggingCheck):
         # if check is running on a master, retrieve all running pods
         # and check any pod's container for the env var "USE_JOURNAL"
         group_names = self.get_var("group_names")
-        if "masters" in group_names:
+        if "oo_masters_to_config" in group_names:
             use_journald = self.check_fluentd_env_var()
 
         docker_info = self.execute_module("docker_info", {})
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index ecd8adb64..05ba73ca1 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -30,14 +30,6 @@ class LoggingCheck(OpenShiftCheck):
         logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
         return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
 
-    def is_first_master(self):
-        """Determine if running on first master. Returns: bool"""
-        # Note: It would be nice to use membership in oo_first_master group, however for now it
-        # seems best to avoid requiring that setup and just check this is the first master.
-        hostname = self.get_var("ansible_ssh_host") or [None]
-        masters = self.get_var("groups", "masters", default=None) or [None]
-        return masters[0] == hostname
-
     def run(self):
         return {}
 
@@ -78,7 +70,7 @@ class LoggingCheck(OpenShiftCheck):
         """Returns the namespace in which logging is configured to deploy."""
         return self.get_var("openshift_logging_namespace", default="logging")
 
-    def exec_oc(self, cmd_str="", extra_args=None):
+    def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
         """
         Execute an 'oc' command in the remote host.
         Returns: output of command and namespace,
@@ -92,7 +84,7 @@ class LoggingCheck(OpenShiftCheck):
             "extra_args": list(extra_args) if extra_args else [],
         }
 
-        result = self.execute_module("ocutil", args)
+        result = self.execute_module("ocutil", args, save_as_name=save_as_name)
         if result.get("failed"):
             if result['result'] == '[Errno 2] No such file or directory':
                 raise CouldNotUseOc(
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
index d781db649..cacdf4213 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -104,7 +104,7 @@ class LoggingIndexTime(LoggingCheck):
             "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
         )
         exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
-        result = self.exec_oc(exec_cmd, [])
+        result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
 
         try:
             count = json.loads(result)["count"]
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index 765ba072d..e7a8ec976 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -14,9 +14,9 @@ class MemoryAvailability(OpenShiftCheck):
     # Values taken from the official installation documentation:
     # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
     recommended_memory_bytes = {
-        "masters": 16 * GIB,
-        "nodes": 8 * GIB,
-        "etcd": 8 * GIB,
+        "oo_masters_to_config": 16 * GIB,
+        "oo_nodes_to_config": 8 * GIB,
+        "oo_etcd_to_config": 8 * GIB,
     }
     # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
     memtotal_adjustment = 1 * GIB
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index e9bae60a3..cfbdea303 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -21,9 +21,11 @@ class DockerHostMixin(object):
 
     def is_active(self):
         """Only run on hosts that depend on Docker."""
-        is_containerized = self.get_var("openshift", "common", "is_containerized")
-        is_node = "nodes" in self.get_var("group_names", default=[])
-        return super(DockerHostMixin, self).is_active() and (is_containerized or is_node)
+        group_names = set(self.get_var("group_names", default=[]))
+        needs_docker = set(["oo_nodes_to_config"])
+        if self.get_var("openshift.common.is_containerized"):
+            needs_docker.update(["oo_masters_to_config", "oo_etcd_to_config"])
+        return super(DockerHostMixin, self).is_active() and bool(group_names.intersection(needs_docker))
 
     def ensure_dependencies(self):
         """
@@ -36,7 +38,7 @@ class DockerHostMixin(object):
 
         # NOTE: we would use the "package" module but it's actually an action plugin
         # and it's not clear how to invoke one of those. This is about the same anyway:
-        result = self.execute_module(
+        result = self.execute_module_with_retries(
             self.get_var("ansible_pkg_mgr", default="yum"),
             {"name": self.dependencies, "state": "present"},
         )
@@ -49,5 +51,4 @@ class DockerHostMixin(object):
                 "    {deps}\n{msg}"
             ).format(deps=',\n    '.join(self.dependencies), msg=msg)
         failed = result.get("failed", False) or result.get("rc", 0) != 0
-        self.changed = result.get("changed", False)
         return msg, failed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
index 363c12def..416805c4d 100644
--- a/roles/openshift_health_checker/openshift_checks/ovs_version.py
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -24,7 +24,7 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
     def is_active(self):
         """Skip hosts that do not have package requirements."""
         group_names = self.get_var("group_names", default=[])
-        master_or_node = 'masters' in group_names or 'nodes' in group_names
+        master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names
         return super(OvsVersion, self).is_active() and master_or_node
 
     def run(self):
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index a86180b00..090e438ff 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -20,13 +20,13 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
 
         packages = set()
 
-        if "masters" in group_names:
+        if "oo_masters_to_config" in group_names:
             packages.update(self.master_packages(rpm_prefix))
-        if "nodes" in group_names:
+        if "oo_nodes_to_config" in group_names:
             packages.update(self.node_packages(rpm_prefix))
 
         args = {"packages": sorted(set(packages))}
-        return self.execute_module("check_yum_update", args)
+        return self.execute_module_with_retries("check_yum_update", args)
 
     @staticmethod
     def master_packages(rpm_prefix):
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index 1e9aecbe0..8464e8a5e 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -11,4 +11,4 @@ class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
 
     def run(self):
         args = {"packages": []}
-        return self.execute_module("check_yum_update", args)
+        return self.execute_module_with_retries("check_yum_update", args)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 8b780114f..2f09b22fc 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -36,7 +36,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
     def is_active(self):
         """Skip hosts that do not have package requirements."""
         group_names = self.get_var("group_names", default=[])
-        master_or_node = 'masters' in group_names or 'nodes' in group_names
+        master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names
         return super(PackageVersion, self).is_active() and master_or_node
 
     def run(self):
@@ -46,6 +46,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
         check_multi_minor_release = deployment_type in ['openshift-enterprise']
 
         args = {
+            "package_mgr": self.get_var("ansible_pkg_mgr"),
             "package_list": [
                 {
                     "name": "openvswitch",
@@ -75,7 +76,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
             ],
         }
 
-        return self.execute_module("aos_version", args)
+        return self.execute_module_with_retries("aos_version", args)
 
     def get_required_ovs_version(self):
         """Return the correct Open vSwitch version(s) for the current OpenShift version."""