Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--  roles/openshift_health_checker/openshift_checks/__init__.py  351
-rw-r--r--  roles/openshift_health_checker/openshift_checks/diagnostics.py  62
-rw-r--r--  roles/openshift_health_checker/openshift_checks/disk_availability.py  173
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_image_availability.py  372
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_storage.py  276
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py  72
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_traffic.py  44
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_volume.py  47
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/__init__.py  0
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/curator.py  43
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py  212
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd.py  154
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py  131
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/kibana.py  226
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging.py  101
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py  129
-rw-r--r--  roles/openshift_health_checker/openshift_checks/memory_availability.py  44
-rw-r--r--  roles/openshift_health_checker/openshift_checks/mixins.py  53
-rw-r--r--  roles/openshift_health_checker/openshift_checks/ovs_version.py  54
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_availability.py  28
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_update.py  8
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_version.py  126
22 files changed, 2419 insertions, 287 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..b7b16e0ea 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -2,32 +2,98 @@
Health checks for OpenShift clusters.
"""
+import json
import operator
import os
+import time
+import collections
from abc import ABCMeta, abstractmethod, abstractproperty
from importlib import import_module
from ansible.module_utils import six
from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin
+from ansible.module_utils.six import string_types
+from ansible.plugins.filter.core import to_bool as ansible_to_bool
class OpenShiftCheckException(Exception):
- """Raised when a check cannot proceed."""
- pass
+ """Raised when a check encounters a failure condition."""
+ def __init__(self, name, msg=None):
+ # msg is for the message the user will see when this is raised.
+ # name is for test code to identify the error without looking at msg text.
+ if msg is None: # for parameter backward compatibility
+ msg = name
+ name = self.__class__.__name__
+ self.name = name
+ super(OpenShiftCheckException, self).__init__(msg)
+
+class OpenShiftCheckExceptionList(OpenShiftCheckException):
+ """A container for multiple errors that may be detected in one check."""
+ def __init__(self, errors):
+ self.errors = errors
+ super(OpenShiftCheckExceptionList, self).__init__(
+ 'OpenShiftCheckExceptionList',
+ '\n'.join(str(msg) for msg in errors)
+ )
+
+ # make iterable
+ def __getitem__(self, index):
+ return self.errors[index]
+
+
+FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename")
+
+
+# pylint: disable=too-many-instance-attributes; all represent significantly different state.
+# Arguably they could be separated into two hashes, one for storing parameters, and one for
+# storing result state; but that smells more like clutter than clarity.
@six.add_metaclass(ABCMeta)
class OpenShiftCheck(object):
- """A base class for defining checks for an OpenShift cluster environment."""
+ """A base class for defining checks for an OpenShift cluster environment.
+
+ Optional init params: method execute_module, dict task_vars, and string tmp
+ execute_module is expected to have a signature compatible with _execute_module
+ from ansible plugins/action/__init__.py, e.g.:
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
+ This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
+ which provides the check's stored task_vars and tmp.
- def __init__(self, execute_module=None, module_executor=None):
- if execute_module is module_executor is None:
- raise TypeError(
- "__init__() takes either execute_module (recommended) "
- "or module_executor (deprecated), none given")
- self.execute_module = execute_module or module_executor
- self.module_executor = self.execute_module
+ Optional init param: want_full_results
+ When True, a check may gather logs, tarballs, etc.; when False, there is no need to spend
+ the time since the results will not be written to an output directory.
+ """
+ # pylint: disable=too-many-arguments
+ def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False,
+ templar=None):
+ # store a method for executing ansible modules from the check
+ self._execute_module = execute_module
+ # the task variables and tmpdir passed into the health checker task
+ self.task_vars = task_vars or {}
+ # We may need to template some task_vars
+ self._templar = templar
+ self.tmp = tmp
+ # a boolean for disabling the gathering of results (files, computations) that won't
+ # actually be recorded/used
+ self.want_full_results = want_full_results
+
+ # mainly for testing purposes; see execute_module_with_retries
+ self._module_retries = 3
+ self._module_retry_interval = 5 # seconds
+
+ # state to be recorded for inspection after the check runs:
+ #
+ # set to True when the check changes the host, for accurate total "changed" count
+ self.changed = False
+ # list of OpenShiftCheckException for check to report (alternative to returning a failed result)
+ self.failures = []
+ # list of FileToSave - files the check specifies to be written locally if so configured
+ self.files_to_save = []
+ # log messages for the check - tuples of (description, msg) where msg is serializable.
+ # These are intended to be a sequential record of what the check observed and determined.
+ self.logs = []
@abstractproperty
def name(self):
@@ -43,14 +109,25 @@ class OpenShiftCheck(object):
"""
return []
- @classmethod
- def is_active(cls, task_vars): # pylint: disable=unused-argument
+ @staticmethod
+ def is_active():
"""Returns true if this check applies to the ansible-playbook run."""
return True
+ def is_first_master(self):
+ """Determine if running on first master. Returns: bool"""
+ masters = self.get_var("groups", "oo_first_master", default=None) or [None]
+ return masters[0] == self.get_var("ansible_host")
+
@abstractmethod
- def run(self, tmp, task_vars):
- """Executes a check, normally implemented as a module."""
+ def run(self):
+ """Executes a check against a host and returns a result hash similar to Ansible modules.
+
+ Actually the direction ahead is to record state in the attributes and
+ not bother building a result hash. Instead, return an empty hash and let
+ the action plugin fill it in. Or raise an OpenShiftCheckException.
+ Returning a hash may become deprecated if it does not prove necessary.
+ """
return {}
@classmethod
@@ -62,34 +139,240 @@ class OpenShiftCheck(object):
for subclass in subclass.subclasses():
yield subclass
+ def register_failure(self, error):
+ """Record in the check that a failure occurred.
+
+ Recorded failures are merged into the result hash for now. They are also saved to the output
+ directory (if provided) as <check>.failures.json and registered as a log entry in <check>.log.json.
+ """
+ # It should be an exception; make it one if not
+ if not isinstance(error, OpenShiftCheckException):
+ error = OpenShiftCheckException(str(error))
+ self.failures.append(error)
+ # duplicate it in the logs so it can be seen in the context of any
+ # information that led to the failure
+ self.register_log("failure: " + error.name, str(error))
+
+ def register_log(self, context, msg):
+ """Record an entry for the check log.
+
+ Notes are intended to serve as context of the whole sequence of what the check observed.
+ They are saved as an ordered list in a local check log file.
+ They are not included in the result or in the Ansible log; they are just for the record.
+ """
+ self.logs.append([context, msg])
+
+ def register_file(self, filename, contents=None, remote_filename=""):
+ """Record a file that a check makes available to be saved individually to output directory.
+
+ Either file contents should be passed in, or a file to be copied from the remote host
+ should be specified. Contents that are not a string are to be serialized as JSON.
+
+ NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning
+ you should avoid using this on huge files (more than say 10M).
+ """
+ if contents is None and not remote_filename:
+ raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.")
+ self.files_to_save.append(FileToSave(filename, contents, remote_filename))
+
+ def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True):
+ """Invoke an Ansible module from a check.
+
+ Invoke stored _execute_module, normally copied from the action
+ plugin, with its params and the task_vars and tmp given at
+ check initialization. No positional parameters beyond these
+ are specified. If it's necessary to specify any of the other
+ parameters to _execute_module then that should just be invoked
+ directly (with awareness of changes in method signature per
+ Ansible version).
+
+ So e.g. check.execute_module("foo", dict(arg1=...))
+
+ save_as_name specifies a file name for saving the result to an output directory,
+ if needed, and is intended to uniquely identify the result of invoking execute_module.
+ If not provided, the module name will be used.
+ If register is set False, then the result won't be registered in logs or files to save.
+
+ Return: result hash from module execution.
+ """
+ if self._execute_module is None:
+ raise NotImplementedError(
+ self.__class__.__name__ +
+ " invoked execute_module without providing the method at initialization."
+ )
+ result = self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+ if result.get("changed"):
+ self.changed = True
+ for output in ["result", "stdout"]:
+ # output is often JSON; attempt to decode
+ try:
+ result[output + "_json"] = json.loads(result[output])
+ except (KeyError, ValueError):
+ pass
+
+ if register:
+ self.register_log("execute_module: " + module_name, result)
+ self.register_file(save_as_name or module_name + ".json", result)
+ return result
+
+ def execute_module_with_retries(self, module_name, module_args):
+ """Run execute_module and retry on failure."""
+ result = {}
+ tries = 0
+ while True:
+ res = self.execute_module(module_name, module_args)
+ if tries > self._module_retries or not res.get("failed"):
+ result.update(res)
+ return result
+ result["last_failed"] = res
+ tries += 1
+ time.sleep(self._module_retry_interval)
+
+ def get_var(self, *keys, **kwargs):
+ """Get deeply nested values from task_vars.
+
+ Ansible task_vars structures are Python dicts, often mapping strings to
+ other dicts. This helper makes it easier to get a nested value, raising
+ OpenShiftCheckException when a key is not found.
+
+ Keyword args:
+ default:
+ On missing key, return this as default value instead of raising exception.
+ convert:
+ Supply a function to apply to normalize the value before returning it.
+ None is the default (return as-is).
+ This function should raise ValueError if the user has provided a value
+ that cannot be converted, or OpenShiftCheckException if some other
+ problem needs to be described to the user.
+ """
+ if len(keys) == 1:
+ keys = keys[0].split(".")
+
+ try:
+ value = reduce(operator.getitem, keys, self.task_vars)
+ except (KeyError, TypeError):
+ if "default" not in kwargs:
+ raise OpenShiftCheckException(
+ "This check expects the '{}' inventory variable to be defined\n"
+ "in order to proceed, but it is undefined. There may be a bug\n"
+ "in Ansible, the checks, or their dependencies."
+ "".format(".".join(map(str, keys)))
+ )
+ value = kwargs["default"]
+
+ convert = kwargs.get("convert", None)
+ try:
+ if convert is None:
+ return value
+ elif convert is bool: # interpret bool as Ansible does, instead of python truthiness
+ return ansible_to_bool(value)
+ else:
+ return convert(value)
+
+ except ValueError as error: # user error in specifying value
+ raise OpenShiftCheckException(
+ 'Cannot convert inventory variable to expected type:\n'
+ ' "{var}={value}"\n'
+ '{error}'.format(var=".".join(keys), value=value, error=error)
+ )
+
+ except OpenShiftCheckException: # some other check-specific problem
+ raise
+
+ except Exception as error: # probably a bug in the function
+ raise OpenShiftCheckException(
+ 'There is a bug in this check. While trying to convert variable \n'
+ ' "{var}={value}"\n'
+ 'the given converter cannot be used or failed unexpectedly:\n'
+ '{type}: {error}'.format(
+ var=".".join(keys),
+ value=value,
+ type=error.__class__.__name__,
+ error=error
+ ))
+
+ @staticmethod
+ def normalize(name_list):
+ """Return a clean list of names.
+
+ The input may be a comma-separated string or a sequence. Leading and
+ trailing whitespace characters are removed. Empty items are discarded.
+ """
+ if isinstance(name_list, string_types):
+ name_list = name_list.split(',')
+ return [name.strip() for name in name_list if name.strip()]
+
+ @staticmethod
+ def get_major_minor_version(openshift_image_tag):
+ """Parse and return the deployed version of OpenShift as a tuple."""
+ if openshift_image_tag and openshift_image_tag[0] == 'v':
+ openshift_image_tag = openshift_image_tag[1:]
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ components = openshift_image_tag.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+ if components[0] in openshift_major_release_version:
+ components[0] = openshift_major_release_version[components[0]]
+
+ components = tuple(int(x) for x in components[:2])
+ return components
+
+ def find_ansible_mount(self, path):
+ """Return the mount point for path from ansible_mounts."""
+
+ # reorganize list of mounts into dict by path
+ mount_for_path = {
+ mount['mount']: mount
+ for mount
+ in self.get_var('ansible_mounts')
+ }
+
+ # NOTE: including base cases '/' and '' to ensure the loop ends
+ mount_targets = set(mount_for_path.keys()) | {'/', ''}
+ mount_point = path
+ while mount_point not in mount_targets:
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ mount = mount_for_path[mount_point]
+ self.register_log("mount point for " + path, mount)
+ return mount
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
+ raise OpenShiftCheckException(
+ 'Unable to determine mount point for path "{}".\n'
+ 'Known mount points: {}.'.format(path, known_mounts or 'none')
+ )
+
LOADER_EXCLUDES = (
"__init__.py",
"mixins.py",
+ "logging.py",
)
-def load_checks():
+def load_checks(path=None, subpkg=""):
"""Dynamically import all check modules for the side effect of registering checks."""
- return [
- import_module(__package__ + "." + name[:-3])
- for name in os.listdir(os.path.dirname(__file__))
- if name.endswith(".py") and name not in LOADER_EXCLUDES
- ]
+ if path is None:
+ path = os.path.dirname(__file__)
+ modules = []
-def get_var(task_vars, *keys, **kwargs):
- """Helper function to get deeply nested values from task_vars.
+ for name in os.listdir(path):
+ if os.path.isdir(os.path.join(path, name)):
+ modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+ continue
- Ansible task_vars structures are Python dicts, often mapping strings to
- other dicts. This helper makes it easier to get a nested value, raising
- OpenShiftCheckException when a key is not found or returning a default value
- provided as a keyword argument.
- """
- try:
- value = reduce(operator.getitem, keys, task_vars)
- except (KeyError, TypeError):
- if "default" in kwargs:
- return kwargs["default"]
- raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
- return value
+ if name.endswith(".py") and name not in LOADER_EXCLUDES:
+ modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+ return modules
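
For orientation before the individual checks below, here is a minimal sketch of how a subclass is expected to use this base class API; the check name, inventory variable, and command are hypothetical, not part of this patch:

from openshift_checks import OpenShiftCheck, OpenShiftCheckException

class ExampleWidgetCheck(OpenShiftCheck):
    """Hypothetical check illustrating get_var, execute_module and register_failure."""

    name = "example_widget"
    tags = ["preflight"]

    def is_active(self):
        # only bother running on the first master, like the diagnostics check below
        return super(ExampleWidgetCheck, self).is_active() and self.is_first_master()

    def run(self):
        # get_var raises OpenShiftCheckException unless a default is given;
        # convert=bool interprets the value the way Ansible does ("no", "False", ...)
        enabled = self.get_var("openshift_check_example_enabled", default=True, convert=bool)
        if not enabled:
            self.register_log("skipped", "disabled by inventory variable")
            return {}

        # execute_module reuses the task_vars/tmp given at init and records the result
        result = self.execute_module("command", {"_raw_params": "widget --status"})
        if result.get("rc", 0) != 0 or result.get("failed"):
            # registered failures are merged into the final result by the action plugin
            self.register_failure(OpenShiftCheckException(
                "WidgetFailed", "widget --status returned a non-zero exit code"))
        return {}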
diff --git a/roles/openshift_health_checker/openshift_checks/diagnostics.py b/roles/openshift_health_checker/openshift_checks/diagnostics.py
new file mode 100644
index 000000000..1cfdc1129
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/diagnostics.py
@@ -0,0 +1,62 @@
+"""
+A check to run relevant diagnostics via `oc adm diagnostics`.
+"""
+
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+DIAGNOSTIC_LIST = (
+ "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles "
+ "ClusterRouter DiagnosticPod NetworkCheck"
+).split()
+
+
+class DiagnosticCheck(OpenShiftCheck):
+ """A check to run relevant diagnostics via `oc adm diagnostics`."""
+
+ name = "diagnostics"
+ tags = ["health"]
+
+ def is_active(self):
+ return super(DiagnosticCheck, self).is_active() and self.is_first_master()
+
+ def run(self):
+ if self.exec_diagnostic("ConfigContexts"):
+ # only run the other diagnostics if that one succeeds (otherwise, all will fail)
+ diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST)
+ for diagnostic in self.normalize(diagnostics):
+ self.exec_diagnostic(diagnostic)
+ return {}
+
+ def exec_diagnostic(self, diagnostic):
+ """
+ Execute an 'oc adm diagnostics' command on the remote host.
+ Raises OcNotFound or registers OcDiagFailed.
+ Returns True on success or False on failure (non-zero rc).
+ """
+ config_base = self.get_var("openshift.common.config_base")
+ args = {
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": "adm diagnostics",
+ "extra_args": [diagnostic],
+ }
+
+ result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json")
+ self.register_file(diagnostic + ".txt", result['result'])
+ if result.get("failed"):
+ if result['result'] == '[Errno 2] No such file or directory':
+ raise OpenShiftCheckException(
+ "OcNotFound",
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+
+ self.register_failure(OpenShiftCheckException(
+ 'OcDiagFailed',
+ 'The {diag} diagnostic reported an error:\n'
+ '{error}'.format(diag=diagnostic, error=result['result'])
+ ))
+ return False
+ return True
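
A side note on overriding the diagnostic list: normalize() on the base class accepts either a sequence or a comma-separated string, so an inventory value like the one below (made up for illustration) is split and stripped before each diagnostic runs:

# minimal illustration of OpenShiftCheck.normalize on a user-supplied value;
# the inventory value is made up for the example
from openshift_checks import OpenShiftCheck

user_value = "DiagnosticPod, NetworkCheck,"          # comma-separated string from inventory
print(OpenShiftCheck.normalize(user_value))          # ['DiagnosticPod', 'NetworkCheck']
print(OpenShiftCheck.normalize(["ClusterRouter "]))  # ['ClusterRouter']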
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index c2792a0fe..87e6146d4 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -1,9 +1,12 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
-from openshift_checks.mixins import NotContainerizedMixin
+"""Check that there is enough disk space in predefined paths."""
+import tempfile
+import os.path
-class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class DiskAvailability(OpenShiftCheck):
"""Check that recommended disk space is available before a first-time install."""
name = "disk_availability"
@@ -12,54 +15,134 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
# Values taken from the official installation documentation:
# https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
recommended_disk_space_bytes = {
- "masters": 40 * 10**9,
- "nodes": 15 * 10**9,
- "etcd": 20 * 10**9,
+ '/var': {
+ 'oo_masters_to_config': 40 * 10**9,
+ 'oo_nodes_to_config': 15 * 10**9,
+ 'oo_etcd_to_config': 20 * 10**9,
+ },
+ # Used to copy client binaries into,
+ # see roles/openshift_cli/library/openshift_container_binary_sync.py.
+ '/usr/local/bin': {
+ 'oo_masters_to_config': 1 * 10**9,
+ 'oo_nodes_to_config': 1 * 10**9,
+ 'oo_etcd_to_config': 1 * 10**9,
+ },
+ # Used as temporary storage in several cases.
+ tempfile.gettempdir(): {
+ 'oo_masters_to_config': 1 * 10**9,
+ 'oo_nodes_to_config': 1 * 10**9,
+ 'oo_etcd_to_config': 1 * 10**9,
+ },
}
- @classmethod
- def is_active(cls, task_vars):
+ # recommended disk space for each location under an upgrade context
+ recommended_disk_upgrade_bytes = {
+ '/var': {
+ 'oo_masters_to_config': 10 * 10**9,
+ 'oo_nodes_to_config': 5 * 10 ** 9,
+ 'oo_etcd_to_config': 5 * 10 ** 9,
+ },
+ }
+
+ def is_active(self):
"""Skip hosts that do not have recommended disk space requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- has_disk_space_recommendation = bool(set(group_names).intersection(cls.recommended_disk_space_bytes))
- return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
-
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- ansible_mounts = get_var(task_vars, "ansible_mounts")
-
- min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
- free_bytes = self.openshift_available_disk(ansible_mounts)
-
- if free_bytes < min_free_bytes:
- return {
- 'failed': True,
- 'msg': (
- 'Available disk space ({:.1f} GB) for the volume containing '
- '"/var" is below minimum recommended space ({:.1f} GB)'
- ).format(float(free_bytes) / 10**9, float(min_free_bytes) / 10**9)
+ group_names = self.get_var("group_names", default=[])
+ active_groups = set()
+ for recommendation in self.recommended_disk_space_bytes.values():
+ active_groups.update(recommendation.keys())
+ has_disk_space_recommendation = bool(active_groups.intersection(group_names))
+ return super(DiskAvailability, self).is_active() and has_disk_space_recommendation
+
+ def run(self):
+ group_names = self.get_var("group_names")
+ user_config = self.get_var("openshift_check_min_host_disk_gb", default={})
+ try:
+ # For backwards-compatibility, if openshift_check_min_host_disk_gb
+ # is a number, then it overrides the required config for '/var'.
+ number = float(user_config)
+ user_config = {
+ '/var': {
+ 'oo_masters_to_config': number,
+ 'oo_nodes_to_config': number,
+ 'oo_etcd_to_config': number,
+ },
}
+ except TypeError:
+ # If it is not a number, then it should be a nested dict.
+ pass
- return {}
+ self.register_log("recommended thresholds", self.recommended_disk_space_bytes)
+ if user_config:
+ self.register_log("user-configured thresholds", user_config)
+
+ # TODO: as suggested in
+ # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
+ # maybe we could support checking disk availability in paths that are
+ # not part of the official recommendation but present in the user
+ # configuration.
+ for path, recommendation in self.recommended_disk_space_bytes.items():
+ free_bytes = self.free_bytes(path)
+ recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
- @staticmethod
- def openshift_available_disk(ansible_mounts):
- """Determine the available disk space for an OpenShift installation.
+ config = user_config.get(path, {})
+ # NOTE: the user config is in GB, but we compare bytes, thus the
+ # conversion.
+ config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
+ recommended_bytes = config_bytes or recommended_bytes
- ansible_mounts should be a list of dicts like the 'setup' Ansible module
- returns.
- """
- # priority list in descending order
- supported_mnt_paths = ["/var", "/"]
- available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+ # if an "upgrade" context is set, update the minimum disk requirement
+ # as this signifies an in-place upgrade - the node might have the
+ # required total disk space, but some of that space may already be
+ # in use by the existing OpenShift deployment.
+ context = self.get_var("r_openshift_health_checker_playbook_context", default="")
+ if context == "upgrade":
+ recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {})
+ if recommended_upgrade_paths:
+ recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0)
+ for name in group_names)
+ if free_bytes < recommended_bytes:
+ free_gb = float(free_bytes) / 10**9
+ recommended_gb = float(recommended_bytes) / 10**9
+ msg = (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+
+ # warn if check failed under an "upgrade" context
+ # due to limits imposed by the user config
+ if config_bytes and context == "upgrade":
+ msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n'
+ 'due to an existing OpenShift deployment. Please check the value of\n'
+ ' openshift_check_min_host_disk_gb={}\n'
+ 'in your Ansible inventory, and lower the recommended disk space availability\n'
+ 'if necessary for this upgrade.').format(config_bytes)
+
+ self.register_failure(msg)
+
+ return {}
+
+ def find_ansible_submounts(self, path):
+ """Return a list of ansible_mounts that are below the given path."""
+ base = os.path.join(path, "")
+ return [
+ mount
+ for mount in self.get_var("ansible_mounts")
+ if mount["mount"].startswith(base)
+ ]
+
+ def free_bytes(self, path):
+ """Return the size available in path based on ansible_mounts."""
+ submounts = sum(mnt.get('size_available', 0) for mnt in self.find_ansible_submounts(path))
+ mount = self.find_ansible_mount(path)
try:
- for path in supported_mnt_paths:
- if path in available_mnts:
- return available_mnts[path]["size_available"]
+ return mount['size_available'] + submounts
except KeyError:
- pass
-
- paths = ''.join(sorted(available_mnts)) or 'none'
- msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths)
- raise OpenShiftCheckException(msg)
+ raise OpenShiftCheckException(
+ 'Unable to retrieve disk availability for "{path}".\n'
+ 'Ansible facts included a matching mount point for this path:\n'
+ ' {mount}\n'
+ 'however it is missing the size_available field.\n'
+ 'To investigate, you can inspect the output of `ansible -m setup <host>`'
+ ''.format(path=path, mount=mount)
+ )
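
A short sketch of the backwards-compatibility handling above: a bare number for openshift_check_min_host_disk_gb applies to '/var' for every group, while a nested dict sets per-path, per-group thresholds (the values below are illustrative only):

# illustrative only: mirrors the conversion DiskAvailability.run applies to user config
def interpret_min_disk_config(user_config):
    """Return a {path: {group: gigabytes}} dict from either a number or a nested dict."""
    try:
        number = float(user_config)  # raises TypeError for a dict
        return {'/var': {
            'oo_masters_to_config': number,
            'oo_nodes_to_config': number,
            'oo_etcd_to_config': number,
        }}
    except TypeError:
        return user_config

print(interpret_min_disk_config(50))                                    # 50 GB against /var for all groups
print(interpret_min_disk_config({'/var': {'oo_nodes_to_config': 20}}))  # per-path, per-group override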
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..5beb20503 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,179 +1,245 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
-
-
-class DockerImageAvailability(OpenShiftCheck):
+"""Check that required Docker images are available."""
+
+from pipes import quote
+from ansible.module_utils import six
+from openshift_checks import OpenShiftCheck
+from openshift_checks.mixins import DockerHostMixin
+
+
+NODE_IMAGE_SUFFIXES = ["haproxy-router", "docker-registry", "deployer", "pod"]
+DEPLOYMENT_IMAGE_INFO = {
+ "origin": {
+ "namespace": "openshift",
+ "name": "origin",
+ "registry_console_image": "cockpit/kubernetes",
+ },
+ "openshift-enterprise": {
+ "namespace": "openshift3",
+ "name": "ose",
+ "registry_console_image": "registry.access.redhat.com/openshift3/registry-console",
+ },
+}
+
+
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
- This check attempts to ensure that required docker images are
- either present locally, or able to be pulled down from available
- registries defined in a host machine.
+ Determine docker images that an install would require and check that they
+ are either present in the host's docker index, or available for the host to pull
+ with known registries as defined in our inventory file (or defaults).
"""
name = "docker_image_availability"
tags = ["preflight"]
+ # we use python-docker-py to check local docker for images, and skopeo
+ # to look for images available remotely without waiting to pull them.
+ dependencies = ["python-docker-py", "skopeo"]
+ # command for checking if remote registries have an image, without docker pull
+ skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}"
+ skopeo_example_command = "skopeo inspect [--tls-verify=false] [--creds=<user>:<pass>] docker://<registry>/<image>"
+
+ def __init__(self, *args, **kwargs):
+ super(DockerImageAvailability, self).__init__(*args, **kwargs)
+
+ self.registries = dict(
+ # set of registries that need to be checked insecurely (note: not accounting for CIDR entries)
+ insecure=set(self.ensure_list("openshift_docker_insecure_registries")),
+ # set of registries that should never be queried even if given in the image
+ blocked=set(self.ensure_list("openshift_docker_blocked_registries")),
+ )
- skopeo_image = "openshift/openshift-ansible"
-
- # FIXME(juanvallejo): we should consider other possible values of
- # `deployment_type` (the key here). See
- # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
- docker_image_base = {
- "origin": {
- "repo": "openshift",
- "image": "origin",
- },
- "openshift-enterprise": {
- "repo": "openshift3",
- "image": "ose",
- },
- }
-
- def run(self, tmp, task_vars):
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
-
- # exit early if all images were found locally
- if not missing_images:
- return {"changed": False}
-
- msg, failed, changed = self.update_skopeo_image(task_vars)
-
- # exit early if Skopeo update fails
+ # ordered list of registries (according to inventory vars) that docker will try for unscoped images
+ regs = self.ensure_list("openshift_docker_additional_registries")
+ # currently one of these registries is added whether the user wants it or not.
+ deployment_type = self.get_var("openshift_deployment_type")
+ if deployment_type == "origin" and "docker.io" not in regs:
+ regs.append("docker.io")
+ elif deployment_type == 'openshift-enterprise' and "registry.access.redhat.com" not in regs:
+ regs.append("registry.access.redhat.com")
+ self.registries["configured"] = regs
+
+ # for the oreg_url registry there may be credentials specified
+ components = self.get_var("oreg_url", default="").split('/')
+ self.registries["oreg"] = "" if len(components) < 3 else components[0]
+
+ # Retrieve and template registry credentials, if provided
+ self.skopeo_command_creds = ""
+ oreg_auth_user = self.get_var('oreg_auth_user', default='')
+ oreg_auth_password = self.get_var('oreg_auth_password', default='')
+ if oreg_auth_user != '' and oreg_auth_password != '':
+ if self._templar is not None:
+ oreg_auth_user = self._templar.template(oreg_auth_user)
+ oreg_auth_password = self._templar.template(oreg_auth_password)
+ self.skopeo_command_creds = "--creds={}:{}".format(quote(oreg_auth_user), quote(oreg_auth_password))
+
+ # record whether we could reach a registry or not (and remember results)
+ self.reachable_registries = {}
+
+ def is_active(self):
+ """Skip hosts with unsupported deployment types."""
+ deployment_type = self.get_var("openshift_deployment_type")
+ has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO
+
+ return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
+
+ def run(self):
+ msg, failed = self.ensure_dependencies()
if failed:
return {
"failed": True,
- "changed": changed,
- "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+ "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
- registries = self.known_docker_registries(task_vars)
- available_images = self.available_images(missing_images, registries, task_vars)
- unavailable_images = set(missing_images) - set(available_images)
-
- if unavailable_images:
- return {
- "failed": True,
- "msg": (
- "One or more required images are not available: {}.\n"
- "Configured registries: {}"
- ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
- "changed": changed,
- }
+ required_images = self.required_images()
+ missing_images = set(required_images) - set(self.local_images(required_images))
- return {"changed": changed}
-
- def required_images(self, task_vars):
- deployment_type = get_var(task_vars, "deployment_type")
- # FIXME(juanvallejo): we should handle gracefully with a proper error
- # message when given an unexpected value for `deployment_type`.
- image_base_name = self.docker_image_base[deployment_type]
-
- openshift_release = get_var(task_vars, "openshift_release")
- # FIXME(juanvallejo): this variable is not required when the
- # installation is non-containerized. The example inventories have it
- # commented out. We should handle gracefully and with a proper error
- # message when this variable is required and not set.
- openshift_image_tag = get_var(task_vars, "openshift_image_tag")
-
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
-
- if is_containerized:
- images = set(self.containerized_docker_images(image_base_name, openshift_release))
- else:
- images = set(self.rpm_docker_images(image_base_name, openshift_release))
-
- # append images with qualified image tags to our list of required images.
- # these are images with a (v0.0.0.0) tag, rather than a standard release
- # format tag (v0.0). We want to check this set in both containerized and
- # non-containerized installations.
- images.update(
- self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
- )
+ # exit early if all images were found locally
+ if not missing_images:
+ return {}
- return images
+ available_images = self.available_images(missing_images)
+ unavailable_images = set(missing_images) - set(available_images)
- def local_images(self, images, task_vars):
+ if unavailable_images:
+ unreachable = [reg for reg, reachable in self.reachable_registries.items() if not reachable]
+ unreachable_msg = "Failed connecting to: {}\n".format(", ".join(unreachable))
+ blocked_msg = "Blocked registries: {}\n".format(", ".join(self.registries["blocked"]))
+ msg = (
+ "One or more required container images are not available:\n {missing}\n"
+ "Checked with: {cmd}\n"
+ "Default registries searched: {registries}\n"
+ "{blocked}"
+ "{unreachable}"
+ ).format(
+ missing=",\n ".join(sorted(unavailable_images)),
+ cmd=self.skopeo_example_command,
+ registries=", ".join(self.registries["configured"]),
+ blocked=blocked_msg if self.registries["blocked"] else "",
+ unreachable=unreachable_msg if unreachable else "",
+ )
+
+ return dict(failed=True, msg=msg)
+
+ return {}
+
+ def required_images(self):
+ """
+ Determine which images we expect to need for this host.
+ Returns: a set of required images like 'openshift/origin:v3.6'
+
+ The thorny issue of determining the image names from the variables is under consideration
+ via https://github.com/openshift/openshift-ansible/issues/4415
+
+ For now we operate as follows:
+ * For containerized components (master, node, ...) we look at the deployment type and
+ use openshift/origin or openshift3/ose as the base for those component images. The
+ version is openshift_image_tag as determined by the openshift_version role.
+ * For OpenShift-managed infrastructure (router, registry...) we use oreg_url if
+ it is defined; otherwise we again use the base that depends on the deployment type.
+ The registry is not included in the constructed image names; it may appear in oreg_url or in the etcd image.
+ """
+ required = set()
+ deployment_type = self.get_var("openshift_deployment_type")
+ host_groups = self.get_var("group_names")
+ # containerized etcd may not have openshift_image_tag, see bz 1466622
+ image_tag = self.get_var("openshift_image_tag", default="latest")
+ image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
+
+ # template for images that run on top of OpenShift
+ image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
+ image_url = self.get_var("oreg_url", default="") or image_url
+ if 'oo_nodes_to_config' in host_groups:
+ for suffix in NODE_IMAGE_SUFFIXES:
+ required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
+ # The registry-console is for some reason not prefixed with ose- like the other components.
+ # Nor is it versioned the same, so just look for latest.
+ # Also a completely different name is used for Origin.
+ required.add(image_info["registry_console_image"])
+
+ # images for containerized components
+ if self.get_var("openshift", "common", "is_containerized"):
+ components = set()
+ if 'oo_nodes_to_config' in host_groups:
+ components.update(["node", "openvswitch"])
+ if 'oo_masters_to_config' in host_groups: # name is "origin" or "ose"
+ components.add(image_info["name"])
+ for component in components:
+ required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag))
+ if 'oo_etcd_to_config' in host_groups: # special case, note it is the same for origin/enterprise
+ required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag
+
+ return required
+
+ def local_images(self, images):
"""Filter a list of images and return those available locally."""
+ found_images = []
+ for image in images:
+ # docker could have the image name as-is or prefixed with any registry
+ imglist = [image] + [reg + "/" + image for reg in self.registries["configured"]]
+ if self.is_image_local(imglist):
+ found_images.append(image)
+ return found_images
+
+ def is_image_local(self, image):
+ """Check if image is already in local docker index."""
+ result = self.execute_module("docker_image_facts", {"name": image})
+ return bool(result.get("images")) and not result.get("failed")
+
+ def ensure_list(self, registry_param):
+ """Return the task var as a list."""
+ # https://bugzilla.redhat.com/show_bug.cgi?id=1497274
+ # If the result was a string type, place it into a list. We must do this
+ # as using list() on a string will split the string into its characters.
+ # Otherwise cast to a list as was done previously.
+ registry = self.get_var(registry_param, default=[])
+ if not isinstance(registry, six.string_types):
+ return list(registry)
+ return self.normalize(registry)
+
+ def available_images(self, images):
+ """Search remotely for images. Returns: list of images found."""
return [
image for image in images
- if self.is_image_local(image, task_vars)
+ if self.is_available_skopeo_image(image)
]
- def is_image_local(self, image, task_vars):
- result = self.module_executor("docker_image_facts", {"name": image}, task_vars)
- if result.get("failed", False):
- return False
-
- return bool(result.get("images", []))
+ def is_available_skopeo_image(self, image):
+ """Use Skopeo to determine if required image exists in known registry(s)."""
+ registries = self.registries["configured"]
+ # If image already includes a registry, only use that.
+ # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g.
+ # registry.access.redhat.com/rhel7 as if the registry were a namespace.
+ # It's not clear that there's any way to distinguish them, but fortunately
+ # the current set of images all look like [registry/]namespace/name[:version].
+ if image.count("/") > 1:
+ registry, image = image.split("/", 1)
+ registries = [registry]
- def known_docker_registries(self, task_vars):
- result = self.module_executor("docker_info", {}, task_vars)
-
- if result.get("failed", False):
- return []
-
- # FIXME(juanvallejo): wrong default type, result["info"] is expected to
- # contain a dictionary (see how we call `docker_info.get` below).
- docker_info = result.get("info", "")
- return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
-
- def available_images(self, images, registries, task_vars):
- """Inspect existing images using Skopeo and return all images successfully inspected."""
- return [
- image for image in images
- if self.is_image_available(image, registries, task_vars)
- ]
-
- def is_image_available(self, image, registries, task_vars):
for registry in registries:
- if self.is_available_skopeo_image(image, registry, task_vars):
+ if registry in self.registries["blocked"]:
+ continue # blocked will never be consulted
+ if registry not in self.reachable_registries:
+ self.reachable_registries[registry] = self.connect_to_registry(registry)
+ if not self.reachable_registries[registry]:
+ continue # do not keep trying unreachable registries
+
+ args = dict(registry=registry, image=image)
+ args["tls"] = "false" if registry in self.registries["insecure"] else "true"
+ args["creds"] = self.skopeo_command_creds if registry == self.registries["oreg"] else ""
+
+ result = self.execute_module_with_retries("command", {"_raw_params": self.skopeo_command.format(**args)})
+ if result.get("rc", 0) == 0 and not result.get("failed"):
return True
+ if result.get("rc") == 124: # RC 124 == timed out; mark unreachable
+ self.reachable_registries[registry] = False
return False
- def is_available_skopeo_image(self, image, registry, task_vars):
- """Uses Skopeo to determine if required image exists in a given registry."""
-
- cmd_str = "skopeo inspect docker://{registry}/{image}".format(
- registry=registry,
- image=image,
- )
-
- args = {
- "name": "skopeo_inspect",
- "image": self.skopeo_image,
- "command": cmd_str,
- "detach": False,
- "cleanup": True,
- }
- result = self.module_executor("docker_container", args, task_vars)
- return result.get("failed", False)
-
- def containerized_docker_images(self, base_name, version):
- return [
- "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
- ]
-
- @staticmethod
- def rpm_docker_images(base, version):
- return [
- "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
- ]
-
- @staticmethod
- def qualified_docker_images(image_name, version):
- return [
- "{}-{}:{}".format(image_name, component, version)
- for component in "haproxy-router docker-registry deployer pod".split()
- ]
-
- @staticmethod
- def image_from_base_name(base):
- return "".join([base["repo"], "/", base["image"]])
-
- # ensures that the skopeo docker image exists, and updates it
- # with latest if image was already present locally.
- def update_skopeo_image(self, task_vars):
- result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
- return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
+ def connect_to_registry(self, registry):
+ """Use ansible wait_for module to test connectivity from host to registry. Returns bool."""
+ # test a simple TCP connection
+ host, _, port = registry.partition(":")
+ port = port or 443
+ args = dict(host=host, port=port, state="started", timeout=30)
+ result = self.execute_module("wait_for", args)
+ return result.get("rc", 0) == 0 and not result.get("failed")
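
To make the remote lookup concrete, this is roughly how the skopeo_command template gets rendered for one registry/image pair; the registry and image names here are examples, not values the check itself supplies:

# rough illustration of how DockerImageAvailability fills in skopeo_command;
# the registry/image values are examples only
skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}"

args = dict(
    registry="registry.example.com:5000",      # hypothetical registry listed as insecure
    image="openshift3/ose-pod:v3.6.173.0.5",   # hypothetical required image
    tls="false",                               # because the registry is in the insecure set
    creds="",                                  # only populated for the oreg_url registry
)
print(skopeo_command.format(**args))
# timeout 10 skopeo inspect --tls-verify=false  docker://registry.example.com:5000/openshift3/ose-pod:v3.6.173.0.5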
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
new file mode 100644
index 000000000..6808d8b2f
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -0,0 +1,276 @@
+"""Check Docker storage driver and usage."""
+import json
+import re
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import DockerHostMixin
+
+
+class DockerStorage(DockerHostMixin, OpenShiftCheck):
+ """Check Docker storage driver compatibility.
+
+ This check ensures that Docker is using a supported storage driver,
+ and that loopback is not being used (if using devicemapper).
+ Also that storage usage is not above threshold.
+ """
+
+ name = "docker_storage"
+ tags = ["health", "preflight"]
+
+ dependencies = ["python-docker-py"]
+ storage_drivers = ["devicemapper", "overlay", "overlay2"]
+ max_thinpool_data_usage_percent = 90.0
+ max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
+
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
+
+ def run(self):
+ msg, failed = self.ensure_dependencies()
+ if failed:
+ return {
+ "failed": True,
+ "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
+ }
+
+ # attempt to get the docker info hash from the API
+ docker_info = self.execute_module("docker_info", {})
+ if docker_info.get("failed"):
+ return {"failed": True,
+ "msg": "Failed to query Docker API. Is docker running on this host?"}
+ if not docker_info.get("info"): # this would be very strange
+ return {"failed": True,
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
+
+ # check if the storage driver we saw is valid
+ driver = docker_info.get("Driver", "[NONE]")
+ if driver not in self.storage_drivers:
+ msg = (
+ "Detected unsupported Docker storage driver '{driver}'.\n"
+ "Supported storage drivers are: {drivers}"
+ ).format(driver=driver, drivers=', '.join(self.storage_drivers))
+ return {"failed": True, "msg": msg}
+
+ # driver status info is a list of tuples; convert to dict and validate based on driver
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
+ if driver == "devicemapper":
+ result = self.check_devicemapper_support(driver_status)
+
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status)
+
+ return result
+
+ def check_devicemapper_support(self, driver_status):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status)
+ return result
+
+ def check_dm_usage(self, driver_status):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
+ Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
+ implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
+ devicemapper storage. The LV is "thin" because it does not use all available storage
+ from its VG, instead expanding as needed; so to determine available space, we gather
+ current usage as the Docker API reports for the driver as well as space available for
+ expansion in the pool's VG.
+ Usage within the LV is divided into pools allocated to data and metadata, either of which
+ could run out of space first; so we check both.
+ """
+ vals = dict(
+ vg_free=self.get_vg_free(driver_status.get("Pool Name")),
+ data_used=driver_status.get("Data Space Used"),
+ data_total=driver_status.get("Data Space Total"),
+ metadata_used=driver_status.get("Metadata Space Used"),
+ metadata_total=driver_status.get("Metadata Space Total"),
+ )
+
+ # convert all human-readable strings to bytes
+ for key, value in vals.copy().items():
+ try:
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
+ except ValueError as err: # unlikely to hit this from API info, but just to be safe
+ return {
+ "failed": True,
+ "values": vals,
+ "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
+ }
+
+ # determine the threshold percentages which usage should not exceed
+ for name, default in [("data", self.max_thinpool_data_usage_percent),
+ ("metadata", self.max_thinpool_meta_usage_percent)]:
+ percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default)
+ try:
+ vals[name + "_threshold"] = float(percent)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
+ }
+
+ # test whether the thresholds are exceeded
+ messages = []
+ for name in ["data", "metadata"]:
+ vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
+ vals[name + "_total_bytes"] + vals["vg_free_bytes"])
+ if vals[name + "_pct_used"] > vals[name + "_threshold"]:
+ messages.append(
+ "Docker thinpool {name} usage percentage {pct:.1f} "
+ "is higher than threshold {thresh:.1f}.".format(
+ name=name,
+ pct=vals[name + "_pct_used"],
+ thresh=vals[name + "_threshold"],
+ ))
+ vals["failed"] = True
+
+ vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
+ return vals
+
+ def get_vg_free(self, pool):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
+ match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
+ if not match: # unlikely, but... be clear if we assumed wrong
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{}'.\n"
+ "However this name does not have the expected format of 'vgname-lvname'\n"
+ "so the available storage in the VG cannot be determined.".format(pool)
+ )
+ vg_name = match.groups()[0].replace("--", "-")
+ vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
+ # should return free space like " 12.00g" if the VG exists; empty if it does not
+
+ ret = self.execute_module("command", {"_raw_params": vgs_cmd})
+ if ret.get("failed") or ret.get("rc", 0) != 0:
+ raise OpenShiftCheckException(
+ "Is LVM installed? Failed to run /sbin/vgs "
+ "to determine docker storage usage:\n" + ret.get("msg", "")
+ )
+ size = ret.get("stdout", "").strip()
+ if not size:
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{pool}'.\n"
+ "which we expect to come from local VG '{vg}'.\n"
+ "However, /sbin/vgs did not find this VG. Is Docker for this host"
+ "running and using the storage on the host?".format(pool=pool, vg=vg_name)
+ )
+ return size
+
+ @staticmethod
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
+ units = dict(
+ b=1,
+ k=1024,
+ m=1024**2,
+ g=1024**3,
+ t=1024**4,
+ p=1024**5,
+ )
+ string = string or ""
+ match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit
+ if not match:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ number, unit = match.groups()
+ multiplier = 1 if not unit else units.get(unit.lower())
+ if not multiplier:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info)
+
+ def check_overlay_usage(self, docker_info):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path)
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
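
Two of the parsing helpers above are easiest to follow with concrete inputs; a small standalone illustration follows (the pool name and size string are examples of what docker-storage-setup and the Docker API typically report, not values read from any host):

import re

# get_vg_free: split "vg--name-docker--pool" at the first single hyphen,
# then undouble the escaped hyphens to recover the VG name
pool = "docker--vg-docker--pool"
match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)
print(match.groups()[0].replace("--", "-"))  # docker-vg

# convert_to_bytes: sizes like "12.5 G" from the Docker API, binary units assumed
match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', "12.5 G")
number, unit = match.groups()
print(float(number) * {"g": 1024**3}[unit.lower()])  # 13421772800.0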
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
new file mode 100644
index 000000000..f4296753a
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -0,0 +1,72 @@
+"""
+Check that determines if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+ """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster"""
+
+ name = "etcd_imagedata_size"
+ tags = ["etcd"]
+
+ def run(self):
+ etcd_mountpath = self.find_ansible_mount("/var/lib/etcd")
+ etcd_avail_diskspace = etcd_mountpath["size_available"]
+ etcd_total_diskspace = etcd_mountpath["size_total"]
+
+ etcd_imagedata_size_limit = self.get_var(
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace))
+ )
+
+ etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = self.get_var("openshift", "master", "etcd_hosts")
+
+ config_base = self.get_var("openshift", "common", "config_base")
+
+ cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt")
+ key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key")
+ ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt")
+
+ for etcd_host in list(etcd_hosts):
+ args = {
+ "size_limit_bytes": etcd_imagedata_size_limit,
+ "paths": ["/openshift.io/images", "/openshift.io/imagestreams"],
+ "host": etcd_host,
+ "port": etcd_port,
+ "protocol": "https" if etcd_is_ssl else "http",
+ "version_prefix": "/v2",
+ "allow_redirect": True,
+ "ca_cert": ca_cert,
+ "cert": {
+ "cert": cert,
+ "key": key,
+ },
+ }
+
+ etcdkeysize = self.execute_module("etcdkeysize", args)
+
+ if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
+ msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
+ reason = etcdkeysize.get("msg")
+ if etcdkeysize.get("module_stderr"):
+ reason = etcdkeysize["module_stderr"]
+
+ msg = msg.format(host=etcd_host, reason=reason)
+ return {"failed": True, "msg": msg}
+
+ if etcdkeysize["size_limit_exceeded"]:
+ limit = self._to_gigabytes(etcd_imagedata_size_limit)
+ msg = ("The size of OpenShift image data stored in etcd host "
+ "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+                       "Use the `oadm prune images` command to clean up unused Docker images.")
+ return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
+
+ return {}
+
+ @staticmethod
+ def _to_gigabytes(byte_size):
+ return float(byte_size) / 10.0**9
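
As a quick worked example of the default limit above (half of the space already consumed on the etcd mount) and of the gigabyte conversion in _to_gigabytes(), with made-up disk numbers:

    # Made-up etcd mount: 40 GB total, 30 GB still available, so 10 GB already used.
    size_total = 40 * 10**9
    size_available = 30 * 10**9

    default_limit = int(0.5 * float(size_total - size_available))
    print(default_limit)                   # 5000000000 bytes
    print(float(default_limit) / 10.0**9)  # 5.0, the GB figure the failure message would report
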
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..8b20ccb49
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,44 @@
+"""Check that scans journalctl for log messages that are a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ def is_active(self):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = self.get_var("group_names", default=[])
+ valid_group_names = "oo_etcd_to_config" in group_names
+
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ valid_version = version in ((3, 4), (3, 5))
+
+ return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version
+
+ def run(self):
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {"log_matchers": log_matchers})
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
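
The log_matchers entry above is easiest to read next to the journal lines it targets. A small sketch, using the same regular expressions against invented journal output:

    import re

    start_regexp = r"Starting Etcd Server"
    regexp = r"etcd: sync duration of [^,]+, expected less than 1s"

    # Invented journal lines, only to show what does and does not match.
    lines = [
        "Jul 05 12:00:00 master etcd[1001]: Starting Etcd Server ...",
        "Jul 05 12:00:07 master etcd[1001]: etcd: sync duration of 2.3s, expected less than 1s",
    ]
    print(bool(re.search(start_regexp, lines[0])))  # True: marks the start of the current etcd run
    print(bool(re.search(regexp, lines[1])))        # True: the slow-sync symptom this check reports
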
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
new file mode 100644
index 000000000..3d75da6f9
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -0,0 +1,47 @@
+"""A health check for OpenShift clusters."""
+"""A health check for etcd volume usage on OpenShift cluster hosts."""
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdVolume(OpenShiftCheck):
+ """Ensures etcd storage usage does not exceed a given threshold."""
+
+ name = "etcd_volume"
+ tags = ["etcd", "health"]
+
+ # Default device usage threshold. Value should be in the range [0, 100].
+ default_threshold_percent = 90
+ # Where to find etcd data
+ etcd_mount_path = "/var/lib/etcd"
+
+ def is_active(self):
+ etcd_hosts = (
+ self.get_var("groups", "oo_etcd_to_config", default=[]) or
+ self.get_var("groups", "oo_masters_to_config", default=[]) or
+ []
+ )
+ is_etcd_host = self.get_var("ansible_host") in etcd_hosts
+ return super(EtcdVolume, self).is_active() and is_etcd_host
+
+ def run(self):
+ mount_info = self.find_ansible_mount(self.etcd_mount_path)
+ available = mount_info["size_available"]
+ total = mount_info["size_total"]
+ used = total - available
+
+ threshold = self.get_var(
+ "etcd_device_usage_threshold_percent",
+ default=self.default_threshold_percent
+ )
+
+ used_percent = 100.0 * used / total
+
+ if used_percent > threshold:
+ device = mount_info.get("device", "unknown")
+ mount = mount_info.get("mount", "unknown")
+ msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
+ used_percent, threshold, device, mount
+ )
+ return {"failed": True, "msg": msg}
+
+ return {}
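
A short sketch of the two decisions the check above makes, the host-membership fallback in is_active() and the usage threshold in run(), using invented inventory facts and mount sizes:

    # Invented inventory facts.
    groups = {"oo_etcd_to_config": [], "oo_masters_to_config": ["master1.example.com"]}
    ansible_host = "master1.example.com"

    # Same fallback as is_active(): dedicated etcd hosts if any, otherwise the masters.
    etcd_hosts = groups.get("oo_etcd_to_config") or groups.get("oo_masters_to_config") or []
    print(ansible_host in etcd_hosts)  # True, so the check runs on this host

    # Same arithmetic as run(), against the default 90% threshold.
    total, available = 20 * 10**9, 1 * 10**9
    used_percent = 100.0 * (total - available) / total
    print(used_percent > 90)  # True (95.0%), so this host would fail the check
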
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..b27f97172
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,43 @@
+"""Check for an aggregated logging Curator deployment"""
+
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
+
+
+class Curator(LoggingCheck):
+ """Check for an aggregated logging Curator deployment"""
+
+ name = "curator"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ curator_pods = self.get_pods_for_component("curator")
+ self.check_curator(curator_pods)
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def check_curator(self, pods):
+        """Check to see if curator is up and working. Raises OpenShiftCheckException if not."""
+ if not pods:
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Curator pods for the logging stack,\n"
+ "so nothing will prune Elasticsearch indexes.\n"
+ "Is Curator correctly deployed?"
+ )
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ raise OpenShiftCheckException(
+ "CuratorNotRunning",
+ "The Curator pod is not currently in a running state,\n"
+ "so Elasticsearch indexes may increase without bound."
+ )
+ if len(pods) - len(not_running) > 1:
+ raise OpenShiftCheckException(
+ "TooManyCurators",
+ "There is more than one Curator pod running. This should not normally happen.\n"
+ "Although this doesn't cause any problems, you may want to investigate."
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..986a01f38
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,212 @@
+"""Check for an aggregated logging Elasticsearch deployment"""
+
+import json
+import re
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+ """Check for an aggregated logging Elasticsearch deployment"""
+
+ name = "elasticsearch"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ es_pods = self.get_pods_for_component("es")
+ self.check_elasticsearch(es_pods)
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def check_elasticsearch(self, es_pods):
+ """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+ running_pods, errors = self.running_elasticsearch_pods(es_pods)
+ pods_by_name = {
+ pod['metadata']['name']: pod for pod in running_pods
+ # Filter out pods that are not members of a DC
+ if pod['metadata'].get('labels', {}).get('deploymentconfig')
+ }
+ if not pods_by_name:
+ # nothing running, cannot run the rest of the check
+ errors.append(OpenShiftCheckException(
+ 'NoRunningPods',
+ 'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+ ))
+ raise OpenShiftCheckExceptionList(errors)
+
+ errors += self.check_elasticsearch_masters(pods_by_name)
+ errors += self.check_elasticsearch_node_list(pods_by_name)
+ errors += self.check_es_cluster_health(pods_by_name)
+ errors += self.check_elasticsearch_diskspace(pods_by_name)
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = self.not_running_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running]
+ if not_running:
+ return running_pods, [OpenShiftCheckException(
+ 'PodNotRunning',
+ 'The following Elasticsearch pods are defined but not running:\n'
+ '{pods}'.format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ )]
+ return running_pods, []
+
+ @staticmethod
+ def _build_es_curl_cmd(pod_name, url):
+ base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+ return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
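
The command string produced by _build_es_curl_cmd() is easier to picture with a concrete pod name filled in. The snippet below reuses the same format string with a hypothetical pod name and prints the exec command that would be handed to exec_oc():

    base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
    cmd = base.format(base="/etc/elasticsearch/secret/admin-",
                      name="logging-es-abc123",  # hypothetical pod name
                      url="https://localhost:9200/_cat/master")
    print(cmd)
    # exec logging-es-abc123 -- curl -s --cert /etc/elasticsearch/secret/admin-cert
    #   --key /etc/elasticsearch/secret/admin-key --cacert /etc/elasticsearch/secret/admin-ca
    #   -XGET 'https://localhost:9200/_cat/master'   (printed as a single line; wrapped here)
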
+
+ def check_elasticsearch_masters(self, pods_by_name):
+ """Check that Elasticsearch masters are sane. Returns: list of errors"""
+ es_master_names = set()
+ errors = []
+ for pod_name in pods_by_name.keys():
+ # Compare what each ES node reports as master and compare for split brain
+ get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+ master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
+ master_names = (master_name_str or '').split(' ')
+ if len(master_names) > 1:
+ es_master_names.add(master_names[1])
+ else:
+ errors.append(OpenShiftCheckException(
+ 'NoMasterName',
+                    'Elasticsearch {pod} gave an unexpected response when asked for its master name:\n'
+ ' {response}'.format(pod=pod_name, response=master_name_str)
+ ))
+
+ if not es_master_names:
+ errors.append(OpenShiftCheckException(
+ 'NoMasterFound',
+ 'No logging Elasticsearch masters were found.'
+ ))
+ return errors
+
+ if len(es_master_names) > 1:
+ errors.append(OpenShiftCheckException(
+ 'SplitBrainMasters',
+ 'Found multiple Elasticsearch masters according to the pods:\n'
+ '{master_list}\n'
+ 'This implies that the masters have "split brain" and are not correctly\n'
+ 'replicating data for the logging cluster. Log loss is likely to occur.'
+ .format(master_list='\n'.join(' ' + master for master in es_master_names))
+ ))
+
+ return errors
+
+ def check_elasticsearch_node_list(self, pods_by_name):
+ """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
+
+ if not pods_by_name:
+ return [OpenShiftCheckException(
+ 'MissingComponentPods',
+ 'No logging Elasticsearch pods were found.'
+ )]
+
+ # get ES cluster nodes
+ node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+ cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
+ try:
+ cluster_nodes = json.loads(cluster_node_data)['nodes']
+ except (ValueError, KeyError):
+ return [OpenShiftCheckException(
+ 'MissingNodeList',
+ 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+ cluster_node_data
+ )]
+
+ # Try to match all ES-reported node hosts to known pods.
+ errors = []
+ for node in cluster_nodes.values():
+ # Note that with 1.4/3.4 the pod IP may be used as the master name
+ if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+ for pod_name, pod in pods_by_name.items()):
+ errors.append(OpenShiftCheckException(
+ 'EsPodNodeMismatch',
+ 'The Elasticsearch cluster reports a member node "{node}"\n'
+ 'that does not correspond to any known ES pod.'.format(node=node['host'])
+ ))
+
+ return errors
+
+ def check_es_cluster_health(self, pods_by_name):
+ """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+ errors = []
+ for pod_name in pods_by_name.keys():
+ cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
+ try:
+ health_res = json.loads(cluster_health_data)
+ if not health_res or not health_res.get('status'):
+ raise ValueError()
+ except ValueError:
+ errors.append(OpenShiftCheckException(
+ 'BadEsResponse',
+ 'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+ 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+ ))
+ continue
+
+ if health_res['status'] not in ['green', 'yellow']:
+ errors.append(OpenShiftCheckException(
+ 'EsClusterHealthRed',
+ 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+ ))
+
+ return errors
+
+ def check_elasticsearch_diskspace(self, pods_by_name):
+ """
+ Exec into an ES pod and query the diskspace on the persistent volume.
+ Returns: list of errors
+ """
+ errors = []
+ for pod_name in pods_by_name.keys():
+ df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+ disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
+ lines = disk_output.splitlines()
+ # expecting one header looking like 'IUse% Use%' and one body line
+ body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+ if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+ errors.append(OpenShiftCheckException(
+ 'BadDfResponse',
+ 'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+ 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+ ))
+ continue
+ inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+ inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
+ if int(inode_pct) >= int(inode_pct_thresh):
+ errors.append(OpenShiftCheckException(
+ 'InodeUsageTooHigh',
+ 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(inode_pct),
+ limit=str(inode_pct_thresh),
+ param='openshift_check_efk_es_inode_pct',
+ )))
+ disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
+ if int(disk_pct) >= int(disk_pct_thresh):
+ errors.append(OpenShiftCheckException(
+ 'DiskUsageTooHigh',
+ 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(disk_pct),
+ limit=str(disk_pct_thresh),
+ param='openshift_check_efk_es_storage_pct',
+ )))
+
+ return errors
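
The df parsing in check_elasticsearch_diskspace() expects exactly one header line and one body line. A small sketch of that parsing against invented `df --output=ipcent,pcent` output:

    import re

    body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'

    # Invented output in the shape that `df --output=ipcent,pcent <path>` produces.
    disk_output = "IUse% Use%\n    3%   91%"
    lines = disk_output.splitlines()

    assert len(lines) == 2 and len(lines[0].split()) == 2
    inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
    print(inode_pct, disk_pct)         # 3 91
    print(int(disk_pct) >= int('80'))  # True: above the default openshift_check_efk_es_storage_pct
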
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..3b192a281
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,154 @@
+"""Check for an aggregated logging Fluentd deployment"""
+
+import json
+
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+ """Check for an aggregated logging Fluentd deployment"""
+
+ name = "fluentd"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check the Fluentd deployment and raise an error if any problems are found."""
+
+ fluentd_pods = self.get_pods_for_component("fluentd")
+ self.check_fluentd(fluentd_pods)
+ return {}
+
+ def check_fluentd(self, pods):
+ """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
+
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
+ )
+
+ nodes_by_name = self.get_nodes_by_name()
+ fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+
+ errors = []
+ errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+ errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+ errors += self.check_fluentd_pods_running(pods)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ errors.append(OpenShiftCheckException(
+ 'TooManyFluentdPods',
+ 'There are more Fluentd pods running than nodes labeled.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+                'This may not cause problems with logging, but it likely indicates that something is wrong.'
+
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def get_nodes_by_name(self):
+ """Retrieve all the node definitions. Returns: dict(name: node)"""
+ nodes_json = self.exec_oc("get nodes -o json", [])
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ raise OpenShiftCheckException(
+ "BadOcNodeList",
+ "Could not obtain a list of nodes to validate fluentd.\n"
+ "Output from oc get:\n" + nodes_json
+ )
+ if not nodes or not nodes.get('items'): # also should not happen
+ raise OpenShiftCheckException(
+ "NoNodesDefined",
+ "No nodes appear to be defined according to the API."
+ )
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }
+
+ @staticmethod
+ def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node)"""
+ label, value = node_selector.split('=', 1)
+ fluentd_nodes = {
+ name: node for name, node in nodes_by_name.items()
+ if node['metadata']['labels'].get(label) == value
+ }
+ if not fluentd_nodes:
+ raise OpenShiftCheckException(
+ 'NoNodesLabeled',
+ 'There are no nodes with the fluentd label {label}.\n'
+ 'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+ )
+ return fluentd_nodes
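
To show how the node selector above is applied, a minimal sketch of the split-and-filter step with two invented node records:

    node_selector = 'logging-infra-fluentd=true'
    label, value = node_selector.split('=', 1)

    # Invented node records, trimmed to the fields the filter reads.
    nodes_by_name = {
        "node1.example.com": {"metadata": {"labels": {"logging-infra-fluentd": "true"}}},
        "node2.example.com": {"metadata": {"labels": {}}},
    }
    fluentd_nodes = {
        name: node for name, node in nodes_by_name.items()
        if node['metadata']['labels'].get(label) == value
    }
    print(sorted(fluentd_nodes))  # ['node1.example.com']: only labeled nodes are expected to run fluentd
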
+
+ def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+ """Note if nodes are not labeled as expected. Returns: error list"""
+ intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
+ if not intended_nodes or '--all' in intended_nodes:
+ intended_nodes = nodes_by_name.keys()
+ nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+ if nodes_missing_labels:
+ return [OpenShiftCheckException(
+ 'NodesUnlabeled',
+ 'The following nodes are supposed to be labeled with {label} but are not:\n'
+ ' {nodes}\n'
+ 'Fluentd will not aggregate logs from these nodes.'.format(
+ label=node_selector, nodes=', '.join(nodes_missing_labels)
+ ))]
+
+ return []
+
+ @staticmethod
+ def check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error list"""
+ unmatched_nodes = fluentd_nodes.copy()
+ node_names_by_label = {
+ node['metadata']['labels']['kubernetes.io/hostname']: name
+ for name, node in fluentd_nodes.items()
+ }
+ node_names_by_internal_ip = {
+ address['address']: name
+ for name, node in fluentd_nodes.items()
+ for address in node['status']['addresses']
+ if address['type'] == "InternalIP"
+ }
+ for pod in pods:
+ for name in [
+ pod['spec']['nodeName'],
+ node_names_by_internal_ip.get(pod['spec']['nodeName']),
+ node_names_by_label.get(pod.get('spec', {}).get('host')),
+ ]:
+ unmatched_nodes.pop(name, None)
+ if unmatched_nodes:
+ return [OpenShiftCheckException(
+ 'MissingFluentdPod',
+ 'The following nodes are supposed to have a Fluentd pod but do not:\n'
+ ' {nodes}\n'
+ 'These nodes will not have their logs aggregated.'.format(
+ nodes='\n '.join(unmatched_nodes.keys())
+ ))]
+
+ return []
+
+ def check_fluentd_pods_running(self, pods):
+        """Make sure all fluentd pods are running. Returns: error list"""
+ not_running = super(Fluentd, self).not_running_pods(pods)
+ if not_running:
+ return [OpenShiftCheckException(
+ 'FluentdNotRunning',
+ 'The following Fluentd pods are supposed to be running but are not:\n'
+ ' {pods}\n'
+ 'These pods will not aggregate logs from their nodes.'.format(
+ pods='\n'.join(
+ " {name} ({host})".format(
+ name=pod['metadata']['name'],
+ host=pod['spec'].get('host', 'None')
+ )
+ for pod in not_running
+ )
+ ))]
+
+ return []
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..e93cc9028
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,131 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+ """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+ name = "fluentd_config"
+ tags = ["health"]
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+ try:
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ except ValueError:
+ # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+ return logging_deployed
+
+ return logging_deployed and version < (3, 6)
+
+ def run(self):
+ """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+ config_error = self.check_logging_config()
+ if config_error:
+ msg = ("The following Fluentd logging configuration problem was found:"
+ "\n{}".format(config_error))
+ return {"failed": True, "msg": msg}
+
+ return {}
+
+ def check_logging_config(self):
+ """Ensure that the configured Docker logging driver matches fluentd settings.
+ This means that, at least for now, if the following condition is met:
+
+ openshift_logging_fluentd_use_journal == True
+
+ then the value of the configured Docker logging driver should be "journald".
+ Otherwise, the value of the Docker logging driver should be "json-file".
+ Returns an error string if the above condition is not met, or None otherwise."""
+ use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+ # if check is running on a master, retrieve all running pods
+ # and check any pod's container for the env var "USE_JOURNAL"
+ group_names = self.get_var("group_names")
+ if "oo_masters_to_config" in group_names:
+ use_journald = self.check_fluentd_env_var()
+
+ docker_info = self.execute_module("docker_info", {})
+ try:
+ logging_driver = docker_info["info"]["LoggingDriver"]
+ except KeyError:
+ return "Unable to determine Docker logging driver."
+
+ recommended_logging_driver = "journald"
+ error = None
+
+ # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+ # option as an inventory file variable, or adding the log driver value as part of the
+ # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+ # can be passed to the Docker binary; the only other recommendation that can be made, would be
+ # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+ # individual containers.
+ if use_journald and logging_driver != "journald":
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+ 'This differs from your Docker configuration, which has been set to use "{driver}" '
+ 'as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs\n'
+ 'from your Docker containers.').format(driver=logging_driver)
+ elif not use_journald and logging_driver != "json-file":
+ recommended_logging_driver = "json-file"
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+                     'individual json log files per container.\n'
+                     'This differs from your Docker configuration, which has been set to use '
+                     '"{driver}" as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs\n'
+                     'from your Docker containers.').format(driver=logging_driver)
+
+ if error:
+ error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+ ' openshift_docker_options="--log-driver={driver}"\n\n'
+                      'Alternatively, you can add the following option to your Docker configuration, located in '
+ '"/etc/docker/daemon.json":\n\n'
+ '{{ "log-driver": "{driver}" }}\n\n'
+ 'See https://docs.docker.com/engine/admin/logging/json-file '
+ 'for more information.').format(driver=recommended_logging_driver)
+
+ return error
+
+ def check_fluentd_env_var(self):
+ """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+ running_pods = self.running_fluentd_pods()
+
+ try:
+ pod_containers = running_pods[0]["spec"]["containers"]
+ except KeyError:
+ return "Unable to detect running containers on selected Fluentd pod."
+
+ if not pod_containers:
+ msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+ 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+ raise OpenShiftCheckException(msg)
+
+ pod_env = pod_containers[0].get("env")
+ if not pod_env:
+ msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+ 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+ raise OpenShiftCheckException(msg)
+
+ for env in pod_env:
+ if env["name"] == "USE_JOURNAL":
+ return env.get("value", "false") != "false"
+
+ return False
+
+ def running_fluentd_pods(self):
+ """Return a list of running fluentd pods."""
+ fluentd_pods = self.get_pods_for_component("fluentd")
+
+ running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+ if not running_fluentd_pods:
+ raise OpenShiftCheckException(
+ 'No Fluentd pods were found to be in the "Running" state. '
+ 'At least one Fluentd pod is required in order to perform this check.'
+ )
+
+ return running_fluentd_pods
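
A brief sketch of how check_fluentd_env_var() turns the USE_JOURNAL environment variable into an expected Docker logging driver; the container env list is invented:

    # Invented container environment, shaped like the fields the check reads.
    pod_env = [{"name": "USE_JOURNAL", "value": "true"}]

    use_journald = False
    for env in pod_env:
        if env["name"] == "USE_JOURNAL":
            # Same rule as above: anything other than the literal "false" counts as true.
            use_journald = env.get("value", "false") != "false"

    expected_driver = "journald" if use_journald else "json-file"
    print(use_journald, expected_driver)  # True journald
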
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..3b1cf8baa
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,226 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+ from urllib2 import HTTPError, URLError
+ import urllib2
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+
+
+class Kibana(LoggingCheck):
+ """Module that checks an integrated logging Kibana deployment"""
+
+ name = "kibana"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ kibana_pods = self.get_pods_for_component("kibana")
+ self.check_kibana(kibana_pods)
+ self.check_kibana_route()
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def _verify_url_internal(self, url):
+ """
+ Try to reach a URL from the host.
+        Returns: None on success, or an error message describing the failure.
+ """
+ args = dict(
+ url=url,
+ follow_redirects='none',
+ validate_certs='no', # likely to be signed with internal CA
+ # TODO(lmeyer): give users option to validate certs
+ status_code=302,
+ )
+ result = self.execute_module('uri', args)
+ if result.get('failed'):
+ return result['msg']
+ return None
+
+ @staticmethod
+ def _verify_url_external(url):
+ """
+ Try to reach a URL from ansible control host.
+        Returns: None on success, or an error message describing the failure.
+ """
+ # This actually checks from the ansible control host, which may or may not
+ # really be "external" to the cluster.
+
+ # Disable SSL cert validation to work around internally signed certs
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False # or setting CERT_NONE is refused
+ ctx.verify_mode = ssl.CERT_NONE
+
+ # Verify that the url is returning a valid response
+ try:
+ # We only care if the url connects and responds
+ return_code = urllib2.urlopen(url, context=ctx).getcode()
+ except HTTPError as httperr:
+ return httperr.reason
+ except URLError as urlerr:
+ return str(urlerr)
+
+ # there appears to be no way to prevent urlopen from following redirects
+ if return_code != 200:
+ return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+ return None
+
+ def check_kibana(self, pods):
+ """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
+
+ if not pods:
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Kibana pods deployed, so no access to the logging UI."
+ )
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ raise OpenShiftCheckException(
+ "NoRunningPods",
+ "No Kibana pod is in a running state, so there is no access to the logging UI."
+ )
+ elif not_running:
+ raise OpenShiftCheckException(
+ "PodNotRunning",
+ "The following Kibana pods are not currently in a running state:\n"
+ " {pods}\n"
+                "However, at least one is, so service may not be impacted.".format(
+ pods="\n ".join(pod['metadata']['name'] for pod in not_running)
+ )
+ )
+
+ def _get_kibana_url(self):
+ """
+ Get kibana route or report error.
+ Returns: url
+ """
+
+ # Get logging url
+ get_route = self.exec_oc("get route logging-kibana -o json", [])
+ if not get_route:
+ raise OpenShiftCheckException(
+ 'no_route_exists',
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ )
+
+ try:
+ route = json.loads(get_route)
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ except (ValueError, KeyError):
+ raise OpenShiftCheckException(
+ 'get_route_failed',
+ '"oc get route" returned an unexpected response:\n' + get_route
+ )
+
+ # ingress can be null if there is no router, or empty if not routed
+ if not ingress or not ingress[0]:
+ raise OpenShiftCheckException(
+ 'route_not_accepted',
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ )
+
+ host = route.get("spec", {}).get("host")
+ if not host:
+ raise OpenShiftCheckException(
+ 'route_missing_host',
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ )
+
+ return 'https://{}/'.format(host)
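
A minimal, invented `oc get route logging-kibana -o json` payload that satisfies every branch of _get_kibana_url() above, shown with the same parsing steps:

    import json

    get_route = json.dumps({
        "spec": {"host": "kibana.apps.example.com"},        # hypothetical route hostname
        "status": {"ingress": [{"routerName": "router"}]},  # accepted by at least one router
    })

    route = json.loads(get_route)
    ingress = route["status"]["ingress"]
    host = route.get("spec", {}).get("host")
    assert ingress and ingress[0] and host
    print('https://{}/'.format(host))  # https://kibana.apps.example.com/
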
+
+ def check_kibana_route(self):
+ """
+ Check to see if kibana route is up and working.
+ Raises exception if not.
+ """
+
+ kibana_url = self._get_kibana_url()
+
+ # first, check that kibana is reachable from the master.
+ error = self._verify_url_internal(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ raise OpenShiftCheckException(
+ 'FailedToConnectInternal',
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+ )
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ raise OpenShiftCheckException(
+ 'FailedToResolveInternal',
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'because the hostname does not resolve.\n'
+ 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+ )
+ elif 'Status code was not' in error:
+ raise OpenShiftCheckException(
+ 'WrongReturnCodeInternal',
+ 'A request from this master to the Kibana URL {url}\n'
+ 'did not return the correct status code (302).\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues. The output was:\n'
+ ' {error}'.format(url=kibana_url, error=error)
+ )
+ raise OpenShiftCheckException(
+ 'MiscRouteErrorInternal',
+ 'Error validating the logging Kibana route internally:\n' + error
+ )
+
+ # in production we would like the kibana route to work from outside the
+ # cluster too; but that may not be the case, so allow disabling just this part.
+ if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+ return
+ error = self._verify_url_external(kibana_url)
+
+ if not error:
+ return
+
+ error_fmt = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set the variable:\n'
+ ' openshift_check_efk_kibana_external=False'
+ )
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ msg = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ msg = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+ elif 'Expected success (200)' in error:
+ msg = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+ raise OpenShiftCheckException(
+ 'MiscRouteError',
+ 'Error validating the logging Kibana route externally:\n' + error
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05ba73ca1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,101 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class MissingComponentPods(OpenShiftCheckException):
+ """Raised when a component has no pods in the namespace."""
+ pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+ """Raised when ocutil has a failure running oc."""
+ pass
+
+
+class LoggingCheck(OpenShiftCheck):
+ """Base class for OpenShift aggregated logging component checks"""
+
+ # FIXME: this should not be listed as a check, since it is not meant to be
+ # run by itself.
+
+ name = "logging"
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
+ return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
+
+ def run(self):
+ return {}
+
+ def get_pods_for_component(self, logging_component):
+ """Get all pods for a given component. Returns: list of pods."""
+ pod_output = self.exec_oc(
+ "get pods -l component={} -o json".format(logging_component),
+ [],
+ )
+ try:
+ pods = json.loads(pod_output) # raises ValueError if deserialize fails
+ if not pods or not pods.get('items'): # also a broken response, treat the same
+ raise ValueError()
+ except ValueError:
+ # successful run but non-parsing data generally means there were no pods to be found
+ raise MissingComponentPods(
+ 'There are no "{}" component pods in the "{}" namespace.\n'
+ 'Is logging deployed?'.format(logging_component, self.logging_namespace())
+ )
+
+ return pods['items']
+
+ @staticmethod
+ def not_running_pods(pods):
+ """Returns: list of pods not in a ready and running state"""
+ return [
+ pod for pod in pods
+ if not pod.get("status", {}).get("containerStatuses") or any(
+ container['ready'] is False
+ for container in pod['status']['containerStatuses']
+ ) or not any(
+ condition['type'] == 'Ready' and condition['status'] == 'True'
+ for condition in pod['status'].get('conditions', [])
+ )
+ ]
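
To illustrate what not_running_pods() treats as running, here is a sketch with two invented pod records, one fully ready and one not; running it assumes this role's library is on the Python path:

    from openshift_checks.logging.logging import LoggingCheck

    ready_pod = {
        "metadata": {"name": "logging-es-1"},
        "status": {"containerStatuses": [{"ready": True}],
                   "conditions": [{"type": "Ready", "status": "True"}]},
    }
    unready_pod = {
        "metadata": {"name": "logging-es-2"},
        "status": {"containerStatuses": [{"ready": False}],
                   "conditions": [{"type": "Ready", "status": "False"}]},
    }

    not_running = LoggingCheck.not_running_pods([ready_pod, unready_pod])
    print([pod["metadata"]["name"] for pod in not_running])  # ['logging-es-2']
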
+
+ def logging_namespace(self):
+ """Returns the namespace in which logging is configured to deploy."""
+ return self.get_var("openshift_logging_namespace", default="logging")
+
+ def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
+ """
+        Execute an 'oc' command on the remote host.
+        Returns: the command output as a string,
+        or raises CouldNotUseOc on error.
+ """
+ config_base = self.get_var("openshift", "common", "config_base")
+ args = {
+ "namespace": self.logging_namespace(),
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": cmd_str,
+ "extra_args": list(extra_args) if extra_args else [],
+ }
+
+ result = self.execute_module("ocutil", args, save_as_name=save_as_name)
+ if result.get("failed"):
+ if result['result'] == '[Errno 2] No such file or directory':
+ raise CouldNotUseOc(
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+
+ raise CouldNotUseOc(
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'.format(cmd=args['cmd'], error=result['result'])
+ )
+
+ return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..cacdf4213
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,129 @@
+"""
+Check that logs from pods can be queried within a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+    """Check that pod logs are aggregated and indexed in Elasticsearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ def run(self):
+        """Add a log entry via a unique request to Kibana, then check that it gets indexed in Elasticsearch within the timeout."""
+ try:
+ log_index_timeout = int(
+ self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ raise OpenShiftCheckException(
+ 'InvalidTimeout',
+ 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'
+ )
+
+ running_component_pods = dict()
+
+ # get all component pods
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods = self.get_pods_for_component(component)
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+ raise OpenShiftCheckException(
+ component + 'NoRunningPods',
+                    'No {} pods in the "Running" state were found. '
+ 'At least one pod is required in order to perform this check.'.format(name)
+ )
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+        """Retry an Elasticsearch query every second until the query succeeds, or a defined
+        length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid):
+ if time.time() + interval > deadline:
+ raise OpenShiftCheckException(
+ "NoMatchFound",
+ "expecting match in Elasticsearch for message with uuid {}, "
+ "but no matches were found after {}s.".format(uuid, timeout_secs)
+ )
+ time.sleep(interval)
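
The loop above gives up as soon as the next sleep would overshoot the deadline. A generic sketch of the same polling pattern, detached from the check itself:

    import time

    def poll_until(predicate, timeout_secs, interval=1):
        # Retry until predicate() is true; give up once the next sleep would pass the deadline.
        deadline = time.time() + timeout_secs
        while not predicate():
            if time.time() + interval > deadline:
                raise RuntimeError("no match found within {}s".format(timeout_secs))
            time.sleep(interval)

    attempts = {"n": 0}

    def fake_query():
        attempts["n"] += 1
        return attempts["n"] >= 3  # pretend the document is indexed on the third poll

    poll_until(fake_query, timeout_secs=30)
    print(attempts["n"])  # 3
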
+
+ def curl_kibana_with_uuid(self, kibana_pod):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(exec_cmd, [])
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'kibanaInvalidResponse',
+ 'invalid response returned from Kibana request:\n'
+ 'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+ )
+
+ if error_code != 404:
+ raise OpenShiftCheckException(
+ 'kibanaInvalidReturnCode',
+ 'invalid error code returned from Kibana request.\n'
+ 'Expecting error code "404", but got "{}" instead.'.format(error_code)
+ )
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+ result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
+
+ try:
+ count = json.loads(result)["count"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'esInvalidResponse',
+ 'Invalid response from Elasticsearch query:\n'
+ ' {}\n'
+ 'Response was:\n{}'.format(exec_cmd, result)
+ )
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index 28805dc37..e7a8ec976 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,5 +1,8 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that recommended memory is available."""
+from openshift_checks import OpenShiftCheck
+
+MIB = 2**20
+GIB = 2**30
class MemoryAvailability(OpenShiftCheck):
@@ -11,33 +14,36 @@ class MemoryAvailability(OpenShiftCheck):
# Values taken from the official installation documentation:
# https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
recommended_memory_bytes = {
- "masters": 16 * 10**9,
- "nodes": 8 * 10**9,
- "etcd": 20 * 10**9,
+ "oo_masters_to_config": 16 * GIB,
+ "oo_nodes_to_config": 8 * GIB,
+ "oo_etcd_to_config": 8 * GIB,
}
+ # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
+    # https://access.redhat.com/solutions/3006511 some physical RAM is reserved and not reported in MemTotal
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have recommended memory requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- has_memory_recommendation = bool(set(group_names).intersection(cls.recommended_memory_bytes))
- return super(MemoryAvailability, cls).is_active(task_vars) and has_memory_recommendation
+ group_names = self.get_var("group_names", default=[])
+ has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes))
+ return super(MemoryAvailability, self).is_active() and has_memory_recommendation
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6
+ def run(self):
+ group_names = self.get_var("group_names")
+ total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB
- min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB
+ min_memory_bytes = configured_min or recommended_min
- if total_memory_bytes < min_memory_bytes:
+ if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
return {
'failed': True,
'msg': (
- 'Available memory ({available:.1f} GB) '
- 'below recommended value ({recommended:.1f} GB)'
+ 'Available memory ({available:.1f} GiB) is too far '
+ 'below recommended value ({recommended:.1f} GiB)'
).format(
- available=float(total_memory_bytes) / 10**9,
- recommended=float(min_memory_bytes) / 10**9,
+ available=float(total_memory_bytes) / GIB,
+ recommended=float(min_memory_bytes) / GIB,
),
}
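
A worked example of the MemTotal adjustment above, using a made-up fact for a 16 GiB master (physical RAM is never fully reported in MemTotal):

    MIB = 2**20
    GIB = 2**30

    ansible_memtotal_mb = 15885  # made-up report from a host with 16 GiB installed
    total_memory_bytes = ansible_memtotal_mb * MIB

    recommended_min = 16 * GIB   # oo_masters_to_config
    memtotal_adjustment = 1 * GIB

    print(total_memory_bytes < recommended_min)                        # True: the raw value alone would fail
    print(total_memory_bytes + memtotal_adjustment < recommended_min)  # False: the adjusted value passes
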
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 20d160eaf..cfbdea303 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -1,15 +1,54 @@
-# pylint: disable=missing-docstring,too-few-public-methods
"""
Mixin classes meant to be used with subclasses of OpenShiftCheck.
"""
-from openshift_checks import get_var
-
class NotContainerizedMixin(object):
"""Mixin for checks that are only active when not in containerized mode."""
+ # permanent # pylint: disable=too-few-public-methods
+ # Reason: The mixin is not intended to stand on its own as a class.
+
+ def is_active(self):
+ """Only run on non-containerized hosts."""
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ return super(NotContainerizedMixin, self).is_active() and not is_containerized
+
+
+class DockerHostMixin(object):
+ """Mixin for checks that are only active on hosts that require Docker."""
+
+ dependencies = []
+
+ def is_active(self):
+ """Only run on hosts that depend on Docker."""
+ group_names = set(self.get_var("group_names", default=[]))
+ needs_docker = set(["oo_nodes_to_config"])
+        if self.get_var("openshift", "common", "is_containerized"):
+ needs_docker.update(["oo_masters_to_config", "oo_etcd_to_config"])
+ return super(DockerHostMixin, self).is_active() and bool(group_names.intersection(needs_docker))
+
+ def ensure_dependencies(self):
+ """
+ Ensure that docker-related packages exist, but not on atomic hosts
+ (which would not be able to install but should already have them).
+ Returns: msg, failed
+ """
+ if self.get_var("openshift", "common", "is_atomic"):
+ return "", False
- @classmethod
- def is_active(cls, task_vars):
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+ # NOTE: we would use the "package" module but it's actually an action plugin
+ # and it's not clear how to invoke one of those. This is about the same anyway:
+ result = self.execute_module_with_retries(
+ self.get_var("ansible_pkg_mgr", default="yum"),
+ {"name": self.dependencies, "state": "present"},
+ )
+ msg = result.get("msg", "")
+ if result.get("failed"):
+ if "No package matching" in msg:
+ msg = "Ensure that all required dependencies can be installed via `yum`.\n"
+ msg = (
+ "Unable to install required packages on this host:\n"
+ " {deps}\n{msg}"
+ ).format(deps=',\n '.join(self.dependencies), msg=msg)
+ failed = result.get("failed", False) or result.get("rc", 0) != 0
+ return msg, failed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
new file mode 100644
index 000000000..416805c4d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -0,0 +1,54 @@
+"""
+Ansible module for determining if an installed version of Open vSwitch is incompatible with the
+currently installed version of OpenShift.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import NotContainerizedMixin
+
+
+class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
+    """Check that the installed version of Open vSwitch is compatible with
+    the currently installed version of OpenShift.
+ """
+
+ name = "ovs_version"
+ tags = ["health"]
+
+ openshift_to_ovs_version = {
+ "3.6": ["2.6", "2.7"],
+ "3.5": ["2.6", "2.7"],
+ "3.4": "2.4",
+ }
+
+ def is_active(self):
+ """Skip hosts that do not have package requirements."""
+ group_names = self.get_var("group_names", default=[])
+ master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names
+ return super(OvsVersion, self).is_active() and master_or_node
+
+ def run(self):
+ args = {
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(),
+ },
+ ],
+ }
+ return self.execute_module("rpm_version", args)
+
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version for the current OpenShift version"""
+ openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+
+ if openshift_version_tuple < (3, 5):
+ return self.openshift_to_ovs_version["3.4"]
+
+ openshift_version = ".".join(str(x) for x in openshift_version_tuple)
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
+ if ovs_version:
+            return ovs_version
+
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index a7eb720fd..090e438ff 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -1,5 +1,6 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that required RPM packages are available."""
+
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,26 +10,27 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
name = "package_availability"
tags = ["preflight"]
- @classmethod
- def is_active(cls, task_vars):
- return super(PackageAvailability, cls).is_active(task_vars) and task_vars["ansible_pkg_mgr"] == "yum"
+ def is_active(self):
+ """Run only when yum is the package manager as the code is specific to it."""
+ return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum"
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- group_names = get_var(task_vars, "group_names", default=[])
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ group_names = self.get_var("group_names", default=[])
packages = set()
- if "masters" in group_names:
+ if "oo_masters_to_config" in group_names:
packages.update(self.master_packages(rpm_prefix))
- if "nodes" in group_names:
+ if "oo_nodes_to_config" in group_names:
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module_with_retries("check_yum_update", args)
@staticmethod
def master_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a master install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix),
@@ -36,8 +38,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
"bash-completion",
"cockpit-bridge",
"cockpit-docker",
- "cockpit-kubernetes",
- "cockpit-shell",
+ "cockpit-system",
"cockpit-ws",
"etcd",
"httpd-tools",
@@ -45,6 +46,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
@staticmethod
def node_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a node install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-node".format(rpm_prefix=rpm_prefix),
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index fd0c0a755..8464e8a5e 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -1,14 +1,14 @@
-# pylint: disable=missing-docstring
+"""Check that a yum update would not run into conflicts with available packages."""
from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
- """Check that there are no conflicts in RPM packages."""
+ """Check that a yum update would not run into conflicts with available packages."""
name = "package_update"
tags = ["preflight"]
- def run(self, tmp, task_vars):
+ def run(self):
args = {"packages": []}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module_with_retries("check_yum_update", args)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 682f6bd40..2f09b22fc 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,8 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that available RPM packages match the required versions."""
+
+import re
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,17 +12,116 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
name = "package_version"
tags = ["preflight"]
- @classmethod
- def is_active(cls, task_vars):
+ # NOTE: versions outside those specified are mapped to least/greatest
+ openshift_to_ovs_version = {
+ (3, 4): "2.4",
+ (3, 5): ["2.6", "2.7"],
+ (3, 6): ["2.6", "2.7"],
+ }
+
+ openshift_to_docker_version = {
+ (3, 1): "1.8",
+ (3, 2): "1.10",
+ (3, 3): "1.10",
+ (3, 4): "1.12",
+ (3, 5): "1.12",
+ (3, 6): "1.12",
+ }
+
+ # map major OpenShift release versions across releases to a common major version
+ map_major_release_version = {
+ 1: 3,
+ }
+
+ def is_active(self):
"""Skip hosts that do not have package requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- master_or_node = 'masters' in group_names or 'nodes' in group_names
- return super(PackageVersion, cls).is_active(task_vars) and master_or_node
+ group_names = self.get_var("group_names", default=[])
+ master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names
+ return super(PackageVersion, self).is_active() and master_or_node
+
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ openshift_release = self.get_var("openshift_release", default='')
+ deployment_type = self.get_var("openshift_deployment_type")
+ check_multi_minor_release = deployment_type in ['openshift-enterprise']
- def run(self, tmp, task_vars):
args = {
- "requested_openshift_release": get_var(task_vars, "openshift_release", default=''),
- "openshift_deployment_type": get_var(task_vars, "openshift_deployment_type"),
- "rpm_prefix": get_var(task_vars, "openshift", "common", "service_type"),
+ "package_mgr": self.get_var("ansible_pkg_mgr"),
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(),
+ "check_multi": False,
+ },
+ {
+ "name": "docker",
+ "version": self.get_required_docker_version(),
+ "check_multi": False,
+ },
+ {
+ "name": "{}".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-master".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-node".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ ],
}
- return self.execute_module("aos_version", args, tmp, task_vars)
+
+ return self.execute_module_with_retries("aos_version", args)
+
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
+
+ earliest = min(self.openshift_to_ovs_version)
+ latest = max(self.openshift_to_ovs_version)
+ if openshift_version < earliest:
+ return self.openshift_to_ovs_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_ovs_version[latest]
+
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
+ if not ovs_version:
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
+
+ return ovs_version
+
+ def get_required_docker_version(self):
+ """Return the correct Docker version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
+
+ earliest = min(self.openshift_to_docker_version)
+ latest = max(self.openshift_to_docker_version)
+ if openshift_version < earliest:
+ return self.openshift_to_docker_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_docker_version[latest]
+
+ docker_version = self.openshift_to_docker_version.get(openshift_version)
+ if not docker_version:
+ msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
+
+ return docker_version
+
+ def get_openshift_version_tuple(self):
+ """Return received image tag as a normalized (X, Y) minor version tuple."""
+ version = self.get_var("openshift_image_tag")
+ comps = [int(component) for component in re.findall(r'\d+', version)]
+
+ if len(comps) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(version))
+
+ comps[0] = self.map_major_release_version.get(comps[0], comps[0])
+ return tuple(comps[0:2])
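
A short worked example of the version handling above: normalizing an origin-style image tag into an (X, Y) tuple, and clamping a version newer than the table to the latest known entry. The tag value is hypothetical.

    import re

    map_major_release_version = {1: 3}
    openshift_to_docker_version = {(3, 1): "1.8", (3, 2): "1.10", (3, 3): "1.10",
                                   (3, 4): "1.12", (3, 5): "1.12", (3, 6): "1.12"}

    version = "v1.5.1"  # hypothetical openshift_image_tag
    comps = [int(c) for c in re.findall(r'\d+', version)]
    comps[0] = map_major_release_version.get(comps[0], comps[0])
    print(tuple(comps[0:2]))  # (3, 5)

    requested = (3, 7)        # newer than anything in the table
    latest = max(openshift_to_docker_version)
    if requested > latest:
        requested = latest
    print(openshift_to_docker_version[requested])  # 1.12
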