5 files changed, 249 insertions, 98 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index c2792a0fe..962148cb8 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -27,10 +27,12 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
     def run(self, tmp, task_vars):
         group_names = get_var(task_vars, "group_names")
         ansible_mounts = get_var(task_vars, "ansible_mounts")
-
-        min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
         free_bytes = self.openshift_available_disk(ansible_mounts)
 
+        recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
+        configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9
+        min_free_bytes = configured_min or recommended_min
+
         if free_bytes < min_free_bytes:
             return {
                 'failed': True,
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..4588ed634 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -13,41 +13,55 @@ class DockerImageAvailability(OpenShiftCheck):
     name = "docker_image_availability"
     tags = ["preflight"]
 
-    skopeo_image = "openshift/openshift-ansible"
+    dependencies = ["skopeo", "python-docker-py"]
 
-    # FIXME(juanvallejo): we should consider other possible values of
-    # `deployment_type` (the key here). See
-    # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
-    docker_image_base = {
+    deployment_image_info = {
         "origin": {
-            "repo": "openshift",
-            "image": "origin",
+            "namespace": "openshift",
+            "name": "origin",
         },
         "openshift-enterprise": {
-            "repo": "openshift3",
-            "image": "ose",
+            "namespace": "openshift3",
+            "name": "ose",
         },
     }
 
-    def run(self, tmp, task_vars):
-        required_images = self.required_images(task_vars)
-        missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+    @classmethod
+    def is_active(cls, task_vars):
+        """Skip hosts with unsupported deployment types."""
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        has_valid_deployment_type = deployment_type in cls.deployment_image_info
 
-        # exit early if all images were found locally
-        if not missing_images:
-            return {"changed": False}
+        return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type
 
-        msg, failed, changed = self.update_skopeo_image(task_vars)
+    def run(self, tmp, task_vars):
+        msg, failed, changed = self.ensure_dependencies(task_vars)
 
         # exit early if Skopeo update fails
         if failed:
+            if "No package matching" in msg:
+                msg = "Ensure that all required dependencies can be installed via `yum`.\n"
             return {
                 "failed": True,
                 "changed": changed,
-                "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+                "msg": (
+                    "Unable to update or install required dependency packages on this host;\n"
+                    "These are required in order to check Docker image availability:"
+                    "\n    {deps}\n{msg}"
+                ).format(deps=',\n    '.join(self.dependencies), msg=msg),
             }
 
+        required_images = self.required_images(task_vars)
+        missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+
+        # exit early if all images were found locally
+        if not missing_images:
+            return {"changed": changed}
+
         registries = self.known_docker_registries(task_vars)
+        if not registries:
+            return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+
         available_images = self.available_images(missing_images, registries, task_vars)
         unavailable_images = set(missing_images) - set(available_images)
 
@@ -55,44 +69,60 @@ class DockerImageAvailability(OpenShiftCheck):
             return {
                 "failed": True,
                 "msg": (
-                    "One or more required images are not available: {}.\n"
+                    "One or more required Docker images are not available:\n    {}\n"
                     "Configured registries: {}"
-                ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
+                ).format(",\n    ".join(sorted(unavailable_images)), ", ".join(registries)),
                 "changed": changed,
             }
 
         return {"changed": changed}
 
     def required_images(self, task_vars):
-        deployment_type = get_var(task_vars, "deployment_type")
-        # FIXME(juanvallejo): we should handle gracefully with a proper error
-        # message when given an unexpected value for `deployment_type`.
-        image_base_name = self.docker_image_base[deployment_type]
-
-        openshift_release = get_var(task_vars, "openshift_release")
-        # FIXME(juanvallejo): this variable is not required when the
-        # installation is non-containerized. The example inventories have it
-        # commented out. We should handle gracefully and with a proper error
-        # message when this variable is required and not set.
-        openshift_image_tag = get_var(task_vars, "openshift_image_tag")
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        image_info = self.deployment_image_info[deployment_type]
 
+        openshift_release = get_var(task_vars, "openshift_release", default="latest")
+        openshift_image_tag = get_var(task_vars, "openshift_image_tag")
         is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
 
-        if is_containerized:
-            images = set(self.containerized_docker_images(image_base_name, openshift_release))
-        else:
-            images = set(self.rpm_docker_images(image_base_name, openshift_release))
+        images = set(self.required_docker_images(
+            image_info["namespace"],
+            image_info["name"],
+            ["registry-console"] if "enterprise" in deployment_type else [],  # include enterprise-only image names
+            openshift_release,
+            is_containerized,
+        ))
 
         # append images with qualified image tags to our list of required images.
         # these are images with a (v0.0.0.0) tag, rather than a standard release
         # format tag (v0.0). We want to check this set in both containerized and
         # non-containerized installations.
         images.update(
-            self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
+            self.required_qualified_docker_images(
+                image_info["namespace"],
+                image_info["name"],
+                openshift_image_tag,
+            ),
         )
 
         return images
 
+    @staticmethod
+    def required_docker_images(namespace, name, additional_image_names, version, is_containerized):
+        if is_containerized:
+            return ["{}/{}:{}".format(namespace, name, version)] if name else []
+
+        # include additional non-containerized images specific to the current deployment type
+        return ["{}/{}:{}".format(namespace, img_name, version) for img_name in additional_image_names]
+
+    @staticmethod
+    def required_qualified_docker_images(namespace, name, version):
+        # pylint: disable=invalid-name
+        return [
+            "{}/{}-{}:{}".format(namespace, name, suffix, version)
+            for suffix in ["haproxy-router", "docker-registry", "deployer", "pod"]
+        ]
+
     def local_images(self, images, task_vars):
         """Filter a list of images and return those available locally."""
         return [
@@ -107,31 +137,26 @@ class DockerImageAvailability(OpenShiftCheck):
 
         return bool(result.get("images", []))
 
-    def known_docker_registries(self, task_vars):
-        result = self.module_executor("docker_info", {}, task_vars)
+    @staticmethod
+    def known_docker_registries(task_vars):
+        docker_facts = get_var(task_vars, "openshift", "docker")
+        regs = set(docker_facts["additional_registries"])
 
-        if result.get("failed", False):
-            return []
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        if deployment_type == "origin":
+            regs.update(["docker.io"])
+        elif "enterprise" in deployment_type:
+            regs.update(["registry.access.redhat.com"])
 
-        # FIXME(juanvallejo): wrong default type, result["info"] is expected to
-        # contain a dictionary (see how we call `docker_info.get` below).
-        docker_info = result.get("info", "")
-        return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
+        return list(regs)
 
     def available_images(self, images, registries, task_vars):
         """Inspect existing images using Skopeo and return all images successfully inspected."""
         return [
             image for image in images
-            if self.is_image_available(image, registries, task_vars)
+            if any(self.is_available_skopeo_image(image, registry, task_vars) for registry in registries)
         ]
 
-    def is_image_available(self, image, registries, task_vars):
-        for registry in registries:
-            if self.is_available_skopeo_image(image, registry, task_vars):
-                return True
-
-        return False
-
     def is_available_skopeo_image(self, image, registry, task_vars):
         """Uses Skopeo to determine if required image exists in a given registry."""
 
@@ -140,40 +165,15 @@ class DockerImageAvailability(OpenShiftCheck):
             image=image,
         )
 
-        args = {
-            "name": "skopeo_inspect",
-            "image": self.skopeo_image,
-            "command": cmd_str,
-            "detach": False,
-            "cleanup": True,
-        }
-        result = self.module_executor("docker_container", args, task_vars)
-        return result.get("failed", False)
-
-    def containerized_docker_images(self, base_name, version):
-        return [
-            "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
-        ]
+        args = {"_raw_params": cmd_str}
+        result = self.module_executor("command", args, task_vars)
+        return not result.get("failed", False) and result.get("rc", 0) == 0
 
-    @staticmethod
-    def rpm_docker_images(base, version):
-        return [
-            "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
-        ]
+    # ensures that the skopeo and python-docker-py packages exist
+    # check is skipped on atomic installations
+    def ensure_dependencies(self, task_vars):
+        if get_var(task_vars, "openshift", "common", "is_atomic"):
+            return "", False, False
 
-    @staticmethod
-    def qualified_docker_images(image_name, version):
-        return [
-            "{}-{}:{}".format(image_name, component, version)
-            for component in "haproxy-router docker-registry deployer pod".split()
-        ]
-
-    @staticmethod
-    def image_from_base_name(base):
-        return "".join([base["repo"], "/", base["image"]])
-
-    # ensures that the skopeo docker image exists, and updates it
-    # with latest if image was already present locally.
-    def update_skopeo_image(self, task_vars):
-        result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
-        return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
+        result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars)
+        return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed")
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
new file mode 100644
index 000000000..c04a69765
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -0,0 +1,84 @@
+"""
+Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+    """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster"""
+
+    name = "etcd_imagedata_size"
+    tags = ["etcd"]
+
+    def run(self, tmp, task_vars):
+        etcd_mountpath = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts"))
+        etcd_avail_diskspace = etcd_mountpath["size_available"]
+        etcd_total_diskspace = etcd_mountpath["size_total"]
+
+        etcd_imagedata_size_limit = get_var(task_vars,
+                                            "etcd_max_image_data_size_bytes",
+                                            default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)))
+
+        etcd_is_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False)
+        etcd_port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379)
+        etcd_hosts = get_var(task_vars, "openshift", "master", "etcd_hosts")
+
+        config_base = get_var(task_vars, "openshift", "common", "config_base")
+
+        cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt")
+        key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key")
+        ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt")
+
+        for etcd_host in list(etcd_hosts):
+            args = {
+                "size_limit_bytes": etcd_imagedata_size_limit,
+                "paths": ["/openshift.io/images", "/openshift.io/imagestreams"],
+                "host": etcd_host,
+                "port": etcd_port,
+                "protocol": "https" if etcd_is_ssl else "http",
+                "version_prefix": "/v2",
+                "allow_redirect": True,
+                "ca_cert": ca_cert,
+                "cert": {
+                    "cert": cert,
+                    "key": key,
+                },
+            }
+
+            etcdkeysize = self.module_executor("etcdkeysize", args, task_vars)
+
+            if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
+                msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
+                reason = etcdkeysize.get("msg")
+                if etcdkeysize.get("module_stderr"):
+                    reason = etcdkeysize["module_stderr"]
+
+                msg = msg.format(host=etcd_host, reason=reason)
+                return {"failed": True, "changed": False, "msg": msg}
+
+            if etcdkeysize["size_limit_exceeded"]:
+                limit = self._to_gigabytes(etcd_imagedata_size_limit)
+                msg = ("The size of OpenShift image data stored in etcd host "
+                       "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+                       "Use the `oadm prune images` command to cleanup unused Docker images.")
+                return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
+
+        return {"changed": False}
+
+    @staticmethod
+    def _get_etcd_mountpath(ansible_mounts):
+        valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+        mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+        for path in valid_etcd_mount_paths:
+            if path in mount_for_path:
+                return mount_for_path[path]
+
+        paths = ', '.join(sorted(mount_for_path)) or 'none'
+        msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths)
+        raise OpenShiftCheckException(msg)
+
+    @staticmethod
+    def _to_gigabytes(byte_size):
+        return float(byte_size) / 10.0**9
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
new file mode 100644
index 000000000..7452c9cc1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -0,0 +1,58 @@
+"""A health check for OpenShift clusters."""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdVolume(OpenShiftCheck):
+    """Ensures etcd storage usage does not exceed a given threshold."""
+
+    name = "etcd_volume"
+    tags = ["etcd", "health"]
+
+    # Default device usage threshold. Value should be in the range [0, 100].
+    default_threshold_percent = 90
+    # Where to find etcd data, higher priority first.
+    supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+    @classmethod
+    def is_active(cls, task_vars):
+        etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters",
+                                                                                 default=[]) or []
+        is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts
+        return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host
+
+    def run(self, tmp, task_vars):
+        mount_info = self._etcd_mount_info(task_vars)
+        available = mount_info["size_available"]
+        total = mount_info["size_total"]
+        used = total - available
+
+        threshold = get_var(
+            task_vars,
+            "etcd_device_usage_threshold_percent",
+            default=self.default_threshold_percent
+        )
+
+        used_percent = 100.0 * used / total
+
+        if used_percent > threshold:
+            device = mount_info.get("device", "unknown")
+            mount = mount_info.get("mount", "unknown")
+            msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
+                used_percent, threshold, device, mount
+            )
+            return {"failed": True, "msg": msg}
+
+        return {"changed": False}
+
+    def _etcd_mount_info(self, task_vars):
+        ansible_mounts = get_var(task_vars, "ansible_mounts")
+        mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+
+        for path in self.supported_mount_paths:
+            if path in mounts:
+                return mounts[path]
+
+        paths = ', '.join(sorted(mounts)) or 'none'
+        msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
+        raise OpenShiftCheckException(msg)
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index 28805dc37..f4e31065f 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,6 +1,9 @@
 # pylint: disable=missing-docstring
 from openshift_checks import OpenShiftCheck, get_var
 
+MIB = 2**20
+GIB = 2**30
+
 
 class MemoryAvailability(OpenShiftCheck):
     """Check that recommended memory is available."""
@@ -11,10 +14,12 @@ class MemoryAvailability(OpenShiftCheck):
     # Values taken from the official installation documentation:
     # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
     recommended_memory_bytes = {
-        "masters": 16 * 10**9,
-        "nodes": 8 * 10**9,
-        "etcd": 20 * 10**9,
+        "masters": 16 * GIB,
+        "nodes": 8 * GIB,
+        "etcd": 8 * GIB,
     }
+    # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
+    memtotal_adjustment = 1 * GIB
 
     @classmethod
     def is_active(cls, task_vars):
@@ -25,19 +30,21 @@ class MemoryAvailability(OpenShiftCheck):
 
     def run(self, tmp, task_vars):
         group_names = get_var(task_vars, "group_names")
-        total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6
+        total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB
 
-        min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+        recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+        configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB
+        min_memory_bytes = configured_min or recommended_min
 
-        if total_memory_bytes < min_memory_bytes:
+        if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
             return {
                 'failed': True,
                 'msg': (
-                    'Available memory ({available:.1f} GB) '
-                    'below recommended value ({recommended:.1f} GB)'
+                    'Available memory ({available:.1f} GiB) is too far '
+                    'below recommended value ({recommended:.1f} GiB)'
                 ).format(
-                    available=float(total_memory_bytes) / 10**9,
-                    recommended=float(min_memory_bytes) / 10**9,
+                    available=float(total_memory_bytes) / GIB,
+                    recommended=float(min_memory_bytes) / GIB,
                 ),
             }