summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker/openshift_checks
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--roles/openshift_health_checker/openshift_checks/disk_availability.py6
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_image_availability.py172
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py84
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_volume.py58
-rw-r--r--roles/openshift_health_checker/openshift_checks/memory_availability.py27
5 files changed, 249 insertions, 98 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index c2792a0fe..962148cb8 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -27,10 +27,12 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
def run(self, tmp, task_vars):
group_names = get_var(task_vars, "group_names")
ansible_mounts = get_var(task_vars, "ansible_mounts")
-
- min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
free_bytes = self.openshift_available_disk(ansible_mounts)
+ recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
+ configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9
+ min_free_bytes = configured_min or recommended_min
+
if free_bytes < min_free_bytes:
return {
'failed': True,
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..4588ed634 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -13,41 +13,55 @@ class DockerImageAvailability(OpenShiftCheck):
name = "docker_image_availability"
tags = ["preflight"]
- skopeo_image = "openshift/openshift-ansible"
+ dependencies = ["skopeo", "python-docker-py"]
- # FIXME(juanvallejo): we should consider other possible values of
- # `deployment_type` (the key here). See
- # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
- docker_image_base = {
+ deployment_image_info = {
"origin": {
- "repo": "openshift",
- "image": "origin",
+ "namespace": "openshift",
+ "name": "origin",
},
"openshift-enterprise": {
- "repo": "openshift3",
- "image": "ose",
+ "namespace": "openshift3",
+ "name": "ose",
},
}
- def run(self, tmp, task_vars):
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts with unsupported deployment types."""
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ has_valid_deployment_type = deployment_type in cls.deployment_image_info
- # exit early if all images were found locally
- if not missing_images:
- return {"changed": False}
+ return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type
- msg, failed, changed = self.update_skopeo_image(task_vars)
+ def run(self, tmp, task_vars):
+ msg, failed, changed = self.ensure_dependencies(task_vars)
# exit early if Skopeo update fails
if failed:
+ if "No package matching" in msg:
+ msg = "Ensure that all required dependencies can be installed via `yum`.\n"
return {
"failed": True,
"changed": changed,
- "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+ "msg": (
+ "Unable to update or install required dependency packages on this host;\n"
+ "These are required in order to check Docker image availability:"
+ "\n {deps}\n{msg}"
+ ).format(deps=',\n '.join(self.dependencies), msg=msg),
}
+ required_images = self.required_images(task_vars)
+ missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+
+ # exit early if all images were found locally
+ if not missing_images:
+ return {"changed": changed}
+
registries = self.known_docker_registries(task_vars)
+ if not registries:
+ return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+
available_images = self.available_images(missing_images, registries, task_vars)
unavailable_images = set(missing_images) - set(available_images)
@@ -55,44 +69,60 @@ class DockerImageAvailability(OpenShiftCheck):
return {
"failed": True,
"msg": (
- "One or more required images are not available: {}.\n"
+ "One or more required Docker images are not available:\n {}\n"
"Configured registries: {}"
- ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
+ ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)),
"changed": changed,
}
return {"changed": changed}
def required_images(self, task_vars):
- deployment_type = get_var(task_vars, "deployment_type")
- # FIXME(juanvallejo): we should handle gracefully with a proper error
- # message when given an unexpected value for `deployment_type`.
- image_base_name = self.docker_image_base[deployment_type]
-
- openshift_release = get_var(task_vars, "openshift_release")
- # FIXME(juanvallejo): this variable is not required when the
- # installation is non-containerized. The example inventories have it
- # commented out. We should handle gracefully and with a proper error
- # message when this variable is required and not set.
- openshift_image_tag = get_var(task_vars, "openshift_image_tag")
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ image_info = self.deployment_image_info[deployment_type]
+ openshift_release = get_var(task_vars, "openshift_release", default="latest")
+ openshift_image_tag = get_var(task_vars, "openshift_image_tag")
is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- if is_containerized:
- images = set(self.containerized_docker_images(image_base_name, openshift_release))
- else:
- images = set(self.rpm_docker_images(image_base_name, openshift_release))
+ images = set(self.required_docker_images(
+ image_info["namespace"],
+ image_info["name"],
+ ["registry-console"] if "enterprise" in deployment_type else [], # include enterprise-only image names
+ openshift_release,
+ is_containerized,
+ ))
# append images with qualified image tags to our list of required images.
# these are images with a (v0.0.0.0) tag, rather than a standard release
# format tag (v0.0). We want to check this set in both containerized and
# non-containerized installations.
images.update(
- self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
+ self.required_qualified_docker_images(
+ image_info["namespace"],
+ image_info["name"],
+ openshift_image_tag,
+ ),
)
return images
+ @staticmethod
+ def required_docker_images(namespace, name, additional_image_names, version, is_containerized):
+ if is_containerized:
+ return ["{}/{}:{}".format(namespace, name, version)] if name else []
+
+ # include additional non-containerized images specific to the current deployment type
+ return ["{}/{}:{}".format(namespace, img_name, version) for img_name in additional_image_names]
+
+ @staticmethod
+ def required_qualified_docker_images(namespace, name, version):
+ # pylint: disable=invalid-name
+ return [
+ "{}/{}-{}:{}".format(namespace, name, suffix, version)
+ for suffix in ["haproxy-router", "docker-registry", "deployer", "pod"]
+ ]
+
def local_images(self, images, task_vars):
"""Filter a list of images and return those available locally."""
return [
@@ -107,31 +137,26 @@ class DockerImageAvailability(OpenShiftCheck):
return bool(result.get("images", []))
- def known_docker_registries(self, task_vars):
- result = self.module_executor("docker_info", {}, task_vars)
+ @staticmethod
+ def known_docker_registries(task_vars):
+ docker_facts = get_var(task_vars, "openshift", "docker")
+ regs = set(docker_facts["additional_registries"])
- if result.get("failed", False):
- return []
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ if deployment_type == "origin":
+ regs.update(["docker.io"])
+ elif "enterprise" in deployment_type:
+ regs.update(["registry.access.redhat.com"])
- # FIXME(juanvallejo): wrong default type, result["info"] is expected to
- # contain a dictionary (see how we call `docker_info.get` below).
- docker_info = result.get("info", "")
- return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
+ return list(regs)
def available_images(self, images, registries, task_vars):
"""Inspect existing images using Skopeo and return all images successfully inspected."""
return [
image for image in images
- if self.is_image_available(image, registries, task_vars)
+ if any(self.is_available_skopeo_image(image, registry, task_vars) for registry in registries)
]
- def is_image_available(self, image, registries, task_vars):
- for registry in registries:
- if self.is_available_skopeo_image(image, registry, task_vars):
- return True
-
- return False
-
def is_available_skopeo_image(self, image, registry, task_vars):
"""Uses Skopeo to determine if required image exists in a given registry."""
@@ -140,40 +165,15 @@ class DockerImageAvailability(OpenShiftCheck):
image=image,
)
- args = {
- "name": "skopeo_inspect",
- "image": self.skopeo_image,
- "command": cmd_str,
- "detach": False,
- "cleanup": True,
- }
- result = self.module_executor("docker_container", args, task_vars)
- return result.get("failed", False)
-
- def containerized_docker_images(self, base_name, version):
- return [
- "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
- ]
+ args = {"_raw_params": cmd_str}
+ result = self.module_executor("command", args, task_vars)
+ return not result.get("failed", False) and result.get("rc", 0) == 0
- @staticmethod
- def rpm_docker_images(base, version):
- return [
- "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
- ]
+ # ensures that the skopeo and python-docker-py packages exist
+ # check is skipped on atomic installations
+ def ensure_dependencies(self, task_vars):
+ if get_var(task_vars, "openshift", "common", "is_atomic"):
+ return "", False, False
- @staticmethod
- def qualified_docker_images(image_name, version):
- return [
- "{}-{}:{}".format(image_name, component, version)
- for component in "haproxy-router docker-registry deployer pod".split()
- ]
-
- @staticmethod
- def image_from_base_name(base):
- return "".join([base["repo"], "/", base["image"]])
-
- # ensures that the skopeo docker image exists, and updates it
- # with latest if image was already present locally.
- def update_skopeo_image(self, task_vars):
- result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
- return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
+ result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars)
+ return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed")
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
new file mode 100644
index 000000000..c04a69765
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -0,0 +1,84 @@
+"""
+Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+ """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster"""
+
+ name = "etcd_imagedata_size"
+ tags = ["etcd"]
+
+ def run(self, tmp, task_vars):
+ etcd_mountpath = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts"))
+ etcd_avail_diskspace = etcd_mountpath["size_available"]
+ etcd_total_diskspace = etcd_mountpath["size_total"]
+
+ etcd_imagedata_size_limit = get_var(task_vars,
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)))
+
+ etcd_is_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = get_var(task_vars, "openshift", "master", "etcd_hosts")
+
+ config_base = get_var(task_vars, "openshift", "common", "config_base")
+
+ cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt")
+ key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key")
+ ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt")
+
+ for etcd_host in list(etcd_hosts):
+ args = {
+ "size_limit_bytes": etcd_imagedata_size_limit,
+ "paths": ["/openshift.io/images", "/openshift.io/imagestreams"],
+ "host": etcd_host,
+ "port": etcd_port,
+ "protocol": "https" if etcd_is_ssl else "http",
+ "version_prefix": "/v2",
+ "allow_redirect": True,
+ "ca_cert": ca_cert,
+ "cert": {
+ "cert": cert,
+ "key": key,
+ },
+ }
+
+ etcdkeysize = self.module_executor("etcdkeysize", args, task_vars)
+
+ if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
+ msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
+ reason = etcdkeysize.get("msg")
+ if etcdkeysize.get("module_stderr"):
+ reason = etcdkeysize["module_stderr"]
+
+ msg = msg.format(host=etcd_host, reason=reason)
+ return {"failed": True, "changed": False, "msg": msg}
+
+ if etcdkeysize["size_limit_exceeded"]:
+ limit = self._to_gigabytes(etcd_imagedata_size_limit)
+ msg = ("The size of OpenShift image data stored in etcd host "
+ "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+ "Use the `oadm prune images` command to cleanup unused Docker images.")
+ return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
+
+ return {"changed": False}
+
+ @staticmethod
+ def _get_etcd_mountpath(ansible_mounts):
+ valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+ mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+ for path in valid_etcd_mount_paths:
+ if path in mount_for_path:
+ return mount_for_path[path]
+
+ paths = ', '.join(sorted(mount_for_path)) or 'none'
+ msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths)
+ raise OpenShiftCheckException(msg)
+
+ @staticmethod
+ def _to_gigabytes(byte_size):
+ return float(byte_size) / 10.0**9
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
new file mode 100644
index 000000000..7452c9cc1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -0,0 +1,58 @@
+"""A health check for OpenShift clusters."""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdVolume(OpenShiftCheck):
+ """Ensures etcd storage usage does not exceed a given threshold."""
+
+ name = "etcd_volume"
+ tags = ["etcd", "health"]
+
+ # Default device usage threshold. Value should be in the range [0, 100].
+ default_threshold_percent = 90
+ # Where to find ectd data, higher priority first.
+ supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+ @classmethod
+ def is_active(cls, task_vars):
+ etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters",
+ default=[]) or []
+ is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts
+ return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host
+
+ def run(self, tmp, task_vars):
+ mount_info = self._etcd_mount_info(task_vars)
+ available = mount_info["size_available"]
+ total = mount_info["size_total"]
+ used = total - available
+
+ threshold = get_var(
+ task_vars,
+ "etcd_device_usage_threshold_percent",
+ default=self.default_threshold_percent
+ )
+
+ used_percent = 100.0 * used / total
+
+ if used_percent > threshold:
+ device = mount_info.get("device", "unknown")
+ mount = mount_info.get("mount", "unknown")
+ msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
+ used_percent, threshold, device, mount
+ )
+ return {"failed": True, "msg": msg}
+
+ return {"changed": False}
+
+ def _etcd_mount_info(self, task_vars):
+ ansible_mounts = get_var(task_vars, "ansible_mounts")
+ mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+
+ for path in self.supported_mount_paths:
+ if path in mounts:
+ return mounts[path]
+
+ paths = ', '.join(sorted(mounts)) or 'none'
+ msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
+ raise OpenShiftCheckException(msg)
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index 28805dc37..f4e31065f 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,6 +1,9 @@
# pylint: disable=missing-docstring
from openshift_checks import OpenShiftCheck, get_var
+MIB = 2**20
+GIB = 2**30
+
class MemoryAvailability(OpenShiftCheck):
"""Check that recommended memory is available."""
@@ -11,10 +14,12 @@ class MemoryAvailability(OpenShiftCheck):
# Values taken from the official installation documentation:
# https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
recommended_memory_bytes = {
- "masters": 16 * 10**9,
- "nodes": 8 * 10**9,
- "etcd": 20 * 10**9,
+ "masters": 16 * GIB,
+ "nodes": 8 * GIB,
+ "etcd": 8 * GIB,
}
+ # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
+ memtotal_adjustment = 1 * GIB
@classmethod
def is_active(cls, task_vars):
@@ -25,19 +30,21 @@ class MemoryAvailability(OpenShiftCheck):
def run(self, tmp, task_vars):
group_names = get_var(task_vars, "group_names")
- total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6
+ total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB
- min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB
+ min_memory_bytes = configured_min or recommended_min
- if total_memory_bytes < min_memory_bytes:
+ if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
return {
'failed': True,
'msg': (
- 'Available memory ({available:.1f} GB) '
- 'below recommended value ({recommended:.1f} GB)'
+ 'Available memory ({available:.1f} GiB) is too far '
+ 'below recommended value ({recommended:.1f} GiB)'
).format(
- available=float(total_memory_bytes) / 10**9,
- recommended=float(min_memory_bytes) / 10**9,
+ available=float(total_memory_bytes) / GIB,
+ recommended=float(min_memory_bytes) / GIB,
),
}