summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker/openshift_checks
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--roles/openshift_health_checker/openshift_checks/__init__.py98
-rw-r--r--roles/openshift_health_checker/openshift_checks/disk_availability.py114
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_image_availability.py257
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_storage.py298
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py85
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_traffic.py44
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_volume.py55
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/__init__.py0
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/curator.py54
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py210
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/fluentd.py167
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/kibana.py226
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging.py92
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py130
-rw-r--r--roles/openshift_health_checker/openshift_checks/memory_availability.py50
-rw-r--r--roles/openshift_health_checker/openshift_checks/mixins.py59
-rw-r--r--roles/openshift_health_checker/openshift_checks/ovs_version.py77
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_availability.py22
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_update.py8
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_version.py122
20 files changed, 1976 insertions, 192 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..40a28cde5 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -19,15 +19,21 @@ class OpenShiftCheckException(Exception):
@six.add_metaclass(ABCMeta)
class OpenShiftCheck(object):
- """A base class for defining checks for an OpenShift cluster environment."""
+ """
+ A base class for defining checks for an OpenShift cluster environment.
+
+ Expect optional params: method execute_module, dict task_vars, and string tmp.
+ execute_module is expected to have a signature compatible with _execute_module
+ from ansible plugins/action/__init__.py, e.g.:
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
+ This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
+ which provides the check's stored task_vars and tmp.
+ """
- def __init__(self, execute_module=None, module_executor=None):
- if execute_module is module_executor is None:
- raise TypeError(
- "__init__() takes either execute_module (recommended) "
- "or module_executor (deprecated), none given")
- self.execute_module = execute_module or module_executor
- self.module_executor = self.execute_module
+ def __init__(self, execute_module=None, task_vars=None, tmp=None):
+ self._execute_module = execute_module
+ self.task_vars = task_vars or {}
+ self.tmp = tmp
@abstractproperty
def name(self):
@@ -43,13 +49,13 @@ class OpenShiftCheck(object):
"""
return []
- @classmethod
- def is_active(cls, task_vars): # pylint: disable=unused-argument
+ @staticmethod
+ def is_active():
"""Returns true if this check applies to the ansible-playbook run."""
return True
@abstractmethod
- def run(self, tmp, task_vars):
+ def run(self):
"""Executes a check, normally implemented as a module."""
return {}
@@ -62,34 +68,64 @@ class OpenShiftCheck(object):
for subclass in subclass.subclasses():
yield subclass
+ def execute_module(self, module_name=None, module_args=None):
+ """Invoke an Ansible module from a check.
+
+ Invoke stored _execute_module, normally copied from the action
+ plugin, with its params and the task_vars and tmp given at
+ check initialization. No positional parameters beyond these
+ are specified. If it's necessary to specify any of the other
+ parameters to _execute_module then that should just be invoked
+ directly (with awareness of changes in method signature per
+ Ansible version).
+
+ So e.g. check.execute_module("foo", dict(arg1=...))
+ Return: result hash from module execution.
+ """
+ if self._execute_module is None:
+ raise NotImplementedError(
+ self.__class__.__name__ +
+ " invoked execute_module without providing the method at initialization."
+ )
+ return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+
+ def get_var(self, *keys, **kwargs):
+ """Get deeply nested values from task_vars.
+
+ Ansible task_vars structures are Python dicts, often mapping strings to
+ other dicts. This helper makes it easier to get a nested value, raising
+ OpenShiftCheckException when a key is not found or returning a default value
+ provided as a keyword argument.
+ """
+ try:
+ value = reduce(operator.getitem, keys, self.task_vars)
+ except (KeyError, TypeError):
+ if "default" in kwargs:
+ return kwargs["default"]
+ raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
+ return value
+
LOADER_EXCLUDES = (
"__init__.py",
"mixins.py",
+ "logging.py",
)
-def load_checks():
+def load_checks(path=None, subpkg=""):
"""Dynamically import all check modules for the side effect of registering checks."""
- return [
- import_module(__package__ + "." + name[:-3])
- for name in os.listdir(os.path.dirname(__file__))
- if name.endswith(".py") and name not in LOADER_EXCLUDES
- ]
+ if path is None:
+ path = os.path.dirname(__file__)
+ modules = []
-def get_var(task_vars, *keys, **kwargs):
- """Helper function to get deeply nested values from task_vars.
+ for name in os.listdir(path):
+ if os.path.isdir(os.path.join(path, name)):
+ modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+ continue
- Ansible task_vars structures are Python dicts, often mapping strings to
- other dicts. This helper makes it easier to get a nested value, raising
- OpenShiftCheckException when a key is not found or returning a default value
- provided as a keyword argument.
- """
- try:
- value = reduce(operator.getitem, keys, task_vars)
- except (KeyError, TypeError):
- if "default" in kwargs:
- return kwargs["default"]
- raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
- return value
+ if name.endswith(".py") and name not in LOADER_EXCLUDES:
+ modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+ return modules
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
new file mode 100644
index 000000000..283461294
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -0,0 +1,114 @@
+"""Check that there is enough disk space in predefined paths."""
+
+import os.path
+import tempfile
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class DiskAvailability(OpenShiftCheck):
+ """Check that recommended disk space is available before a first-time install."""
+
+ name = "disk_availability"
+ tags = ["preflight"]
+
+ # Values taken from the official installation documentation:
+ # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
+ recommended_disk_space_bytes = {
+ '/var': {
+ 'masters': 40 * 10**9,
+ 'nodes': 15 * 10**9,
+ 'etcd': 20 * 10**9,
+ },
+ # Used to copy client binaries into,
+ # see roles/openshift_cli/library/openshift_container_binary_sync.py.
+ '/usr/local/bin': {
+ 'masters': 1 * 10**9,
+ 'nodes': 1 * 10**9,
+ 'etcd': 1 * 10**9,
+ },
+ # Used as temporary storage in several cases.
+ tempfile.gettempdir(): {
+ 'masters': 1 * 10**9,
+ 'nodes': 1 * 10**9,
+ 'etcd': 1 * 10**9,
+ },
+ }
+
+ def is_active(self):
+ """Skip hosts that do not have recommended disk space requirements."""
+ group_names = self.get_var("group_names", default=[])
+ active_groups = set()
+ for recommendation in self.recommended_disk_space_bytes.values():
+ active_groups.update(recommendation.keys())
+ has_disk_space_recommendation = bool(active_groups.intersection(group_names))
+ return super(DiskAvailability, self).is_active() and has_disk_space_recommendation
+
+ def run(self):
+ group_names = self.get_var("group_names")
+ ansible_mounts = self.get_var("ansible_mounts")
+ ansible_mounts = {mount['mount']: mount for mount in ansible_mounts}
+
+ user_config = self.get_var("openshift_check_min_host_disk_gb", default={})
+ try:
+ # For backwards-compatibility, if openshift_check_min_host_disk_gb
+ # is a number, then it overrides the required config for '/var'.
+ number = float(user_config)
+ user_config = {
+ '/var': {
+ 'masters': number,
+ 'nodes': number,
+ 'etcd': number,
+ },
+ }
+ except TypeError:
+ # If it is not a number, then it should be a nested dict.
+ pass
+
+ # TODO: as suggested in
+ # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
+ # maybe we could support checking disk availability in paths that are
+ # not part of the official recommendation but present in the user
+ # configuration.
+ for path, recommendation in self.recommended_disk_space_bytes.items():
+ free_bytes = self.free_bytes(path, ansible_mounts)
+ recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
+
+ config = user_config.get(path, {})
+ # NOTE: the user config is in GB, but we compare bytes, thus the
+ # conversion.
+ config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
+ recommended_bytes = config_bytes or recommended_bytes
+
+ if free_bytes < recommended_bytes:
+ free_gb = float(free_bytes) / 10**9
+ recommended_gb = float(recommended_bytes) / 10**9
+ return {
+ 'failed': True,
+ 'msg': (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+ }
+
+ return {}
+
+ @staticmethod
+ def free_bytes(path, ansible_mounts):
+ """Return the size available in path based on ansible_mounts."""
+ mount_point = path
+ # arbitry value to prevent an infinite loop, in the unlike case that '/'
+ # is not in ansible_mounts.
+ max_depth = 32
+ while mount_point not in ansible_mounts and max_depth > 0:
+ mount_point = os.path.dirname(mount_point)
+ max_depth -= 1
+
+ try:
+ free_bytes = ansible_mounts[mount_point]['size_available']
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(ansible_mounts)) or 'none'
+ msg = 'Unable to determine disk availability for "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
+
+ return free_bytes
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..77180223e 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,179 +1,178 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that required Docker images are available."""
+from openshift_checks import OpenShiftCheck
+from openshift_checks.mixins import DockerHostMixin
-class DockerImageAvailability(OpenShiftCheck):
+
+NODE_IMAGE_SUFFIXES = ["haproxy-router", "docker-registry", "deployer", "pod"]
+DEPLOYMENT_IMAGE_INFO = {
+ "origin": {
+ "namespace": "openshift",
+ "name": "origin",
+ "registry_console_image": "cockpit/kubernetes",
+ },
+ "openshift-enterprise": {
+ "namespace": "openshift3",
+ "name": "ose",
+ "registry_console_image": "registry.access.redhat.com/openshift3/registry-console",
+ },
+}
+
+
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
- This check attempts to ensure that required docker images are
- either present locally, or able to be pulled down from available
- registries defined in a host machine.
+ Determine docker images that an install would require and check that they
+ are either present in the host's docker index, or available for the host to pull
+ with known registries as defined in our inventory file (or defaults).
"""
name = "docker_image_availability"
tags = ["preflight"]
+ # we use python-docker-py to check local docker for images, and skopeo
+ # to look for images available remotely without waiting to pull them.
+ dependencies = ["python-docker-py", "skopeo"]
- skopeo_image = "openshift/openshift-ansible"
-
- # FIXME(juanvallejo): we should consider other possible values of
- # `deployment_type` (the key here). See
- # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
- docker_image_base = {
- "origin": {
- "repo": "openshift",
- "image": "origin",
- },
- "openshift-enterprise": {
- "repo": "openshift3",
- "image": "ose",
- },
- }
-
- def run(self, tmp, task_vars):
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+ def is_active(self):
+ """Skip hosts with unsupported deployment types."""
+ deployment_type = self.get_var("openshift_deployment_type")
+ has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO
- # exit early if all images were found locally
- if not missing_images:
- return {"changed": False}
-
- msg, failed, changed = self.update_skopeo_image(task_vars)
+ return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
- # exit early if Skopeo update fails
+ def run(self):
+ msg, failed, changed = self.ensure_dependencies()
if failed:
return {
"failed": True,
"changed": changed,
- "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+ "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
- registries = self.known_docker_registries(task_vars)
- available_images = self.available_images(missing_images, registries, task_vars)
+ required_images = self.required_images()
+ missing_images = set(required_images) - set(self.local_images(required_images))
+
+ # exit early if all images were found locally
+ if not missing_images:
+ return {"changed": changed}
+
+ registries = self.known_docker_registries()
+ if not registries:
+ return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+
+ available_images = self.available_images(missing_images, registries)
unavailable_images = set(missing_images) - set(available_images)
if unavailable_images:
return {
"failed": True,
"msg": (
- "One or more required images are not available: {}.\n"
+ "One or more required Docker images are not available:\n {}\n"
"Configured registries: {}"
- ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
+ ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)),
"changed": changed,
}
return {"changed": changed}
- def required_images(self, task_vars):
- deployment_type = get_var(task_vars, "deployment_type")
- # FIXME(juanvallejo): we should handle gracefully with a proper error
- # message when given an unexpected value for `deployment_type`.
- image_base_name = self.docker_image_base[deployment_type]
-
- openshift_release = get_var(task_vars, "openshift_release")
- # FIXME(juanvallejo): this variable is not required when the
- # installation is non-containerized. The example inventories have it
- # commented out. We should handle gracefully and with a proper error
- # message when this variable is required and not set.
- openshift_image_tag = get_var(task_vars, "openshift_image_tag")
-
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
-
- if is_containerized:
- images = set(self.containerized_docker_images(image_base_name, openshift_release))
- else:
- images = set(self.rpm_docker_images(image_base_name, openshift_release))
-
- # append images with qualified image tags to our list of required images.
- # these are images with a (v0.0.0.0) tag, rather than a standard release
- # format tag (v0.0). We want to check this set in both containerized and
- # non-containerized installations.
- images.update(
- self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
- )
-
- return images
-
- def local_images(self, images, task_vars):
+ def required_images(self):
+ """
+ Determine which images we expect to need for this host.
+ Returns: a set of required images like 'openshift/origin:v3.6'
+
+ The thorny issue of determining the image names from the variables is under consideration
+ via https://github.com/openshift/openshift-ansible/issues/4415
+
+ For now we operate as follows:
+ * For containerized components (master, node, ...) we look at the deployment type and
+ use openshift/origin or openshift3/ose as the base for those component images. The
+ version is openshift_image_tag as determined by the openshift_version role.
+ * For OpenShift-managed infrastructure (router, registry...) we use oreg_url if
+ it is defined; otherwise we again use the base that depends on the deployment type.
+ Registry is not included in constructed images. It may be in oreg_url or etcd image.
+ """
+ required = set()
+ deployment_type = self.get_var("openshift_deployment_type")
+ host_groups = self.get_var("group_names")
+ # containerized etcd may not have openshift_image_tag, see bz 1466622
+ image_tag = self.get_var("openshift_image_tag", default="latest")
+ image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
+ if not image_info:
+ return required
+
+ # template for images that run on top of OpenShift
+ image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
+ image_url = self.get_var("oreg_url", default="") or image_url
+ if 'nodes' in host_groups:
+ for suffix in NODE_IMAGE_SUFFIXES:
+ required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
+ # The registry-console is for some reason not prefixed with ose- like the other components.
+ # Nor is it versioned the same, so just look for latest.
+ # Also a completely different name is used for Origin.
+ required.add(image_info["registry_console_image"])
+
+ # images for containerized components
+ if self.get_var("openshift", "common", "is_containerized"):
+ components = set()
+ if 'nodes' in host_groups:
+ components.update(["node", "openvswitch"])
+ if 'masters' in host_groups: # name is "origin" or "ose"
+ components.add(image_info["name"])
+ for component in components:
+ required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag))
+ if 'etcd' in host_groups: # special case, note it is the same for origin/enterprise
+ required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag
+
+ return required
+
+ def local_images(self, images):
"""Filter a list of images and return those available locally."""
return [
image for image in images
- if self.is_image_local(image, task_vars)
+ if self.is_image_local(image)
]
- def is_image_local(self, image, task_vars):
- result = self.module_executor("docker_image_facts", {"name": image}, task_vars)
+ def is_image_local(self, image):
+ """Check if image is already in local docker index."""
+ result = self.execute_module("docker_image_facts", {"name": image})
if result.get("failed", False):
return False
return bool(result.get("images", []))
- def known_docker_registries(self, task_vars):
- result = self.module_executor("docker_info", {}, task_vars)
+ def known_docker_registries(self):
+ """Build a list of docker registries available according to inventory vars."""
+ docker_facts = self.get_var("openshift", "docker")
+ regs = set(docker_facts["additional_registries"])
- if result.get("failed", False):
- return []
+ deployment_type = self.get_var("openshift_deployment_type")
+ if deployment_type == "origin":
+ regs.update(["docker.io"])
+ elif "enterprise" in deployment_type:
+ regs.update(["registry.access.redhat.com"])
- # FIXME(juanvallejo): wrong default type, result["info"] is expected to
- # contain a dictionary (see how we call `docker_info.get` below).
- docker_info = result.get("info", "")
- return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
+ return list(regs)
- def available_images(self, images, registries, task_vars):
- """Inspect existing images using Skopeo and return all images successfully inspected."""
+ def available_images(self, images, default_registries):
+ """Search remotely for images. Returns: list of images found."""
return [
image for image in images
- if self.is_image_available(image, registries, task_vars)
+ if self.is_available_skopeo_image(image, default_registries)
]
- def is_image_available(self, image, registries, task_vars):
+ def is_available_skopeo_image(self, image, default_registries):
+ """Use Skopeo to determine if required image exists in known registry(s)."""
+ registries = default_registries
+
+ # if image already includes a registry, only use that
+ if image.count("/") > 1:
+ registry, image = image.split("/", 1)
+ registries = [registry]
+
for registry in registries:
- if self.is_available_skopeo_image(image, registry, task_vars):
+ args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)}
+ result = self.execute_module("command", args)
+ if result.get("rc", 0) == 0 and not result.get("failed"):
return True
return False
-
- def is_available_skopeo_image(self, image, registry, task_vars):
- """Uses Skopeo to determine if required image exists in a given registry."""
-
- cmd_str = "skopeo inspect docker://{registry}/{image}".format(
- registry=registry,
- image=image,
- )
-
- args = {
- "name": "skopeo_inspect",
- "image": self.skopeo_image,
- "command": cmd_str,
- "detach": False,
- "cleanup": True,
- }
- result = self.module_executor("docker_container", args, task_vars)
- return result.get("failed", False)
-
- def containerized_docker_images(self, base_name, version):
- return [
- "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
- ]
-
- @staticmethod
- def rpm_docker_images(base, version):
- return [
- "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
- ]
-
- @staticmethod
- def qualified_docker_images(image_name, version):
- return [
- "{}-{}:{}".format(image_name, component, version)
- for component in "haproxy-router docker-registry deployer pod".split()
- ]
-
- @staticmethod
- def image_from_base_name(base):
- return "".join([base["repo"], "/", base["image"]])
-
- # ensures that the skopeo docker image exists, and updates it
- # with latest if image was already present locally.
- def update_skopeo_image(self, task_vars):
- result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
- return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
new file mode 100644
index 000000000..dea15a56e
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -0,0 +1,298 @@
+"""Check Docker storage driver and usage."""
+import json
+import os.path
+import re
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import DockerHostMixin
+
+
+class DockerStorage(DockerHostMixin, OpenShiftCheck):
+ """Check Docker storage driver compatibility.
+
+ This check ensures that Docker is using a supported storage driver,
+ and that loopback is not being used (if using devicemapper).
+ Also that storage usage is not above threshold.
+ """
+
+ name = "docker_storage"
+ tags = ["pre-install", "health", "preflight"]
+
+ dependencies = ["python-docker-py"]
+ storage_drivers = ["devicemapper", "overlay", "overlay2"]
+ max_thinpool_data_usage_percent = 90.0
+ max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
+
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
+
+ def run(self):
+ msg, failed, changed = self.ensure_dependencies()
+ if failed:
+ return {
+ "failed": True,
+ "changed": changed,
+ "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
+ }
+
+ # attempt to get the docker info hash from the API
+ docker_info = self.execute_module("docker_info", {})
+ if docker_info.get("failed"):
+ return {"failed": True, "changed": changed,
+ "msg": "Failed to query Docker API. Is docker running on this host?"}
+ if not docker_info.get("info"): # this would be very strange
+ return {"failed": True, "changed": changed,
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
+
+ # check if the storage driver we saw is valid
+ driver = docker_info.get("Driver", "[NONE]")
+ if driver not in self.storage_drivers:
+ msg = (
+ "Detected unsupported Docker storage driver '{driver}'.\n"
+ "Supported storage drivers are: {drivers}"
+ ).format(driver=driver, drivers=', '.join(self.storage_drivers))
+ return {"failed": True, "changed": changed, "msg": msg}
+
+ # driver status info is a list of tuples; convert to dict and validate based on driver
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
+ if driver == "devicemapper":
+ result = self.check_devicemapper_support(driver_status)
+
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status)
+
+ result['changed'] = result.get('changed', False) or changed
+ return result
+
+ def check_devicemapper_support(self, driver_status):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status)
+ return result
+
+ def check_dm_usage(self, driver_status):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
+ Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
+ implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
+ devicemapper storage. The LV is "thin" because it does not use all available storage
+ from its VG, instead expanding as needed; so to determine available space, we gather
+ current usage as the Docker API reports for the driver as well as space available for
+ expansion in the pool's VG.
+ Usage within the LV is divided into pools allocated to data and metadata, either of which
+ could run out of space first; so we check both.
+ """
+ vals = dict(
+ vg_free=self.get_vg_free(driver_status.get("Pool Name")),
+ data_used=driver_status.get("Data Space Used"),
+ data_total=driver_status.get("Data Space Total"),
+ metadata_used=driver_status.get("Metadata Space Used"),
+ metadata_total=driver_status.get("Metadata Space Total"),
+ )
+
+ # convert all human-readable strings to bytes
+ for key, value in vals.copy().items():
+ try:
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
+ except ValueError as err: # unlikely to hit this from API info, but just to be safe
+ return {
+ "failed": True,
+ "values": vals,
+ "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
+ }
+
+ # determine the threshold percentages which usage should not exceed
+ for name, default in [("data", self.max_thinpool_data_usage_percent),
+ ("metadata", self.max_thinpool_meta_usage_percent)]:
+ percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default)
+ try:
+ vals[name + "_threshold"] = float(percent)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
+ }
+
+ # test whether the thresholds are exceeded
+ messages = []
+ for name in ["data", "metadata"]:
+ vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
+ vals[name + "_total_bytes"] + vals["vg_free_bytes"])
+ if vals[name + "_pct_used"] > vals[name + "_threshold"]:
+ messages.append(
+ "Docker thinpool {name} usage percentage {pct:.1f} "
+ "is higher than threshold {thresh:.1f}.".format(
+ name=name,
+ pct=vals[name + "_pct_used"],
+ thresh=vals[name + "_threshold"],
+ ))
+ vals["failed"] = True
+
+ vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
+ return vals
+
+ def get_vg_free(self, pool):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
+ match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
+ if not match: # unlikely, but... be clear if we assumed wrong
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{}'.\n"
+ "However this name does not have the expected format of 'vgname-lvname'\n"
+ "so the available storage in the VG cannot be determined.".format(pool)
+ )
+ vg_name = match.groups()[0].replace("--", "-")
+ vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
+ # should return free space like " 12.00g" if the VG exists; empty if it does not
+
+ ret = self.execute_module("command", {"_raw_params": vgs_cmd})
+ if ret.get("failed") or ret.get("rc", 0) != 0:
+ raise OpenShiftCheckException(
+ "Is LVM installed? Failed to run /sbin/vgs "
+ "to determine docker storage usage:\n" + ret.get("msg", "")
+ )
+ size = ret.get("stdout", "").strip()
+ if not size:
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{pool}'.\n"
+ "which we expect to come from local VG '{vg}'.\n"
+ "However, /sbin/vgs did not find this VG. Is Docker for this host"
+ "running and using the storage on the host?".format(pool=pool, vg=vg_name)
+ )
+ return size
+
+ @staticmethod
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
+ units = dict(
+ b=1,
+ k=1024,
+ m=1024**2,
+ g=1024**3,
+ t=1024**4,
+ p=1024**5,
+ )
+ string = string or ""
+ match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit
+ if not match:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ number, unit = match.groups()
+ multiplier = 1 if not unit else units.get(unit.lower())
+ if not multiplier:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info)
+
+ def check_overlay_usage(self, docker_info):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path, self.get_var("ansible_mounts"))
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
+
+ # TODO(lmeyer): migrate to base class
+ @staticmethod
+ def find_ansible_mount(path, ansible_mounts):
+ """Return the mount point for path from ansible_mounts."""
+
+ mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
+ mount_point = path
+ while mount_point not in mount_for_path:
+ if mount_point in ["/", ""]: # "/" not in ansible_mounts???
+ break
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ return mount_for_path[mount_point]
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
+ msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
new file mode 100644
index 000000000..28c38504d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -0,0 +1,85 @@
+"""
+Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+ """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster"""
+
+ name = "etcd_imagedata_size"
+ tags = ["etcd"]
+
+ def run(self):
+ etcd_mountpath = self._get_etcd_mountpath(self.get_var("ansible_mounts"))
+ etcd_avail_diskspace = etcd_mountpath["size_available"]
+ etcd_total_diskspace = etcd_mountpath["size_total"]
+
+ etcd_imagedata_size_limit = self.get_var(
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace))
+ )
+
+ etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = self.get_var("openshift", "master", "etcd_hosts")
+
+ config_base = self.get_var("openshift", "common", "config_base")
+
+ cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt")
+ key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key")
+ ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt")
+
+ for etcd_host in list(etcd_hosts):
+ args = {
+ "size_limit_bytes": etcd_imagedata_size_limit,
+ "paths": ["/openshift.io/images", "/openshift.io/imagestreams"],
+ "host": etcd_host,
+ "port": etcd_port,
+ "protocol": "https" if etcd_is_ssl else "http",
+ "version_prefix": "/v2",
+ "allow_redirect": True,
+ "ca_cert": ca_cert,
+ "cert": {
+ "cert": cert,
+ "key": key,
+ },
+ }
+
+ etcdkeysize = self.execute_module("etcdkeysize", args)
+
+ if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
+ msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
+ reason = etcdkeysize.get("msg")
+ if etcdkeysize.get("module_stderr"):
+ reason = etcdkeysize["module_stderr"]
+
+ msg = msg.format(host=etcd_host, reason=reason)
+ return {"failed": True, "changed": False, "msg": msg}
+
+ if etcdkeysize["size_limit_exceeded"]:
+ limit = self._to_gigabytes(etcd_imagedata_size_limit)
+ msg = ("The size of OpenShift image data stored in etcd host "
+ "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+ "Use the `oadm prune images` command to cleanup unused Docker images.")
+ return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
+
+ return {"changed": False}
+
+ @staticmethod
+ def _get_etcd_mountpath(ansible_mounts):
+ valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+ mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+ for path in valid_etcd_mount_paths:
+ if path in mount_for_path:
+ return mount_for_path[path]
+
+ paths = ', '.join(sorted(mount_for_path)) or 'none'
+ msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths)
+ raise OpenShiftCheckException(msg)
+
+ @staticmethod
+ def _to_gigabytes(byte_size):
+ return float(byte_size) / 10.0**9
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..cc1b14d8a
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,44 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ def is_active(self):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = self.get_var("group_names", default=[])
+ valid_group_names = "etcd" in group_names
+
+ version = self.get_var("openshift", "common", "short_version")
+ valid_version = version in ("3.4", "3.5", "1.4", "1.5")
+
+ return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version
+
+ def run(self):
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {"log_matchers": log_matchers})
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
new file mode 100644
index 000000000..da7d0364a
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -0,0 +1,55 @@
+"""A health check for OpenShift clusters."""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class EtcdVolume(OpenShiftCheck):
+ """Ensures etcd storage usage does not exceed a given threshold."""
+
+ name = "etcd_volume"
+ tags = ["etcd", "health"]
+
+ # Default device usage threshold. Value should be in the range [0, 100].
+ default_threshold_percent = 90
+ # Where to find ectd data, higher priority first.
+ supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+ def is_active(self):
+ etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or []
+ is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts
+ return super(EtcdVolume, self).is_active() and is_etcd_host
+
+ def run(self):
+ mount_info = self._etcd_mount_info()
+ available = mount_info["size_available"]
+ total = mount_info["size_total"]
+ used = total - available
+
+ threshold = self.get_var(
+ "etcd_device_usage_threshold_percent",
+ default=self.default_threshold_percent
+ )
+
+ used_percent = 100.0 * used / total
+
+ if used_percent > threshold:
+ device = mount_info.get("device", "unknown")
+ mount = mount_info.get("mount", "unknown")
+ msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
+ used_percent, threshold, device, mount
+ )
+ return {"failed": True, "msg": msg}
+
+ return {"changed": False}
+
+ def _etcd_mount_info(self):
+ ansible_mounts = self.get_var("ansible_mounts")
+ mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+
+ for path in self.supported_mount_paths:
+ if path in mounts:
+ return mounts[path]
+
+ paths = ', '.join(sorted(mounts)) or 'none'
+ msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
+ raise OpenShiftCheckException(msg)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..f82ae64d7
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,54 @@
+"""Check for an aggregated logging Curator deployment"""
+
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Curator(LoggingCheck):
+ """Check for an aggregated logging Curator deployment"""
+
+ name = "curator"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self):
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ curator_pods, error = super(Curator, self).get_pods_for_component(
+ self.logging_namespace,
+ "curator",
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_curator(curator_pods)
+
+ if check_error:
+ msg = ("The following Curator deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+ def check_curator(self, pods):
+ """Check to see if curator is up and working. Returns: error string"""
+ if not pods:
+ return (
+ "There are no Curator pods for the logging stack,\n"
+ "so nothing will prune Elasticsearch indexes.\n"
+ "Is Curator correctly deployed?"
+ )
+
+ not_running = super(Curator, self).not_running_pods(pods)
+ if len(not_running) == len(pods):
+ return (
+ "The Curator pod is not currently in a running state,\n"
+ "so Elasticsearch indexes may increase without bound."
+ )
+ if len(pods) - len(not_running) > 1:
+ return (
+ "There is more than one Curator pod running. This should not normally happen.\n"
+ "Although this doesn't cause any problems, you may want to investigate."
+ )
+
+ return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..1e478c04d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,210 @@
+"""Check for an aggregated logging Elasticsearch deployment"""
+
+import json
+import re
+
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+ """Check for an aggregated logging Elasticsearch deployment"""
+
+ name = "elasticsearch"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+ self.logging_namespace,
+ "es",
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_elasticsearch(es_pods)
+
+ if check_error:
+ msg = ("The following Elasticsearch deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
+
+ def _not_running_elasticsearch_pods(self, es_pods):
+ """Returns: list of pods that are not running, list of errors about non-running pods"""
+ not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+ if not_running:
+ return not_running, [(
+ 'The following Elasticsearch pods are not running:\n'
+ '{pods}'
+ 'These pods will not aggregate logs from their nodes.'
+ ).format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))]
+ return not_running, []
+
+ def check_elasticsearch(self, es_pods):
+ """Various checks for elasticsearch. Returns: error string"""
+ not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+ pods_by_name = {
+ pod['metadata']['name']: pod for pod in running_pods
+ # Filter out pods that are not members of a DC
+ if pod['metadata'].get('labels', {}).get('deploymentconfig')
+ }
+ if not pods_by_name:
+ return 'No logging Elasticsearch pods were found. Is logging deployed?'
+ error_msgs += self._check_elasticsearch_masters(pods_by_name)
+ error_msgs += self._check_elasticsearch_node_list(pods_by_name)
+ error_msgs += self._check_es_cluster_health(pods_by_name)
+ error_msgs += self._check_elasticsearch_diskspace(pods_by_name)
+ return '\n'.join(error_msgs)
+
+ @staticmethod
+ def _build_es_curl_cmd(pod_name, url):
+ base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+ return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+ def _check_elasticsearch_masters(self, pods_by_name):
+ """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+ es_master_names = set()
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ # Compare what each ES node reports as master and compare for split brain
+ get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+ master_name_str = self._exec_oc(get_master_cmd, [])
+ master_names = (master_name_str or '').split(' ')
+ if len(master_names) > 1:
+ es_master_names.add(master_names[1])
+ else:
+ error_msgs.append(
+ 'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+ ' {response}'.format(pod=pod_name, response=master_name_str)
+ )
+
+ if not es_master_names:
+ error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
+ return '\n'.join(error_msgs)
+
+ if len(es_master_names) > 1:
+ error_msgs.append(
+ 'Found multiple Elasticsearch masters according to the pods:\n'
+ '{master_list}\n'
+ 'This implies that the masters have "split brain" and are not correctly\n'
+ 'replicating data for the logging cluster. Log loss is likely to occur.'
+ .format(master_list='\n'.join(' ' + master for master in es_master_names))
+ )
+
+ return error_msgs
+
+ def _check_elasticsearch_node_list(self, pods_by_name):
+ """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+
+ if not pods_by_name:
+ return ['No logging Elasticsearch masters were found. Is logging deployed?']
+
+ # get ES cluster nodes
+ node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+ cluster_node_data = self._exec_oc(node_cmd, [])
+ try:
+ cluster_nodes = json.loads(cluster_node_data)['nodes']
+ except (ValueError, KeyError):
+ return [
+ 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+ cluster_node_data
+ ]
+
+ # Try to match all ES-reported node hosts to known pods.
+ error_msgs = []
+ for node in cluster_nodes.values():
+ # Note that with 1.4/3.4 the pod IP may be used as the master name
+ if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+ for pod_name, pod in pods_by_name.items()):
+ error_msgs.append(
+ 'The Elasticsearch cluster reports a member node "{node}"\n'
+ 'that does not correspond to any known ES pod.'.format(node=node['host'])
+ )
+
+ return error_msgs
+
+ def _check_es_cluster_health(self, pods_by_name):
+ """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+ cluster_health_data = self._exec_oc(cluster_health_cmd, [])
+ try:
+ health_res = json.loads(cluster_health_data)
+ if not health_res or not health_res.get('status'):
+ raise ValueError()
+ except ValueError:
+ error_msgs.append(
+ 'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+ 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+ )
+ continue
+
+ if health_res['status'] not in ['green', 'yellow']:
+ error_msgs.append(
+ 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+ )
+
+ return error_msgs
+
+ def _check_elasticsearch_diskspace(self, pods_by_name):
+ """
+ Exec into an ES pod and query the diskspace on the persistent volume.
+ Returns: list of errors
+ """
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+ disk_output = self._exec_oc(df_cmd, [])
+ lines = disk_output.splitlines()
+ # expecting one header looking like 'IUse% Use%' and one body line
+ body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+ if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+ error_msgs.append(
+ 'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+ 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+ )
+ continue
+ inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+ inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
+ if int(inode_pct) >= int(inode_pct_thresh):
+ error_msgs.append(
+ 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(inode_pct),
+ limit=str(inode_pct_thresh),
+ param='openshift_check_efk_es_inode_pct',
+ ))
+ disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
+ if int(disk_pct) >= int(disk_pct_thresh):
+ error_msgs.append(
+ 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(disk_pct),
+ limit=str(disk_pct_thresh),
+ param='openshift_check_efk_es_storage_pct',
+ ))
+
+ return error_msgs
+
+ def _exec_oc(self, cmd_str, extra_args):
+ return super(Elasticsearch, self).exec_oc(
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..063e707a9
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,167 @@
+"""Check for an aggregated logging Fluentd deployment"""
+
+import json
+
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+ """Check for an aggregated logging Fluentd deployment"""
+
+ name = "fluentd"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
+ self.logging_namespace,
+ "fluentd",
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_fluentd(fluentd_pods)
+
+ if check_error:
+ msg = ("The following Fluentd deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+
+ @staticmethod
+ def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+ label, value = node_selector.split('=', 1)
+ fluentd_nodes = {
+ name: node for name, node in nodes_by_name.items()
+ if node['metadata']['labels'].get(label) == value
+ }
+ if not fluentd_nodes:
+ return None, (
+ 'There are no nodes with the fluentd label {label}.\n'
+ 'This means no logs will be aggregated from the nodes.'
+ ).format(label=node_selector)
+ return fluentd_nodes, None
+
+ def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+ """Note if nodes are not labeled as expected. Returns: error string"""
+ intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
+ if not intended_nodes or '--all' in intended_nodes:
+ intended_nodes = nodes_by_name.keys()
+ nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+ if nodes_missing_labels:
+ return (
+ 'The following nodes are supposed to be labeled with {label} but are not:\n'
+ ' {nodes}\n'
+ 'Fluentd will not aggregate logs from these nodes.'
+ ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
+ return None
+
+ @staticmethod
+ def _check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+ unmatched_nodes = fluentd_nodes.copy()
+ node_names_by_label = {
+ node['metadata']['labels']['kubernetes.io/hostname']: name
+ for name, node in fluentd_nodes.items()
+ }
+ node_names_by_internal_ip = {
+ address['address']: name
+ for name, node in fluentd_nodes.items()
+ for address in node['status']['addresses']
+ if address['type'] == "InternalIP"
+ }
+ for pod in pods:
+ for name in [
+ pod['spec']['nodeName'],
+ node_names_by_internal_ip.get(pod['spec']['nodeName']),
+ node_names_by_label.get(pod.get('spec', {}).get('host')),
+ ]:
+ unmatched_nodes.pop(name, None)
+ if unmatched_nodes:
+ return (
+ 'The following nodes are supposed to have a Fluentd pod but do not:\n'
+ '{nodes}'
+ 'These nodes will not have their logs aggregated.'
+ ).format(nodes=''.join(
+ " {}\n".format(name)
+ for name in unmatched_nodes.keys()
+ ))
+ return None
+
+ def _check_fluentd_pods_running(self, pods):
+ """Make sure all fluentd pods are running. Returns: error string"""
+ not_running = super(Fluentd, self).not_running_pods(pods)
+ if not_running:
+ return (
+ 'The following Fluentd pods are supposed to be running but are not:\n'
+ '{pods}'
+ 'These pods will not aggregate logs from their nodes.'
+ ).format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ return None
+
+ def check_fluentd(self, pods):
+ """Verify fluentd is running everywhere. Returns: error string"""
+
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
+ )
+
+ nodes_by_name, error = self.get_nodes_by_name()
+
+ if error:
+ return error
+ fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+ if error:
+ return error
+
+ error_msgs = []
+ error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+ if error:
+ error_msgs.append(error)
+ error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
+ if error:
+ error_msgs.append(error)
+ error = self._check_fluentd_pods_running(pods)
+ if error:
+ error_msgs.append(error)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ error_msgs.append(
+ 'There are more Fluentd pods running than nodes labeled.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+ )
+
+ return '\n'.join(error_msgs)
+
+ def get_nodes_by_name(self):
+ """Retrieve all the node definitions. Returns: dict(name: node), error string"""
+ nodes_json = self._exec_oc("get nodes -o json", [])
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
+ if not nodes or not nodes.get('items'): # also should not happen
+ return None, "No nodes appear to be defined according to the API."
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }, None
+
+ def _exec_oc(self, cmd_str, extra_args):
+ return super(Fluentd, self).exec_oc(
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..60f94e106
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,226 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+ from urllib2 import HTTPError, URLError
+ import urllib2
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Kibana(LoggingCheck):
+ """Module that checks an integrated logging Kibana deployment"""
+
+ name = "kibana"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ kibana_pods, error = super(Kibana, self).get_pods_for_component(
+ self.logging_namespace,
+ "kibana",
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_kibana(kibana_pods)
+
+ if not check_error:
+ check_error = self._check_kibana_route()
+
+ if check_error:
+ msg = ("The following Kibana deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+ def _verify_url_internal(self, url):
+ """
+ Try to reach a URL from the host.
+ Returns: success (bool), reason (for failure)
+ """
+ args = dict(
+ url=url,
+ follow_redirects='none',
+ validate_certs='no', # likely to be signed with internal CA
+ # TODO(lmeyer): give users option to validate certs
+ status_code=302,
+ )
+ result = self.execute_module('uri', args)
+ if result.get('failed'):
+ return result['msg']
+ return None
+
+ @staticmethod
+ def _verify_url_external(url):
+ """
+ Try to reach a URL from ansible control host.
+ Returns: success (bool), reason (for failure)
+ """
+ # This actually checks from the ansible control host, which may or may not
+ # really be "external" to the cluster.
+
+ # Disable SSL cert validation to work around internally signed certs
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False # or setting CERT_NONE is refused
+ ctx.verify_mode = ssl.CERT_NONE
+
+ # Verify that the url is returning a valid response
+ try:
+ # We only care if the url connects and responds
+ return_code = urllib2.urlopen(url, context=ctx).getcode()
+ except HTTPError as httperr:
+ return httperr.reason
+ except URLError as urlerr:
+ return str(urlerr)
+
+ # there appears to be no way to prevent urlopen from following redirects
+ if return_code != 200:
+ return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+ return None
+
+ def check_kibana(self, pods):
+ """Check to see if Kibana is up and working. Returns: error string."""
+
+ if not pods:
+ return "There are no Kibana pods deployed, so no access to the logging UI."
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ return "No Kibana pod is in a running state, so there is no access to the logging UI."
+ elif not_running:
+ return (
+ "The following Kibana pods are not currently in a running state:\n"
+ "{pods}"
+ "However at least one is, so service may not be impacted."
+ ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running))
+
+ return None
+
+ def _get_kibana_url(self):
+ """
+ Get kibana route or report error.
+ Returns: url (or empty), reason for failure
+ """
+
+ # Get logging url
+ get_route = self._exec_oc("get route logging-kibana -o json", [])
+ if not get_route:
+ return None, 'no_route_exists'
+
+ route = json.loads(get_route)
+
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ # ingress can be null if there is no router, or empty if not routed
+ if not ingress or not ingress[0]:
+ return None, 'route_not_accepted'
+
+ host = route.get("spec", {}).get("host")
+ if not host:
+ return None, 'route_missing_host'
+
+ return 'https://{}/'.format(host), None
+
+ def _check_kibana_route(self):
+ """
+ Check to see if kibana route is up and working.
+ Returns: error string
+ """
+ known_errors = dict(
+ no_route_exists=(
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ ),
+ route_not_accepted=(
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ ),
+ route_missing_host=(
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ ),
+ )
+
+ kibana_url, error = self._get_kibana_url()
+ if not kibana_url:
+ return known_errors.get(error, error)
+
+ # first, check that kibana is reachable from the master.
+ error = self._verify_url_internal(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ error = (
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'Is kibana running, and is at least one router routing to it?'
+ ).format(url=kibana_url)
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ error = (
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'because the hostname does not resolve.\n'
+ 'Is DNS configured for the Kibana hostname?'
+ ).format(url=kibana_url)
+ elif 'Status code was not' in error:
+ error = (
+ 'A request from this master to the Kibana URL {url}\n'
+ 'did not return the correct status code (302).\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues. The output was:\n'
+ ' {error}'
+ ).format(url=kibana_url, error=error)
+ return 'Error validating the logging Kibana route:\n' + error
+
+ # in production we would like the kibana route to work from outside the
+ # cluster too; but that may not be the case, so allow disabling just this part.
+ if not self.get_var("openshift_check_efk_kibana_external", default=True):
+ return None
+ error = self._verify_url_external(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ error = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ error = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ elif 'Expected success (200)' in error:
+ error = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ error = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set in your inventory:\n'
+ ' openshift_check_efk_kibana_external=False'
+ ).format(error=error)
+ return error
+ return None
+
+ def _exec_oc(self, cmd_str, extra_args):
+ return super(Kibana, self).exec_oc(
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..a48e1c728
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,92 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class LoggingCheck(OpenShiftCheck):
+ """Base class for OpenShift aggregated logging component checks"""
+
+ name = "logging"
+ logging_namespace = "logging"
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+ return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
+
+ def is_first_master(self):
+ """Determine if running on first master. Returns: bool"""
+ # Note: It would be nice to use membership in oo_first_master group, however for now it
+ # seems best to avoid requiring that setup and just check this is the first master.
+ hostname = self.get_var("ansible_ssh_host") or [None]
+ masters = self.get_var("groups", "masters", default=None) or [None]
+ return masters[0] == hostname
+
+ def run(self):
+ pass
+
+ def get_pods_for_component(self, namespace, logging_component):
+ """Get all pods for a given component. Returns: list of pods for component, error string"""
+ pod_output = self.exec_oc(
+ namespace,
+ "get pods -l component={} -o json".format(logging_component),
+ [],
+ )
+ try:
+ pods = json.loads(pod_output)
+ if not pods or not pods.get('items'):
+ raise ValueError()
+ except ValueError:
+ # successful run but non-parsing data generally means there were no pods in the namespace
+ return None, 'No pods were found for the "{}" logging component.'.format(logging_component)
+
+ return pods['items'], None
+
+ @staticmethod
+ def not_running_pods(pods):
+ """Returns: list of pods not in a ready and running state"""
+ return [
+ pod for pod in pods
+ if not pod.get("status", {}).get("containerStatuses") or any(
+ container['ready'] is False
+ for container in pod['status']['containerStatuses']
+ ) or not any(
+ condition['type'] == 'Ready' and condition['status'] == 'True'
+ for condition in pod['status'].get('conditions', [])
+ )
+ ]
+
+ def exec_oc(self, namespace="logging", cmd_str="", extra_args=None):
+ """
+ Execute an 'oc' command in the remote host.
+ Returns: output of command and namespace,
+ or raises OpenShiftCheckException on error
+ """
+ config_base = self.get_var("openshift", "common", "config_base")
+ args = {
+ "namespace": namespace,
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": cmd_str,
+ "extra_args": list(extra_args) if extra_args else [],
+ }
+
+ result = self.execute_module("ocutil", args)
+ if result.get("failed"):
+ msg = (
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'
+ ).format(cmd=args['cmd'], error=result['result'])
+
+ if result['result'] == '[Errno 2] No such file or directory':
+ msg = (
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+ raise OpenShiftCheckException(msg)
+
+ return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..b24e88e05
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,130 @@
+"""
+Check for ensuring logs from pods can be queried in a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+ """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ logging_namespace = "logging"
+
+ def run(self):
+ """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
+ try:
+ log_index_timeout = int(
+ self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'),
+ }
+
+ running_component_pods = dict()
+
+ # get all component pods
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods, error = self.get_pods_for_component(self.logging_namespace, component)
+
+ if error:
+ msg = 'Unable to retrieve pods for the {} logging component: {}'
+ return {"failed": True, "changed": False, "msg": msg.format(name, error)}
+
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+ msg = ('No {} pods in the "Running" state were found.'
+ 'At least one pod is required in order to perform this check.')
+ return {"failed": True, "changed": False, "msg": msg.format(name)}
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+ """Retry an Elasticsearch query every second until query success, or a defined
+ length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid):
+ if time.time() + interval > deadline:
+ msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s."
+ raise OpenShiftCheckException(msg.format(uuid, timeout_secs))
+ time.sleep(interval)
+
+ def curl_kibana_with_uuid(self, kibana_pod):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(self.logging_namespace, exec_cmd, [])
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except KeyError:
+ msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n'
+ 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
+ raise OpenShiftCheckException(msg)
+ except ValueError:
+ msg = ('invalid response returned from Kibana request (Non-JSON output):\n'
+ 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
+ raise OpenShiftCheckException(msg)
+
+ if error_code != 404:
+ msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.'
+ raise OpenShiftCheckException(msg.format(error_code))
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid)
+ result = self.exec_oc(self.logging_namespace, exec_cmd, [])
+
+ try:
+ count = json.loads(result)["count"]
+ except KeyError:
+ msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}'
+ raise OpenShiftCheckException(msg.format(exec_cmd, result))
+ except ValueError:
+ msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}'
+ raise OpenShiftCheckException(msg.format(exec_cmd, result))
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
new file mode 100644
index 000000000..765ba072d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -0,0 +1,50 @@
+"""Check that recommended memory is available."""
+from openshift_checks import OpenShiftCheck
+
+MIB = 2**20
+GIB = 2**30
+
+
+class MemoryAvailability(OpenShiftCheck):
+ """Check that recommended memory is available."""
+
+ name = "memory_availability"
+ tags = ["preflight"]
+
+ # Values taken from the official installation documentation:
+ # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
+ recommended_memory_bytes = {
+ "masters": 16 * GIB,
+ "nodes": 8 * GIB,
+ "etcd": 8 * GIB,
+ }
+ # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
+ memtotal_adjustment = 1 * GIB
+
+ def is_active(self):
+ """Skip hosts that do not have recommended memory requirements."""
+ group_names = self.get_var("group_names", default=[])
+ has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes))
+ return super(MemoryAvailability, self).is_active() and has_memory_recommendation
+
+ def run(self):
+ group_names = self.get_var("group_names")
+ total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB
+
+ recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB
+ min_memory_bytes = configured_min or recommended_min
+
+ if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
+ return {
+ 'failed': True,
+ 'msg': (
+ 'Available memory ({available:.1f} GiB) is too far '
+ 'below recommended value ({recommended:.1f} GiB)'
+ ).format(
+ available=float(total_memory_bytes) / GIB,
+ recommended=float(min_memory_bytes) / GIB,
+ ),
+ }
+
+ return {}
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 657e15160..3b2c64e6a 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -1,18 +1,53 @@
-# pylint: disable=missing-docstring
-from openshift_checks import get_var
+"""
+Mixin classes meant to be used with subclasses of OpenShiftCheck.
+"""
class NotContainerizedMixin(object):
"""Mixin for checks that are only active when not in containerized mode."""
+ # permanent # pylint: disable=too-few-public-methods
+ # Reason: The mixin is not intended to stand on its own as a class.
- @classmethod
- def is_active(cls, task_vars):
- return (
- # This mixin is meant to be used with subclasses of OpenShiftCheck.
- super(NotContainerizedMixin, cls).is_active(task_vars) and
- not cls.is_containerized(task_vars)
- )
+ def is_active(self):
+ """Only run on non-containerized hosts."""
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ return super(NotContainerizedMixin, self).is_active() and not is_containerized
+
+
+class DockerHostMixin(object):
+ """Mixin for checks that are only active on hosts that require Docker."""
+
+ dependencies = []
- @staticmethod
- def is_containerized(task_vars):
- return get_var(task_vars, "openshift", "common", "is_containerized")
+ def is_active(self):
+ """Only run on hosts that depend on Docker."""
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ is_node = "nodes" in self.get_var("group_names", default=[])
+ return super(DockerHostMixin, self).is_active() and (is_containerized or is_node)
+
+ def ensure_dependencies(self):
+ """
+ Ensure that docker-related packages exist, but not on atomic hosts
+ (which would not be able to install but should already have them).
+ Returns: msg, failed, changed
+ """
+ if self.get_var("openshift", "common", "is_atomic"):
+ return "", False, False
+
+ # NOTE: we would use the "package" module but it's actually an action plugin
+ # and it's not clear how to invoke one of those. This is about the same anyway:
+ result = self.execute_module(
+ self.get_var("ansible_pkg_mgr", default="yum"),
+ {"name": self.dependencies, "state": "present"},
+ )
+ msg = result.get("msg", "")
+ if result.get("failed"):
+ if "No package matching" in msg:
+ msg = "Ensure that all required dependencies can be installed via `yum`.\n"
+ msg = (
+ "Unable to install required packages on this host:\n"
+ " {deps}\n{msg}"
+ ).format(deps=',\n '.join(self.dependencies), msg=msg)
+ failed = result.get("failed", False) or result.get("rc", 0) != 0
+ changed = result.get("changed", False)
+ return msg, failed, changed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
new file mode 100644
index 000000000..cd6ebd493
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -0,0 +1,77 @@
+"""
+Ansible module for determining if an installed version of Open vSwitch is incompatible with the
+currently installed version of OpenShift.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import NotContainerizedMixin
+
+
+class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
+ """Check that packages in a package_list are installed on the host
+ and are the correct version as determined by an OpenShift installation.
+ """
+
+ name = "ovs_version"
+ tags = ["health"]
+
+ openshift_to_ovs_version = {
+ "3.6": "2.6",
+ "3.5": "2.6",
+ "3.4": "2.4",
+ }
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ def is_active(self):
+ """Skip hosts that do not have package requirements."""
+ group_names = self.get_var("group_names", default=[])
+ master_or_node = 'masters' in group_names or 'nodes' in group_names
+ return super(OvsVersion, self).is_active() and master_or_node
+
+ def run(self):
+ args = {
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(),
+ },
+ ],
+ }
+ return self.execute_module("rpm_version", args)
+
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version for the current OpenShift version"""
+ openshift_version = self._get_openshift_version()
+
+ if float(openshift_version) < 3.5:
+ return self.openshift_to_ovs_version["3.4"]
+
+ ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+ if ovs_version:
+ return self.openshift_to_ovs_version[str(openshift_version)]
+
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
+
+ def _get_openshift_version(self):
+ openshift_version = self.get_var("openshift_image_tag")
+ if openshift_version and openshift_version[0] == 'v':
+ openshift_version = openshift_version[1:]
+
+ return self._parse_version(openshift_version)
+
+ def _parse_version(self, version):
+ components = version.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(version))
+
+ if components[0] in self.openshift_major_release_version:
+ components[0] = self.openshift_major_release_version[components[0]]
+
+ return '.'.join(components[:2])
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index 9891972a6..a86180b00 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -1,5 +1,6 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that required RPM packages are available."""
+
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,9 +10,13 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
name = "package_availability"
tags = ["preflight"]
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- group_names = get_var(task_vars, "group_names", default=[])
+ def is_active(self):
+ """Run only when yum is the package manager as the code is specific to it."""
+ return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum"
+
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ group_names = self.get_var("group_names", default=[])
packages = set()
@@ -21,10 +26,11 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module("check_yum_update", args)
@staticmethod
def master_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a master install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix),
@@ -32,8 +38,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
"bash-completion",
"cockpit-bridge",
"cockpit-docker",
- "cockpit-kubernetes",
- "cockpit-shell",
+ "cockpit-system",
"cockpit-ws",
"etcd",
"httpd-tools",
@@ -41,6 +46,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
@staticmethod
def node_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a node install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-node".format(rpm_prefix=rpm_prefix),
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index fd0c0a755..1e9aecbe0 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -1,14 +1,14 @@
-# pylint: disable=missing-docstring
+"""Check that a yum update would not run into conflicts with available packages."""
from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
- """Check that there are no conflicts in RPM packages."""
+ """Check that a yum update would not run into conflicts with available packages."""
name = "package_update"
tags = ["preflight"]
- def run(self, tmp, task_vars):
+ def run(self):
args = {"packages": []}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module("check_yum_update", args)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 42193a1c6..020786804 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,5 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that available RPM packages match the required versions."""
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,12 +9,118 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
name = "package_version"
tags = ["preflight"]
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- openshift_release = get_var(task_vars, "openshift_release")
+ openshift_to_ovs_version = {
+ "3.6": ["2.6", "2.7"],
+ "3.5": ["2.6", "2.7"],
+ "3.4": "2.4",
+ }
+
+ openshift_to_docker_version = {
+ "3.1": "1.8",
+ "3.2": "1.10",
+ "3.3": "1.10",
+ "3.4": "1.12",
+ }
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ def is_active(self):
+ """Skip hosts that do not have package requirements."""
+ group_names = self.get_var("group_names", default=[])
+ master_or_node = 'masters' in group_names or 'nodes' in group_names
+ return super(PackageVersion, self).is_active() and master_or_node
+
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ openshift_release = self.get_var("openshift_release", default='')
+ deployment_type = self.get_var("openshift_deployment_type")
+ check_multi_minor_release = deployment_type in ['openshift-enterprise']
args = {
- "prefix": rpm_prefix,
- "version": openshift_release,
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(),
+ "check_multi": False,
+ },
+ {
+ "name": "docker",
+ "version": self.get_required_docker_version(),
+ "check_multi": False,
+ },
+ {
+ "name": "{}".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-master".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-node".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ ],
}
- return self.execute_module("aos_version", args, tmp, task_vars)
+
+ return self.execute_module("aos_version", args)
+
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version for the current OpenShift version.
+ If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6,
+ Else ensure Open vSwitch version 2.4"""
+ openshift_version = self.get_openshift_version()
+
+ if float(openshift_version) < 3.5:
+ return self.openshift_to_ovs_version["3.4"]
+
+ ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+ if ovs_version:
+ return ovs_version
+
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
+
+ def get_required_docker_version(self):
+ """Return the correct Docker version for the current OpenShift version.
+ If the OpenShift version is 3.1, ensure Docker version 1.8.
+ If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10.
+ If the current OpenShift version is >= 3.4, ensure Docker version 1.12."""
+ openshift_version = self.get_openshift_version()
+
+ if float(openshift_version) >= 3.4:
+ return self.openshift_to_docker_version["3.4"]
+
+ docker_version = self.openshift_to_docker_version.get(str(openshift_version))
+ if docker_version:
+ return docker_version
+
+ msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
+
+ def get_openshift_version(self):
+ """Return received image tag as a normalized X.Y minor version string."""
+ openshift_version = self.get_var("openshift_image_tag")
+ if openshift_version and openshift_version[0] == 'v':
+ openshift_version = openshift_version[1:]
+
+ return self.parse_version(openshift_version)
+
+ def parse_version(self, version):
+ """Return a normalized X.Y minor version string."""
+ components = version.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(version))
+
+ if components[0] in self.openshift_major_release_version:
+ components[0] = self.openshift_major_release_version[components[0]]
+
+ return '.'.join(components[:2])