Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--  roles/openshift_health_checker/openshift_checks/__init__.py | 105
-rw-r--r--  roles/openshift_health_checker/openshift_checks/disk_availability.py | 53
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_image_availability.py | 72
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_storage.py | 189
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py | 29
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_traffic.py | 44
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_volume.py | 23
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/curator.py | 20
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py | 63
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd.py | 51
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py | 138
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/kibana.py | 41
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging.py | 47
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py | 130
-rw-r--r--  roles/openshift_health_checker/openshift_checks/memory_availability.py | 21
-rw-r--r--  roles/openshift_health_checker/openshift_checks/mixins.py | 25
-rw-r--r--  roles/openshift_health_checker/openshift_checks/ovs_version.py | 50
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_availability.py | 21
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_update.py | 8
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_version.py | 125
20 files changed, 845 insertions, 410 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 5c9949ced..85cbc6224 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -19,15 +19,21 @@ class OpenShiftCheckException(Exception):
@six.add_metaclass(ABCMeta)
class OpenShiftCheck(object):
- """A base class for defining checks for an OpenShift cluster environment."""
+ """
+ A base class for defining checks for an OpenShift cluster environment.
+
+ Expect optional params: method execute_module, dict task_vars, and string tmp.
+ execute_module is expected to have a signature compatible with _execute_module
+ from ansible plugins/action/__init__.py, e.g.:
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
+ This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
+ which provides the check's stored task_vars and tmp.
+ """
- def __init__(self, execute_module=None, module_executor=None):
- if execute_module is module_executor is None:
- raise TypeError(
- "__init__() takes either execute_module (recommended) "
- "or module_executor (deprecated), none given")
- self.execute_module = execute_module or module_executor
- self.module_executor = self.execute_module
+ def __init__(self, execute_module=None, task_vars=None, tmp=None):
+ self._execute_module = execute_module
+ self.task_vars = task_vars or {}
+ self.tmp = tmp
@abstractproperty
def name(self):
@@ -43,13 +49,13 @@ class OpenShiftCheck(object):
"""
return []
- @classmethod
- def is_active(cls, task_vars): # pylint: disable=unused-argument
+ @staticmethod
+ def is_active():
"""Returns true if this check applies to the ansible-playbook run."""
return True
@abstractmethod
- def run(self, tmp, task_vars):
+ def run(self):
"""Executes a check, normally implemented as a module."""
return {}
@@ -62,6 +68,66 @@ class OpenShiftCheck(object):
for subclass in subclass.subclasses():
yield subclass
+ def execute_module(self, module_name=None, module_args=None):
+ """Invoke an Ansible module from a check.
+
+ Invoke stored _execute_module, normally copied from the action
+ plugin, with its params and the task_vars and tmp given at
+ check initialization. No positional parameters beyond these
+ are specified. If it's necessary to specify any of the other
+ parameters to _execute_module then that should just be invoked
+ directly (with awareness of changes in method signature per
+ Ansible version).
+
+ So e.g. check.execute_module("foo", dict(arg1=...))
+ Return: result hash from module execution.
+ """
+ if self._execute_module is None:
+ raise NotImplementedError(
+ self.__class__.__name__ +
+ " invoked execute_module without providing the method at initialization."
+ )
+ return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+
+ def get_var(self, *keys, **kwargs):
+ """Get deeply nested values from task_vars.
+
+ Ansible task_vars structures are Python dicts, often mapping strings to
+ other dicts. This helper makes it easier to get a nested value, raising
+ OpenShiftCheckException when a key is not found or returning a default value
+ provided as a keyword argument.
+ """
+ try:
+ value = reduce(operator.getitem, keys, self.task_vars)
+ except (KeyError, TypeError):
+ if "default" in kwargs:
+ return kwargs["default"]
+ raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
+ return value
+
+ @staticmethod
+ def get_major_minor_version(openshift_image_tag):
+ """Parse and return the deployed version of OpenShift as a tuple."""
+ if openshift_image_tag and openshift_image_tag[0] == 'v':
+ openshift_image_tag = openshift_image_tag[1:]
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ components = openshift_image_tag.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+ if components[0] in openshift_major_release_version:
+ components[0] = openshift_major_release_version[components[0]]
+
+ components = tuple(int(x) for x in components[:2])
+ return components
+
LOADER_EXCLUDES = (
"__init__.py",
@@ -86,20 +152,3 @@ def load_checks(path=None, subpkg=""):
modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
return modules
-
-
-def get_var(task_vars, *keys, **kwargs):
- """Helper function to get deeply nested values from task_vars.
-
- Ansible task_vars structures are Python dicts, often mapping strings to
- other dicts. This helper makes it easier to get a nested value, raising
- OpenShiftCheckException when a key is not found or returning a default value
- provided as a keyword argument.
- """
- try:
- value = reduce(operator.getitem, keys, task_vars)
- except (KeyError, TypeError):
- if "default" in kwargs:
- return kwargs["default"]
- raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
- return value
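To make the new calling convention concrete, here is a minimal, hypothetical subclass written against the refactored base class (it is not part of this patch): execute_module, task_vars, and tmp are stored once at construction, so is_active() and run() take no arguments and rely on self.get_var() and self.execute_module().

    # Hypothetical check illustrating the refactored base class above.
    from openshift_checks import OpenShiftCheck

    class ExampleCheck(OpenShiftCheck):
        """Fail if a node host cannot run the Ansible setup module."""

        name = "example_check"
        tags = ["preflight"]

        def is_active(self):
            # task_vars was stored by __init__; no parameter needed anymore
            group_names = self.get_var("group_names", default=[])
            return super(ExampleCheck, self).is_active() and "nodes" in group_names

        def run(self):
            # execute_module supplies the stored tmp and task_vars itself
            result = self.execute_module("setup", {})
            if result.get("failed"):
                return {"failed": True, "msg": "Could not gather facts on this host."}
            return {}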
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index e93e81efa..39ac0e4ec 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -3,7 +3,7 @@
import os.path
import tempfile
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
class DiskAvailability(OpenShiftCheck):
@@ -35,22 +35,30 @@ class DiskAvailability(OpenShiftCheck):
},
}
- @classmethod
- def is_active(cls, task_vars):
+ # recommended disk space for each location under an upgrade context
+ recommended_disk_upgrade_bytes = {
+ '/var': {
+ 'masters': 10 * 10**9,
+ 'nodes': 5 * 10**9,
+ 'etcd': 5 * 10**9,
+ },
+ }
+
+ def is_active(self):
"""Skip hosts that do not have recommended disk space requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
+ group_names = self.get_var("group_names", default=[])
active_groups = set()
- for recommendation in cls.recommended_disk_space_bytes.values():
+ for recommendation in self.recommended_disk_space_bytes.values():
active_groups.update(recommendation.keys())
has_disk_space_recommendation = bool(active_groups.intersection(group_names))
- return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
+ return super(DiskAvailability, self).is_active() and has_disk_space_recommendation
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- ansible_mounts = get_var(task_vars, "ansible_mounts")
+ def run(self):
+ group_names = self.get_var("group_names")
+ ansible_mounts = self.get_var("ansible_mounts")
ansible_mounts = {mount['mount']: mount for mount in ansible_mounts}
- user_config = get_var(task_vars, "openshift_check_min_host_disk_gb", default={})
+ user_config = self.get_var("openshift_check_min_host_disk_gb", default={})
try:
# For backwards-compatibility, if openshift_check_min_host_disk_gb
# is a number, then it overrides the required config for '/var'.
@@ -81,9 +89,34 @@ class DiskAvailability(OpenShiftCheck):
config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
recommended_bytes = config_bytes or recommended_bytes
+ # if an "upgrade" context is set, update the minimum disk requirement
+ # as this signifies an in-place upgrade - the node might have the
+ # required total disk space, but some of that space may already be
+ # in use by the existing OpenShift deployment.
+ context = self.get_var("r_openshift_health_checker_playbook_context", default="")
+ if context == "upgrade":
+ recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {})
+ if recommended_upgrade_paths:
+ recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0)
+ for name in group_names)
+
if free_bytes < recommended_bytes:
free_gb = float(free_bytes) / 10**9
recommended_gb = float(recommended_bytes) / 10**9
+ msg = (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+
+ # warn if check failed under an "upgrade" context
+ # due to limits imposed by the user config
+ if config_bytes and context == "upgrade":
+ msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n'
+ 'due to an existing OpenShift deployment. Please check the value of\n'
+ ' openshift_check_min_host_disk_gb={}\n'
+ 'in your Ansible inventory, and lower the recommended disk space availability\n'
+ 'if necessary for this upgrade.').format(config_bytes)
+
return {
'failed': True,
'msg': (
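A worked example, with made-up numbers, of the threshold selection added above: under an "upgrade" context the smaller upgrade-specific recommendation replaces the install-time one, unless the user configured an explicit minimum via openshift_check_min_host_disk_gb.

    # Standalone arithmetic mirroring the hunk above; all values illustrative.
    recommended_disk_upgrade_bytes = {'/var': {'masters': 10 * 10**9, 'nodes': 5 * 10**9}}
    group_names = ['nodes']
    config_bytes = 0                # no user-configured minimum
    recommended_bytes = 40 * 10**9  # hypothetical install-time recommendation for '/var'
    context = "upgrade"

    if context == "upgrade":
        recommended_upgrade_paths = recommended_disk_upgrade_bytes.get('/var', {})
        if recommended_upgrade_paths:
            recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0)
                                                    for name in group_names)

    print(recommended_bytes)        # 5000000000 -> 5 GB now required instead of 40 GB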
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index bde81ad2c..77180223e 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,6 +1,6 @@
"""Check that required Docker images are available."""
-from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import DockerHostMixin
@@ -22,25 +22,26 @@ DEPLOYMENT_IMAGE_INFO = {
class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
- This check attempts to ensure that required docker images are
- either present locally, or able to be pulled down from available
- registries defined in a host machine.
+ Determine docker images that an install would require and check that they
+ are either present in the host's docker index, or available for the host to pull
+ with known registries as defined in our inventory file (or defaults).
"""
name = "docker_image_availability"
tags = ["preflight"]
- dependencies = ["skopeo", "python-docker-py"]
+ # we use python-docker-py to check local docker for images, and skopeo
+ # to look for images available remotely without waiting to pull them.
+ dependencies = ["python-docker-py", "skopeo"]
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts with unsupported deployment types."""
- deployment_type = get_var(task_vars, "openshift_deployment_type")
+ deployment_type = self.get_var("openshift_deployment_type")
has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO
- return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type
+ return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
- def run(self, tmp, task_vars):
- msg, failed, changed = self.ensure_dependencies(task_vars)
+ def run(self):
+ msg, failed, changed = self.ensure_dependencies()
if failed:
return {
"failed": True,
@@ -48,18 +49,18 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+ required_images = self.required_images()
+ missing_images = set(required_images) - set(self.local_images(required_images))
# exit early if all images were found locally
if not missing_images:
return {"changed": changed}
- registries = self.known_docker_registries(task_vars)
+ registries = self.known_docker_registries()
if not registries:
return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
- available_images = self.available_images(missing_images, registries, task_vars)
+ available_images = self.available_images(missing_images, registries)
unavailable_images = set(missing_images) - set(available_images)
if unavailable_images:
@@ -74,8 +75,7 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
return {"changed": changed}
- @staticmethod
- def required_images(task_vars):
+ def required_images(self):
"""
Determine which images we expect to need for this host.
Returns: a set of required images like 'openshift/origin:v3.6'
@@ -92,17 +92,17 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
Registry is not included in constructed images. It may be in oreg_url or etcd image.
"""
required = set()
- deployment_type = get_var(task_vars, "openshift_deployment_type")
- host_groups = get_var(task_vars, "group_names")
+ deployment_type = self.get_var("openshift_deployment_type")
+ host_groups = self.get_var("group_names")
# containerized etcd may not have openshift_image_tag, see bz 1466622
- image_tag = get_var(task_vars, "openshift_image_tag", default="latest")
+ image_tag = self.get_var("openshift_image_tag", default="latest")
image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
if not image_info:
return required
# template for images that run on top of OpenShift
image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
- image_url = get_var(task_vars, "oreg_url", default="") or image_url
+ image_url = self.get_var("oreg_url", default="") or image_url
if 'nodes' in host_groups:
for suffix in NODE_IMAGE_SUFFIXES:
required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
@@ -112,7 +112,7 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
required.add(image_info["registry_console_image"])
# images for containerized components
- if get_var(task_vars, "openshift", "common", "is_containerized"):
+ if self.get_var("openshift", "common", "is_containerized"):
components = set()
if 'nodes' in host_groups:
components.update(["node", "openvswitch"])
@@ -125,28 +125,27 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
return required
- def local_images(self, images, task_vars):
+ def local_images(self, images):
"""Filter a list of images and return those available locally."""
return [
image for image in images
- if self.is_image_local(image, task_vars)
+ if self.is_image_local(image)
]
- def is_image_local(self, image, task_vars):
+ def is_image_local(self, image):
"""Check if image is already in local docker index."""
- result = self.execute_module("docker_image_facts", {"name": image}, task_vars=task_vars)
+ result = self.execute_module("docker_image_facts", {"name": image})
if result.get("failed", False):
return False
return bool(result.get("images", []))
- @staticmethod
- def known_docker_registries(task_vars):
+ def known_docker_registries(self):
"""Build a list of docker registries available according to inventory vars."""
- docker_facts = get_var(task_vars, "openshift", "docker")
+ docker_facts = self.get_var("openshift", "docker")
regs = set(docker_facts["additional_registries"])
- deployment_type = get_var(task_vars, "openshift_deployment_type")
+ deployment_type = self.get_var("openshift_deployment_type")
if deployment_type == "origin":
regs.update(["docker.io"])
elif "enterprise" in deployment_type:
@@ -154,24 +153,25 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
return list(regs)
- def available_images(self, images, registries, task_vars):
- """Inspect existing images using Skopeo and return all images successfully inspected."""
+ def available_images(self, images, default_registries):
+ """Search remotely for images. Returns: list of images found."""
return [
image for image in images
- if self.is_available_skopeo_image(image, registries, task_vars)
+ if self.is_available_skopeo_image(image, default_registries)
]
- def is_available_skopeo_image(self, image, registries, task_vars):
+ def is_available_skopeo_image(self, image, default_registries):
"""Use Skopeo to determine if required image exists in known registry(s)."""
+ registries = default_registries
- # if image does already includes a registry, just use that
+ # if image already includes a registry, only use that
if image.count("/") > 1:
registry, image = image.split("/", 1)
registries = [registry]
for registry in registries:
args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)}
- result = self.execute_module("command", args, task_vars=task_vars)
+ result = self.execute_module("command", args)
if result.get("rc", 0) == 0 and not result.get("failed"):
return True
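The registry-splitting rule introduced in is_available_skopeo_image() can be exercised on its own; this sketch reproduces just that branch with stand-in image names.

    def registries_to_check(image, default_registries):
        # if image already includes a registry, only use that (as in the hunk above)
        if image.count("/") > 1:
            registry, image = image.split("/", 1)
            return image, [registry]
        return image, default_registries

    print(registries_to_check("openshift/origin-pod:v3.6", ["docker.io"]))
    # -> ('openshift/origin-pod:v3.6', ['docker.io'])
    print(registries_to_check("registry.example.com/openshift/origin-pod:v3.6", ["docker.io"]))
    # -> ('openshift/origin-pod:v3.6', ['registry.example.com'])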
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index 8d0fbcc9c..dea15a56e 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,7 +1,8 @@
"""Check Docker storage driver and usage."""
import json
+import os.path
import re
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import DockerHostMixin
@@ -17,15 +18,32 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
tags = ["pre-install", "health", "preflight"]
dependencies = ["python-docker-py"]
- storage_drivers = ["devicemapper", "overlay2"]
+ storage_drivers = ["devicemapper", "overlay", "overlay2"]
max_thinpool_data_usage_percent = 90.0
max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
- # pylint: disable=too-many-return-statements
- # Reason: permanent stylistic exception;
- # it is clearer to return on failures and there are just many ways to fail here.
- def run(self, tmp, task_vars):
- msg, failed, changed = self.ensure_dependencies(task_vars)
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
+
+ def run(self):
+ msg, failed, changed = self.ensure_dependencies()
if failed:
return {
"failed": True,
@@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
}
# attempt to get the docker info hash from the API
- info = self.execute_module("docker_info", {}, task_vars=task_vars)
- if info.get("failed"):
+ docker_info = self.execute_module("docker_info", {})
+ if docker_info.get("failed"):
return {"failed": True, "changed": changed,
"msg": "Failed to query Docker API. Is docker running on this host?"}
- if not info.get("info"): # this would be very strange
+ if not docker_info.get("info"): # this would be very strange
return {"failed": True, "changed": changed,
- "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
- info = info["info"]
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
# check if the storage driver we saw is valid
- driver = info.get("Driver", "[NONE]")
+ driver = docker_info.get("Driver", "[NONE]")
if driver not in self.storage_drivers:
msg = (
"Detected unsupported Docker storage driver '{driver}'.\n"
@@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return {"failed": True, "changed": changed, "msg": msg}
# driver status info is a list of tuples; convert to dict and validate based on driver
- driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
if driver == "devicemapper":
- if driver_status.get("Data loop file"):
- msg = (
- "Use of loopback devices with the Docker devicemapper storage driver\n"
- "(the default storage configuration) is unsupported in production.\n"
- "Please use docker-storage-setup to configure a backing storage volume.\n"
- "See http://red.ht/2rNperO for further information."
- )
- return {"failed": True, "changed": changed, "msg": msg}
- result = self._check_dm_usage(driver_status, task_vars)
- result['changed'] = result.get('changed', False) or changed
- return result
+ result = self.check_devicemapper_support(driver_status)
- # TODO(lmeyer): determine how to check usage for overlay2
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status)
- return {"changed": changed}
+ result['changed'] = result.get('changed', False) or changed
+ return result
- def _check_dm_usage(self, driver_status, task_vars):
- """
+ def check_devicemapper_support(self, driver_status):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status)
+ return result
+
+ def check_dm_usage(self, driver_status):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
devicemapper storage. The LV is "thin" because it does not use all available storage
@@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
could run out of space first; so we check both.
"""
vals = dict(
- vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+ vg_free=self.get_vg_free(driver_status.get("Pool Name")),
data_used=driver_status.get("Data Space Used"),
data_total=driver_status.get("Data Space Total"),
metadata_used=driver_status.get("Metadata Space Used"),
@@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
# convert all human-readable strings to bytes
for key, value in vals.copy().items():
try:
- vals[key + "_bytes"] = self._convert_to_bytes(value)
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
except ValueError as err: # unlikely to hit this from API info, but just to be safe
return {
"failed": True,
@@ -104,7 +130,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
# determine the threshold percentages which usage should not exceed
for name, default in [("data", self.max_thinpool_data_usage_percent),
("metadata", self.max_thinpool_meta_usage_percent)]:
- percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
+ percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default)
try:
vals[name + "_threshold"] = float(percent)
except ValueError:
@@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
return vals
- def _get_vg_free(self, pool, task_vars):
- # Determine which VG to examine according to the pool name, the only indicator currently
- # available from the Docker API driver info. We assume a name that looks like
- # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+ def get_vg_free(self, pool):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
if not match: # unlikely, but... be clear if we assumed wrong
raise OpenShiftCheckException(
@@ -146,7 +174,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
# should return free space like " 12.00g" if the VG exists; empty if it does not
- ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars=task_vars)
+ ret = self.execute_module("command", {"_raw_params": vgs_cmd})
if ret.get("failed") or ret.get("rc", 0) != 0:
raise OpenShiftCheckException(
"Is LVM installed? Failed to run /sbin/vgs "
@@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return size
@staticmethod
- def _convert_to_bytes(string):
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
units = dict(
b=1,
k=1024,
@@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
raise ValueError("Cannot convert to a byte size: " + string)
return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info)
+
+ def check_overlay_usage(self, docker_info):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path, self.get_var("ansible_mounts"))
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
+
+ # TODO(lmeyer): migrate to base class
+ @staticmethod
+ def find_ansible_mount(path, ansible_mounts):
+ """Return the mount point for path from ansible_mounts."""
+
+ mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
+ mount_point = path
+ while mount_point not in mount_for_path:
+ if mount_point in ["/", ""]: # "/" not in ansible_mounts???
+ break
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ return mount_for_path[mount_point]
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
+ msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
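The new find_ansible_mount() helper walks the path upward until it reaches a known mount point; a quick standalone run (error path omitted) with fabricated ansible_mounts facts:

    import os.path

    def find_ansible_mount(path, ansible_mounts):
        # same walk as DockerStorage.find_ansible_mount above
        mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
        mount_point = path
        while mount_point not in mount_for_path:
            if mount_point in ["/", ""]:
                break
            mount_point = os.path.dirname(mount_point)
        return mount_for_path.get(mount_point)

    mounts = [{"mount": "/"}, {"mount": "/var"}]
    print(find_ansible_mount("/var/lib/docker/overlay2", mounts))  # {'mount': '/var'}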
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
index c04a69765..28c38504d 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -2,7 +2,7 @@
Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
"""
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
class EtcdImageDataSize(OpenShiftCheck):
@@ -11,24 +11,25 @@ class EtcdImageDataSize(OpenShiftCheck):
name = "etcd_imagedata_size"
tags = ["etcd"]
- def run(self, tmp, task_vars):
- etcd_mountpath = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts"))
+ def run(self):
+ etcd_mountpath = self._get_etcd_mountpath(self.get_var("ansible_mounts"))
etcd_avail_diskspace = etcd_mountpath["size_available"]
etcd_total_diskspace = etcd_mountpath["size_total"]
- etcd_imagedata_size_limit = get_var(task_vars,
- "etcd_max_image_data_size_bytes",
- default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)))
+ etcd_imagedata_size_limit = self.get_var(
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace))
+ )
- etcd_is_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False)
- etcd_port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379)
- etcd_hosts = get_var(task_vars, "openshift", "master", "etcd_hosts")
+ etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = self.get_var("openshift", "master", "etcd_hosts")
- config_base = get_var(task_vars, "openshift", "common", "config_base")
+ config_base = self.get_var("openshift", "common", "config_base")
- cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt")
- key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key")
- ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt")
+ cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt")
+ key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key")
+ ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt")
for etcd_host in list(etcd_hosts):
args = {
@@ -46,7 +47,7 @@ class EtcdImageDataSize(OpenShiftCheck):
},
}
- etcdkeysize = self.module_executor("etcdkeysize", args, task_vars)
+ etcdkeysize = self.execute_module("etcdkeysize", args)
if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
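Most of the churn in this file is the move from the module-level get_var() to self.get_var(); the nested lookup itself is unchanged and can be sanity-checked standalone (a plain KeyError stands in for OpenShiftCheckException here):

    import operator
    from functools import reduce

    task_vars = {"openshift": {"master": {"etcd_port": 2379}}}

    def get_var(task_vars, *keys, **kwargs):
        try:
            return reduce(operator.getitem, keys, task_vars)
        except (KeyError, TypeError):
            if "default" in kwargs:
                return kwargs["default"]
            raise

    print(get_var(task_vars, "openshift", "master", "etcd_port"))                    # 2379
    print(get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False))  # False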
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..cc1b14d8a
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,44 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ def is_active(self):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = self.get_var("group_names", default=[])
+ valid_group_names = "etcd" in group_names
+
+ version = self.get_var("openshift", "common", "short_version")
+ valid_version = version in ("3.4", "3.5", "1.4", "1.5")
+
+ return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version
+
+ def run(self):
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {"log_matchers": log_matchers})
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
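The new check can be driven directly with a stubbed execute_module, showing how a journalctl match becomes a failed result; the stub and task_vars below are fabricated for illustration and assume the role's library directory is importable.

    from openshift_checks.etcd_traffic import EtcdTraffic

    def fake_execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
        # pretend search_journalctl matched the etcd sync-duration message
        return {"matched": True}

    task_vars = {
        "group_names": ["etcd"],
        "openshift": {"common": {"short_version": "3.5", "is_containerized": False}},
    }
    check = EtcdTraffic(execute_module=fake_execute_module, task_vars=task_vars)
    assert check.is_active()
    result = check.run()
    print(result["failed"])  # True, with the remediation text in result["msg"]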
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
index 7452c9cc1..da7d0364a 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -1,6 +1,6 @@
"""A health check for OpenShift clusters."""
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
class EtcdVolume(OpenShiftCheck):
@@ -14,21 +14,18 @@ class EtcdVolume(OpenShiftCheck):
# Where to find etcd data, higher priority first.
supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
- @classmethod
- def is_active(cls, task_vars):
- etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters",
- default=[]) or []
- is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts
- return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host
+ def is_active(self):
+ etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or []
+ is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts
+ return super(EtcdVolume, self).is_active() and is_etcd_host
- def run(self, tmp, task_vars):
- mount_info = self._etcd_mount_info(task_vars)
+ def run(self):
+ mount_info = self._etcd_mount_info()
available = mount_info["size_available"]
total = mount_info["size_total"]
used = total - available
- threshold = get_var(
- task_vars,
+ threshold = self.get_var(
"etcd_device_usage_threshold_percent",
default=self.default_threshold_percent
)
@@ -45,8 +42,8 @@ class EtcdVolume(OpenShiftCheck):
return {"changed": False}
- def _etcd_mount_info(self, task_vars):
- ansible_mounts = get_var(task_vars, "ansible_mounts")
+ def _etcd_mount_info(self):
+ ansible_mounts = self.get_var("ansible_mounts")
mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
for path in self.supported_mount_paths:
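For reference, the usage comparison this check performs (partly outside the quoted hunks) reduces to simple arithmetic; the numbers below are made up and the default threshold is defined elsewhere in the class.

    size_available = 2 * 10**9
    size_total = 10 * 10**9
    threshold = 90.0  # stand-in for etcd_device_usage_threshold_percent
    used_percent = 100.0 * (size_total - size_available) / size_total
    print(used_percent, used_percent > threshold)  # 80.0 False -> check passes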
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
index c9fc59896..32d853d57 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/curator.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -1,28 +1,21 @@
-"""
-Module for performing checks on an Curator logging deployment
-"""
+"""Check for an aggregated logging Curator deployment"""
-from openshift_checks import get_var
from openshift_checks.logging.logging import LoggingCheck
class Curator(LoggingCheck):
- """Module that checks an integrated logging Curator deployment"""
+ """Check for an aggregated logging Curator deployment"""
name = "curator"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- curator_pods, error = super(Curator, self).get_pods_for_component(
- self.module_executor,
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ curator_pods, error = self.get_pods_for_component(
self.logging_namespace,
"curator",
- task_vars
)
if error:
return {"failed": True, "changed": False, "msg": error}
@@ -30,7 +23,6 @@ class Curator(LoggingCheck):
if check_error:
msg = ("The following Curator deployment issue was found:"
- "\n-------\n"
"{}".format(check_error))
return {"failed": True, "changed": False, "msg": msg}
@@ -46,7 +38,7 @@ class Curator(LoggingCheck):
"Is Curator correctly deployed?"
)
- not_running = super(Curator, self).not_running_pods(pods)
+ not_running = self.not_running_pods(pods)
if len(not_running) == len(pods):
return (
"The Curator pod is not currently in a running state,\n"
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 01cb35b81..8bdda1f32 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -1,39 +1,31 @@
-"""
-Module for performing checks on an Elasticsearch logging deployment
-"""
+"""Check for an aggregated logging Elasticsearch deployment"""
import json
import re
-from openshift_checks import get_var
from openshift_checks.logging.logging import LoggingCheck
class Elasticsearch(LoggingCheck):
- """Module that checks an integrated logging Elasticsearch deployment"""
+ """Check for an aggregated logging Elasticsearch deployment"""
name = "elasticsearch"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- es_pods, error = super(Elasticsearch, self).get_pods_for_component(
- self.execute_module,
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ es_pods, error = self.get_pods_for_component(
self.logging_namespace,
"es",
- task_vars,
)
if error:
return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_elasticsearch(es_pods, task_vars)
+ check_error = self.check_elasticsearch(es_pods)
if check_error:
msg = ("The following Elasticsearch deployment issue was found:"
- "\n-------\n"
"{}".format(check_error))
return {"failed": True, "changed": False, "msg": msg}
@@ -41,8 +33,8 @@ class Elasticsearch(LoggingCheck):
return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
def _not_running_elasticsearch_pods(self, es_pods):
- """Returns: list of running pods, list of errors about non-running pods"""
- not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+ """Returns: list of pods that are not running, list of errors about non-running pods"""
+ not_running = self.not_running_pods(es_pods)
if not_running:
return not_running, [(
'The following Elasticsearch pods are not running:\n'
@@ -54,7 +46,7 @@ class Elasticsearch(LoggingCheck):
))]
return not_running, []
- def check_elasticsearch(self, es_pods, task_vars):
+ def check_elasticsearch(self, es_pods):
"""Various checks for elasticsearch. Returns: error string"""
not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
running_pods = [pod for pod in es_pods if pod not in not_running_pods]
@@ -65,10 +57,10 @@ class Elasticsearch(LoggingCheck):
}
if not pods_by_name:
return 'No logging Elasticsearch pods were found. Is logging deployed?'
- error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
- error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
- error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
- error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
+ error_msgs += self._check_elasticsearch_masters(pods_by_name)
+ error_msgs += self._check_elasticsearch_node_list(pods_by_name)
+ error_msgs += self._check_es_cluster_health(pods_by_name)
+ error_msgs += self._check_elasticsearch_diskspace(pods_by_name)
return '\n'.join(error_msgs)
@staticmethod
@@ -76,14 +68,14 @@ class Elasticsearch(LoggingCheck):
base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
- def _check_elasticsearch_masters(self, pods_by_name, task_vars):
+ def _check_elasticsearch_masters(self, pods_by_name):
"""Check that Elasticsearch masters are sane. Returns: list of error strings"""
es_master_names = set()
error_msgs = []
for pod_name in pods_by_name.keys():
# Compare what each ES node reports as master and compare for split brain
get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
- master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+ master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
master_names = (master_name_str or '').split(' ')
if len(master_names) > 1:
es_master_names.add(master_names[1])
@@ -108,7 +100,7 @@ class Elasticsearch(LoggingCheck):
return error_msgs
- def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
+ def _check_elasticsearch_node_list(self, pods_by_name):
"""Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
if not pods_by_name:
@@ -116,7 +108,7 @@ class Elasticsearch(LoggingCheck):
# get ES cluster nodes
node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
- cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+ cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
try:
cluster_nodes = json.loads(cluster_node_data)['nodes']
except (ValueError, KeyError):
@@ -138,12 +130,12 @@ class Elasticsearch(LoggingCheck):
return error_msgs
- def _check_es_cluster_health(self, pods_by_name, task_vars):
+ def _check_es_cluster_health(self, pods_by_name):
"""Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
error_msgs = []
for pod_name in pods_by_name.keys():
cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
- cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+ cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
try:
health_res = json.loads(cluster_health_data)
if not health_res or not health_res.get('status'):
@@ -162,7 +154,7 @@ class Elasticsearch(LoggingCheck):
return error_msgs
- def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+ def _check_elasticsearch_diskspace(self, pods_by_name):
"""
Exec into an ES pod and query the diskspace on the persistent volume.
Returns: list of errors
@@ -170,7 +162,7 @@ class Elasticsearch(LoggingCheck):
error_msgs = []
for pod_name in pods_by_name.keys():
df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
- disk_output = self._exec_oc(df_cmd, [], task_vars)
+ disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
lines = disk_output.splitlines()
# expecting one header looking like 'IUse% Use%' and one body line
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
@@ -182,7 +174,7 @@ class Elasticsearch(LoggingCheck):
continue
inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
- inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+ inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
if int(inode_pct) >= int(inode_pct_thresh):
error_msgs.append(
'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
@@ -193,7 +185,7 @@ class Elasticsearch(LoggingCheck):
limit=str(inode_pct_thresh),
param='openshift_check_efk_es_inode_pct',
))
- disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+ disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
if int(disk_pct) >= int(disk_pct_thresh):
error_msgs.append(
'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
@@ -206,12 +198,3 @@ class Elasticsearch(LoggingCheck):
))
return error_msgs
-
- def _exec_oc(self, cmd_str, extra_args, task_vars):
- return super(Elasticsearch, self).exec_oc(
- self.execute_module,
- self.logging_namespace,
- cmd_str,
- extra_args,
- task_vars,
- )
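The df-output parsing in _check_elasticsearch_diskspace() hinges on a single regex applied to the second line of output; it can be verified against a fabricated sample:

    import re

    body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'    # as in the hunk above
    disk_output = "IUse% Use%\n   4%   71%"  # fabricated 'df --output=ipcent,pcent' output
    lines = disk_output.splitlines()
    inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
    print(inode_pct, disk_pct)  # 4 71 -> compared against the inode/storage thresholds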
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 627567293..b3485bf44 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -1,37 +1,30 @@
-"""
-Module for performing checks on an Fluentd logging deployment
-"""
+"""Check for an aggregated logging Fluentd deployment"""
import json
-from openshift_checks import get_var
from openshift_checks.logging.logging import LoggingCheck
class Fluentd(LoggingCheck):
- """Module that checks an integrated logging Fluentd deployment"""
+ """Check for an aggregated logging Fluentd deployment"""
+
name = "fluentd"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
- self.execute_module,
self.logging_namespace,
"fluentd",
- task_vars,
)
if error:
return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_fluentd(fluentd_pods, task_vars)
+ check_error = self.check_fluentd(fluentd_pods)
if check_error:
msg = ("The following Fluentd deployment issue was found:"
- "\n-------\n"
"{}".format(check_error))
return {"failed": True, "changed": False, "msg": msg}
@@ -53,10 +46,9 @@ class Fluentd(LoggingCheck):
).format(label=node_selector)
return fluentd_nodes, None
- @staticmethod
- def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
+ def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
"""Note if nodes are not labeled as expected. Returns: error string"""
- intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+ intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
if not intended_nodes or '--all' in intended_nodes:
intended_nodes = nodes_by_name.keys()
nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
@@ -114,13 +106,15 @@ class Fluentd(LoggingCheck):
))
return None
- def check_fluentd(self, pods, task_vars):
+ def check_fluentd(self, pods):
"""Verify fluentd is running everywhere. Returns: error string"""
- node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
- default='logging-infra-fluentd=true')
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
+ )
- nodes_by_name, error = self.get_nodes_by_name(task_vars)
+ nodes_by_name, error = self.get_nodes_by_name()
if error:
return error
@@ -129,7 +123,7 @@ class Fluentd(LoggingCheck):
return error
error_msgs = []
- error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
+ error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
if error:
error_msgs.append(error)
error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
@@ -148,9 +142,13 @@ class Fluentd(LoggingCheck):
return '\n'.join(error_msgs)
- def get_nodes_by_name(self, task_vars):
+ def get_nodes_by_name(self):
"""Retrieve all the node definitions. Returns: dict(name: node), error string"""
- nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
+ nodes_json = self.exec_oc(
+ self.logging_namespace,
+ "get nodes -o json",
+ []
+ )
try:
nodes = json.loads(nodes_json)
except ValueError: # no valid json - should not happen
@@ -161,10 +159,3 @@ class Fluentd(LoggingCheck):
node['metadata']['name']: node
for node in nodes['items']
}, None
-
- def _exec_oc(self, cmd_str, extra_args, task_vars):
- return super(Fluentd, self).exec_oc(self.execute_module,
- self.logging_namespace,
- cmd_str,
- extra_args,
- task_vars)
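The node-labeling test in _check_node_labeling() reduces to a set difference between the intended Fluentd hosts and the nodes actually carrying the label; stand-in names:

    nodes_by_name = {"node1.example.com": {}, "node2.example.com": {}}
    fluentd_nodes = {"node1.example.com": {}}  # nodes matching the fluentd node selector
    intended_nodes = ['--all']                 # default for openshift_logging_fluentd_hosts

    if not intended_nodes or '--all' in intended_nodes:
        intended_nodes = nodes_by_name.keys()
    missing = set(intended_nodes) - set(fluentd_nodes.keys())
    print(sorted(missing))  # ['node2.example.com'] -> reported as missing the label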
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..0970f0a63
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,138 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+ """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+ name = "fluentd_config"
+ tags = ["health"]
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+ try:
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ except ValueError:
+ # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+ return logging_deployed
+
+ return logging_deployed and version < (3, 6)
+
+ def run(self):
+ """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
+ config_error = self.check_logging_config()
+ if config_error:
+ msg = ("The following Fluentd logging configuration problem was found:"
+ "\n{}".format(config_error))
+ return {"failed": True, "msg": msg}
+
+ return {}
+
+ def check_logging_config(self):
+ """Ensure that the configured Docker logging driver matches fluentd settings.
+ This means that, at least for now, if the following condition is met:
+
+ openshift_logging_fluentd_use_journal == True
+
+ then the value of the configured Docker logging driver should be "journald".
+ Otherwise, the value of the Docker logging driver should be "json-file".
+ Returns an error string if the above condition is not met, or None otherwise."""
+ use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+ # if check is running on a master, retrieve all running pods
+ # and check any pod's container for the env var "USE_JOURNAL"
+ group_names = self.get_var("group_names")
+ if "masters" in group_names:
+ use_journald = self.check_fluentd_env_var()
+
+ docker_info = self.execute_module("docker_info", {})
+ try:
+ logging_driver = docker_info["info"]["LoggingDriver"]
+ except KeyError:
+ return "Unable to determine Docker logging driver."
+
+ recommended_logging_driver = "journald"
+ error = None
+
+ # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+ # option as an inventory file variable, or adding the log driver value as part of the
+ # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+ # can be passed to the Docker binary; the only other recommendation that can be made, would be
+ # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+ # individual containers.
+ if use_journald and logging_driver != "journald":
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+ 'This differs from your Docker configuration, which has been set to use "{driver}" '
+ 'as the default method of storing logs.\n'
+ 'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+ elif not use_journald and logging_driver != "json-file":
+ recommended_logging_driver = "json-file"
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+ 'individual json log files per container.\n'
+ 'This differs from your Docker configuration, which has been set to use '
+ '"{driver}" as the default method of storing logs.\n'
+ 'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+
+ if error:
+ error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+ ' openshift_docker_options="--log-driver={driver}"\n\n'
+ 'Alternatively, you can add the following option to your Docker configuration, located in '
+ '"/etc/docker/daemon.json":\n\n'
+ '{{ "log-driver": "{driver}" }}\n\n'
+ 'See https://docs.docker.com/engine/admin/logging/json-file '
+ 'for more information.').format(driver=recommended_logging_driver)
+
+ return error
+
+ def check_fluentd_env_var(self):
+ """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+ running_pods = self.running_fluentd_pods()
+
+ try:
+ pod_containers = running_pods[0]["spec"]["containers"]
+ except KeyError:
+ return "Unable to detect running containers on selected Fluentd pod."
+
+ if not pod_containers:
+ msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+ 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+ raise OpenShiftCheckException(msg)
+
+ pod_env = pod_containers[0].get("env")
+ if not pod_env:
+ msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+ 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+ raise OpenShiftCheckException(msg)
+
+ for env in pod_env:
+ if env["name"] == "USE_JOURNAL":
+ return env.get("value", "false") != "false"
+
+ return False
+
+ def running_fluentd_pods(self):
+ """Return a list of running fluentd pods."""
+ fluentd_pods, error = self.get_pods_for_component(
+ self.logging_namespace,
+ "fluentd",
+ )
+ if error:
+ msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error)
+ raise OpenShiftCheckException(msg)
+
+ running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+ if not running_fluentd_pods:
+ msg = ('No Fluentd pods were found to be in the "Running" state. '
+ 'At least one Fluentd pod is required in order to perform this check.')
+
+ raise OpenShiftCheckException(msg)
+
+ return running_fluentd_pods
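
For reference, a minimal sketch of the pod structure these helpers walk; the field values are hypothetical and only the keys the check actually reads are shown:

    # Hypothetical pod object, as items[n] of `oc get pods -o json`.
    pod = {
        "metadata": {"name": "logging-fluentd-abc12"},
        "status": {"phase": "Running"},
        "spec": {
            "containers": [{
                "name": "fluentd-elasticsearch",
                "env": [{"name": "USE_JOURNAL", "value": "true"}],
            }],
        },
    }

    # check_fluentd_env_var() scans the first container's env and treats any
    # value other than the literal string "false" as journald being enabled.
    use_journald = any(
        env["name"] == "USE_JOURNAL" and env.get("value", "false") != "false"
        for env in pod["spec"]["containers"][0]["env"]
    )  # True for the pod above
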
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 442f407b1..efb14ab42 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -12,7 +12,6 @@ except ImportError:
from urllib.error import HTTPError, URLError
import urllib.request as urllib2
-from openshift_checks import get_var
from openshift_checks.logging.logging import LoggingCheck
@@ -22,35 +21,30 @@ class Kibana(LoggingCheck):
name = "kibana"
tags = ["health", "logging"]
- logging_namespace = None
-
- def run(self, tmp, task_vars):
+ def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
- kibana_pods, error = super(Kibana, self).get_pods_for_component(
- self.execute_module,
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
+ kibana_pods, error = self.get_pods_for_component(
self.logging_namespace,
"kibana",
- task_vars,
)
if error:
return {"failed": True, "changed": False, "msg": error}
check_error = self.check_kibana(kibana_pods)
if not check_error:
- check_error = self._check_kibana_route(task_vars)
+ check_error = self._check_kibana_route()
if check_error:
msg = ("The following Kibana deployment issue was found:"
- "\n-------\n"
"{}".format(check_error))
return {"failed": True, "changed": False, "msg": msg}
# TODO(lmeyer): run it all again for the ops cluster
return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
- def _verify_url_internal(self, url, task_vars):
+ def _verify_url_internal(self, url):
"""
Try to reach a URL from the host.
Returns: success (bool), reason (for failure)
@@ -62,7 +56,7 @@ class Kibana(LoggingCheck):
# TODO(lmeyer): give users option to validate certs
status_code=302,
)
- result = self.execute_module('uri', args, task_vars)
+ result = self.execute_module('uri', args)
if result.get('failed'):
return result['msg']
return None
@@ -114,14 +108,18 @@ class Kibana(LoggingCheck):
return None
- def _get_kibana_url(self, task_vars):
+ def _get_kibana_url(self):
"""
Get kibana route or report error.
Returns: url (or empty), reason for failure
"""
# Get logging url
- get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+ get_route = self.exec_oc(
+ self.logging_namespace,
+ "get route logging-kibana -o json",
+ [],
+ )
if not get_route:
return None, 'no_route_exists'
@@ -139,7 +137,7 @@ class Kibana(LoggingCheck):
return 'https://{}/'.format(host), None
- def _check_kibana_route(self, task_vars):
+ def _check_kibana_route(self):
"""
Check to see if kibana route is up and working.
Returns: error string
@@ -160,12 +158,12 @@ class Kibana(LoggingCheck):
),
)
- kibana_url, error = self._get_kibana_url(task_vars)
+ kibana_url, error = self._get_kibana_url()
if not kibana_url:
return known_errors.get(error, error)
# first, check that kibana is reachable from the master.
- error = self._verify_url_internal(kibana_url, task_vars)
+ error = self._verify_url_internal(kibana_url)
if error:
if 'urlopen error [Errno 111] Connection refused' in error:
error = (
@@ -190,7 +188,7 @@ class Kibana(LoggingCheck):
# in production we would like the kibana route to work from outside the
# cluster too; but that may not be the case, so allow disabling just this part.
- if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
+ if not self.get_var("openshift_check_efk_kibana_external", default=True):
return None
error = self._verify_url_external(kibana_url)
if error:
@@ -220,10 +218,3 @@ class Kibana(LoggingCheck):
).format(error=error)
return error
return None
-
- def _exec_oc(self, cmd_str, extra_args, task_vars):
- return super(Kibana, self).exec_oc(self.execute_module,
- self.logging_namespace,
- cmd_str,
- extra_args,
- task_vars)
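
The internal route probe above drives Ansible's `uri` module. A minimal standalone sketch of the same call shape, with a hypothetical URL and a stub executor standing in for the check's stored module executor:

    # Hypothetical sketch of the probe; only url/validate_certs/status_code
    # appear in the hunk above, the rest is illustrative.
    args = dict(
        url="https://kibana.example.com/",  # hypothetical Kibana route URL
        validate_certs=False,               # TODO in the source: let users validate certs
        status_code=302,                    # a healthy Kibana answers with a redirect
    )

    def execute_module(module_name, module_args):  # stand-in for the check's executor
        return {"failed": False}                   # pretend the probe succeeded

    result = execute_module("uri", args)
    error = result["msg"] if result.get("failed") else None
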
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 05b4d300c..43ba6c406 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -5,39 +5,39 @@ Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana st
import json
import os
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
class LoggingCheck(OpenShiftCheck):
- """Base class for logging component checks"""
+ """Base class for OpenShift aggregated logging component checks"""
+
+ # FIXME: this should not be listed as a check, since it is not meant to be
+ # run by itself.
name = "logging"
+ logging_namespace = "logging"
- @classmethod
- def is_active(cls, task_vars):
- return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+ return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
- @staticmethod
- def is_first_master(task_vars):
- """Run only on first master and only when logging is configured. Returns: bool"""
- logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+ def is_first_master(self):
+ """Determine if running on first master. Returns: bool"""
# Note: It would be nice to use membership in oo_first_master group, however for now it
# seems best to avoid requiring that setup and just check this is the first master.
- hostname = get_var(task_vars, "ansible_ssh_host") or [None]
- masters = get_var(task_vars, "groups", "masters", default=None) or [None]
- return logging_deployed and masters[0] == hostname
+ hostname = self.get_var("ansible_ssh_host") or [None]
+ masters = self.get_var("groups", "masters", default=None) or [None]
+ return masters[0] == hostname
- def run(self, tmp, task_vars):
- pass
+ def run(self):
+ return {}
- def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
+ def get_pods_for_component(self, namespace, logging_component):
"""Get all pods for a given component. Returns: list of pods for component, error string"""
pod_output = self.exec_oc(
- execute_module,
namespace,
"get pods -l component={} -o json".format(logging_component),
[],
- task_vars
)
try:
pods = json.loads(pod_output)
@@ -45,7 +45,7 @@ class LoggingCheck(OpenShiftCheck):
raise ValueError()
except ValueError:
# successful run but non-parsing data generally means there were no pods in the namespace
- return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+ return None, 'No pods were found for the "{}" logging component.'.format(logging_component)
return pods['items'], None
@@ -54,23 +54,22 @@ class LoggingCheck(OpenShiftCheck):
"""Returns: list of pods not in a ready and running state"""
return [
pod for pod in pods
- if any(
+ if not pod.get("status", {}).get("containerStatuses") or any(
container['ready'] is False
for container in pod['status']['containerStatuses']
) or not any(
condition['type'] == 'Ready' and condition['status'] == 'True'
- for condition in pod['status']['conditions']
+ for condition in pod['status'].get('conditions', [])
)
]
- @staticmethod
- def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+ def exec_oc(self, namespace="logging", cmd_str="", extra_args=None):
"""
Execute an 'oc' command in the remote host.
Returns: output of command and namespace,
or raises OpenShiftCheckException on error
"""
- config_base = get_var(task_vars, "openshift", "common", "config_base")
+ config_base = self.get_var("openshift", "common", "config_base")
args = {
"namespace": namespace,
"config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
@@ -78,7 +77,7 @@ class LoggingCheck(OpenShiftCheck):
"extra_args": list(extra_args) if extra_args else [],
}
- result = execute_module("ocutil", args, task_vars)
+ result = self.execute_module("ocutil", args)
if result.get("failed"):
msg = (
'Unexpected error using `oc` to validate the logging stack components.\n'
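
To make the ocutil invocation concrete, a hedged sketch of the args exec_oc() assembles; the config_base value is illustrative, and the "cmd" key is an assumption since that line falls outside the hunk:

    import os

    config_base = "/etc/origin"  # illustrative value of the openshift.common.config_base fact
    args = {
        "namespace": "logging",
        "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
        "cmd": "get pods -l component=es -o json",  # assumed key carrying cmd_str
        "extra_args": [],
    }
    # The check then runs: result = self.execute_module("ocutil", args)
    # and, as the failure branch above shows, raises OpenShiftCheckException
    # when result.get("failed") is true.
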
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..b24e88e05
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,130 @@
+"""
+Check that logs from pods can be queried in Elasticsearch within a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+ """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ logging_namespace = "logging"
+
+ def run(self):
+ """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
+ try:
+ log_index_timeout = int(
+ self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'),
+ }
+
+ running_component_pods = dict()
+
+ # get all component pods
+ self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods, error = self.get_pods_for_component(self.logging_namespace, component)
+
+ if error:
+ msg = 'Unable to retrieve pods for the {} logging component: {}'
+ return {"failed": True, "changed": False, "msg": msg.format(name, error)}
+
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+ msg = ('No {} pods in the "Running" state were found. '
+ 'At least one pod is required in order to perform this check.')
+ return {"failed": True, "changed": False, "msg": msg.format(name)}
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+ """Retry an Elasticsearch query every second until query success, or a defined
+ length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid):
+ if time.time() + interval > deadline:
+ msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s."
+ raise OpenShiftCheckException(msg.format(uuid, timeout_secs))
+ time.sleep(interval)
+
+ def curl_kibana_with_uuid(self, kibana_pod):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(self.logging_namespace, exec_cmd, [])
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except KeyError:
+ msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n'
+ 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
+ raise OpenShiftCheckException(msg)
+ except ValueError:
+ msg = ('invalid response returned from Kibana request (Non-JSON output):\n'
+ 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
+ raise OpenShiftCheckException(msg)
+
+ if error_code != 404:
+ msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.'
+ raise OpenShiftCheckException(msg.format(error_code))
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid)
+ result = self.exec_oc(self.logging_namespace, exec_cmd, [])
+
+ try:
+ count = json.loads(result)["count"]
+ except KeyError:
+ msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}'
+ raise OpenShiftCheckException(msg.format(exec_cmd, result))
+ except ValueError:
+ msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}'
+ raise OpenShiftCheckException(msg.format(exec_cmd, result))
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
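
The polling in wait_until_cmd_or_err() is a generic poll-until-deadline loop. A standalone sketch of the same technique, with a stubbed probe in place of the Elasticsearch query:

    import time

    def wait_until(probe, timeout_secs, interval=1):
        """Poll `probe` every `interval` seconds; raise once the deadline passes."""
        deadline = time.time() + timeout_secs
        while not probe():
            if time.time() + interval > deadline:
                raise RuntimeError("no match found after {}s".format(timeout_secs))
            time.sleep(interval)

    # e.g.: wait_until(lambda: query_es_from_es(es_pod, uuid), 30)
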
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index f4e31065f..765ba072d 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,5 +1,5 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that recommended memory is available."""
+from openshift_checks import OpenShiftCheck
MIB = 2**20
GIB = 2**30
@@ -21,19 +21,18 @@ class MemoryAvailability(OpenShiftCheck):
# https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
memtotal_adjustment = 1 * GIB
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have recommended memory requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- has_memory_recommendation = bool(set(group_names).intersection(cls.recommended_memory_bytes))
- return super(MemoryAvailability, cls).is_active(task_vars) and has_memory_recommendation
+ group_names = self.get_var("group_names", default=[])
+ has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes))
+ return super(MemoryAvailability, self).is_active() and has_memory_recommendation
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB
+ def run(self):
+ group_names = self.get_var("group_names")
+ total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB
recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
- configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB
+ configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB
min_memory_bytes = configured_min or recommended_min
if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
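
A worked example of the arithmetic, assuming a 16 GiB recommendation for the host's group (the recommendation table sits outside this hunk): firmware typically reserves some RAM, so a "16 GB" machine reports ansible_memtotal_mb a bit under 16 * 1024, and memtotal_adjustment absorbs that gap:

    MIB = 2**20
    GIB = 2**30

    total_memory_bytes = 15885 * MIB  # ansible_memtotal_mb on a 16 GiB host (illustrative)
    min_memory_bytes = 16 * GIB       # assumed recommended minimum for the host's group
    memtotal_adjustment = 1 * GIB     # slack for memory reserved from MemTotal

    # 15885 MiB + 1024 MiB = 16909 MiB >= 16384 MiB, so the check passes
    assert total_memory_bytes + memtotal_adjustment >= min_memory_bytes
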
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 2cb2e21aa..3b2c64e6a 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -2,19 +2,16 @@
Mixin classes meant to be used with subclasses of OpenShiftCheck.
"""
-from openshift_checks import get_var
-
class NotContainerizedMixin(object):
"""Mixin for checks that are only active when not in containerized mode."""
# permanent # pylint: disable=too-few-public-methods
# Reason: The mixin is not intended to stand on its own as a class.
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Only run on non-containerized hosts."""
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ return super(NotContainerizedMixin, self).is_active() and not is_containerized
class DockerHostMixin(object):
@@ -22,28 +19,26 @@ class DockerHostMixin(object):
dependencies = []
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Only run on hosts that depend on Docker."""
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- is_node = "nodes" in get_var(task_vars, "group_names", default=[])
- return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ is_node = "nodes" in self.get_var("group_names", default=[])
+ return super(DockerHostMixin, self).is_active() and (is_containerized or is_node)
- def ensure_dependencies(self, task_vars):
+ def ensure_dependencies(self):
"""
Ensure that docker-related packages exist, but not on atomic hosts
(which would not be able to install but should already have them).
Returns: msg, failed, changed
"""
- if get_var(task_vars, "openshift", "common", "is_atomic"):
+ if self.get_var("openshift", "common", "is_atomic"):
return "", False, False
# NOTE: we would use the "package" module but it's actually an action plugin
# and it's not clear how to invoke one of those. This is about the same anyway:
result = self.execute_module(
- get_var(task_vars, "ansible_pkg_mgr", default="yum"),
+ self.get_var("ansible_pkg_mgr", default="yum"),
{"name": self.dependencies, "state": "present"},
- task_vars=task_vars,
)
msg = result.get("msg", "")
if result.get("failed"):
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
index 2dd045f1f..d5e55bc25 100644
--- a/roles/openshift_health_checker/openshift_checks/ovs_version.py
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -3,7 +3,7 @@ Ansible module for determining if an installed version of Open vSwitch is incomp
currently installed version of OpenShift.
"""
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -21,58 +21,34 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
"3.4": "2.4",
}
- # map major release versions across releases
- # to a common major version
- openshift_major_release_version = {
- "1": "3",
- }
-
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have package requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
+ group_names = self.get_var("group_names", default=[])
master_or_node = 'masters' in group_names or 'nodes' in group_names
- return super(OvsVersion, cls).is_active(task_vars) and master_or_node
+ return super(OvsVersion, self).is_active() and master_or_node
- def run(self, tmp, task_vars):
+ def run(self):
args = {
"package_list": [
{
"name": "openvswitch",
- "version": self.get_required_ovs_version(task_vars),
+ "version": self.get_required_ovs_version(),
},
],
}
- return self.execute_module("rpm_version", args, task_vars=task_vars)
+ return self.execute_module("rpm_version", args)
- def get_required_ovs_version(self, task_vars):
+ def get_required_ovs_version(self):
"""Return the correct Open vSwitch version for the current OpenShift version"""
- openshift_version = self._get_openshift_version(task_vars)
+ openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag"))
- if float(openshift_version) < 3.5:
+ if openshift_version_tuple < (3, 5):
return self.openshift_to_ovs_version["3.4"]
- ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+ openshift_version = ".".join(str(x) for x in openshift_version_tuple)
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
if ovs_version:
- return self.openshift_to_ovs_version[str(openshift_version)]
+ return self.openshift_to_ovs_version[openshift_version]
msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
raise OpenShiftCheckException(msg.format(openshift_version))
-
- def _get_openshift_version(self, task_vars):
- openshift_version = get_var(task_vars, "openshift_image_tag")
- if openshift_version and openshift_version[0] == 'v':
- openshift_version = openshift_version[1:]
-
- return self._parse_version(openshift_version)
-
- def _parse_version(self, version):
- components = version.split(".")
- if not components or len(components) < 2:
- msg = "An invalid version of OpenShift was found for this host: {}"
- raise OpenShiftCheckException(msg.format(version))
-
- if components[0] in self.openshift_major_release_version:
- components[0] = self.openshift_major_release_version[components[0]]
-
- return '.'.join(components[:2])
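
The switch from float comparison to (major, minor) tuples fixes ordering for two-digit minors: float("3.10") compares less than 3.5, while (3, 10) > (3, 5) as tuples. A short sketch, assuming get_major_minor_version() (defined outside this hunk) yields such a tuple:

    openshift_to_ovs_version = {"3.6": "2.6", "3.5": "2.6", "3.4": "2.4"}

    version = (3, 4)  # e.g. get_major_minor_version("v3.4.1.10") -> (3, 4) (assumed behavior)
    if version < (3, 5):
        required = openshift_to_ovs_version["3.4"]  # "2.4"
    else:
        required = openshift_to_ovs_version[".".join(str(x) for x in version)]
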
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index 0dd2b1286..a86180b00 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -1,5 +1,6 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that required RPM packages are available."""
+
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,13 +10,13 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
name = "package_availability"
tags = ["preflight"]
- @classmethod
- def is_active(cls, task_vars):
- return super(PackageAvailability, cls).is_active(task_vars) and task_vars["ansible_pkg_mgr"] == "yum"
+ def is_active(self):
+ """Run only when yum is the package manager as the code is specific to it."""
+ return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum"
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- group_names = get_var(task_vars, "group_names", default=[])
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ group_names = self.get_var("group_names", default=[])
packages = set()
@@ -25,10 +26,11 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args, tmp=tmp, task_vars=task_vars)
+ return self.execute_module("check_yum_update", args)
@staticmethod
def master_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a master install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix),
@@ -44,6 +46,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
@staticmethod
def node_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a node install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-node".format(rpm_prefix=rpm_prefix),
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index f432380c6..1e9aecbe0 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -1,14 +1,14 @@
-# pylint: disable=missing-docstring
+"""Check that a yum update would not run into conflicts with available packages."""
from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
- """Check that there are no conflicts in RPM packages."""
+ """Check that a yum update would not run into conflicts with available packages."""
name = "package_update"
tags = ["preflight"]
- def run(self, tmp, task_vars):
+ def run(self):
args = {"packages": []}
- return self.execute_module("check_yum_update", args, tmp=tmp, task_vars=task_vars)
+ return self.execute_module("check_yum_update", args)
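
A note on the empty list: judging from the docstring above, passing no package names appears to ask check_yum_update to dry-run the whole update transaction rather than specific packages. A sketch of the call:

    args = {"packages": []}  # empty list: check the full update transaction (assumption)
    # return self.execute_module("check_yum_update", args)
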
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 6a76bb93d..8b780114f 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,8 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+"""Check that available RPM packages match the required versions."""
+
+import re
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,48 +12,49 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
name = "package_version"
tags = ["preflight"]
+ # NOTE: versions outside those specified are mapped to least/greatest
openshift_to_ovs_version = {
- "3.6": "2.6",
- "3.5": "2.6",
- "3.4": "2.4",
+ (3, 4): "2.4",
+ (3, 5): ["2.6", "2.7"],
+ (3, 6): ["2.6", "2.7"],
}
openshift_to_docker_version = {
- "3.1": "1.8",
- "3.2": "1.10",
- "3.3": "1.10",
- "3.4": "1.12",
+ (3, 1): "1.8",
+ (3, 2): "1.10",
+ (3, 3): "1.10",
+ (3, 4): "1.12",
+ (3, 5): "1.12",
+ (3, 6): "1.12",
}
- # map major release versions across releases
- # to a common major version
- openshift_major_release_version = {
- "1": "3",
+ # map major OpenShift release versions across releases to a common major version
+ map_major_release_version = {
+ 1: 3,
}
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have package requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
+ group_names = self.get_var("group_names", default=[])
master_or_node = 'masters' in group_names or 'nodes' in group_names
- return super(PackageVersion, cls).is_active(task_vars) and master_or_node
+ return super(PackageVersion, self).is_active() and master_or_node
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- openshift_release = get_var(task_vars, "openshift_release", default='')
- deployment_type = get_var(task_vars, "openshift_deployment_type")
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ openshift_release = self.get_var("openshift_release", default='')
+ deployment_type = self.get_var("openshift_deployment_type")
check_multi_minor_release = deployment_type in ['openshift-enterprise']
args = {
"package_list": [
{
"name": "openvswitch",
- "version": self.get_required_ovs_version(task_vars),
+ "version": self.get_required_ovs_version(),
"check_multi": False,
},
{
"name": "docker",
- "version": self.get_required_docker_version(task_vars),
+ "version": self.get_required_docker_version(),
"check_multi": False,
},
{
@@ -71,55 +75,52 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
],
}
- return self.execute_module("aos_version", args, tmp=tmp, task_vars=task_vars)
+ return self.execute_module("aos_version", args)
- def get_required_ovs_version(self, task_vars):
- """Return the correct Open vSwitch version for the current OpenShift version.
- If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6,
- Else ensure Open vSwitch version 2.4"""
- openshift_version = self.get_openshift_version(task_vars)
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
- if float(openshift_version) < 3.5:
- return self.openshift_to_ovs_version["3.4"]
+ earliest = min(self.openshift_to_ovs_version)
+ latest = max(self.openshift_to_ovs_version)
+ if openshift_version < earliest:
+ return self.openshift_to_ovs_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_ovs_version[latest]
- ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
- if ovs_version:
- return ovs_version
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
+ if not ovs_version:
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
- msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
- raise OpenShiftCheckException(msg.format(openshift_version))
+ return ovs_version
- def get_required_docker_version(self, task_vars):
- """Return the correct Docker version for the current OpenShift version.
- If the OpenShift version is 3.1, ensure Docker version 1.8.
- If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10.
- If the current OpenShift version is >= 3.4, ensure Docker version 1.12."""
- openshift_version = self.get_openshift_version(task_vars)
+ def get_required_docker_version(self):
+ """Return the correct Docker version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
- if float(openshift_version) >= 3.4:
- return self.openshift_to_docker_version["3.4"]
+ earliest = min(self.openshift_to_docker_version)
+ latest = max(self.openshift_to_docker_version)
+ if openshift_version < earliest:
+ return self.openshift_to_docker_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_docker_version[latest]
- docker_version = self.openshift_to_docker_version.get(str(openshift_version))
- if docker_version:
- return docker_version
+ docker_version = self.openshift_to_docker_version.get(openshift_version)
+ if not docker_version:
+ msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
- msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
- raise OpenShiftCheckException(msg.format(openshift_version))
+ return docker_version
- def get_openshift_version(self, task_vars):
- openshift_version = get_var(task_vars, "openshift_image_tag")
- if openshift_version and openshift_version[0] == 'v':
- openshift_version = openshift_version[1:]
+ def get_openshift_version_tuple(self):
+ """Return received image tag as a normalized (X, Y) minor version tuple."""
+ version = self.get_var("openshift_image_tag")
+ comps = [int(component) for component in re.findall(r'\d+', version)]
- return self.parse_version(openshift_version)
-
- def parse_version(self, version):
- components = version.split(".")
- if not components or len(components) < 2:
+ if len(comps) < 2:
msg = "An invalid version of OpenShift was found for this host: {}"
raise OpenShiftCheckException(msg.format(version))
- if components[0] in self.openshift_major_release_version:
- components[0] = self.openshift_major_release_version[components[0]]
-
- return '.'.join(components[:2])
+ comps[0] = self.map_major_release_version.get(comps[0], comps[0])
+ return tuple(comps[0:2])
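
Putting the new pieces together, a worked example of the tuple parsing and the least/greatest clamping noted above; the image tag is hypothetical:

    import re

    map_major_release_version = {1: 3}
    openshift_to_docker_version = {(3, 1): "1.8", (3, 2): "1.10", (3, 3): "1.10",
                                   (3, 4): "1.12", (3, 5): "1.12", (3, 6): "1.12"}

    version = "v3.6.173.0.5"  # hypothetical openshift_image_tag
    comps = [int(c) for c in re.findall(r'\d+', version)]  # [3, 6, 173, 0, 5]
    comps[0] = map_major_release_version.get(comps[0], comps[0])
    openshift_version = tuple(comps[0:2])  # (3, 6); "v1.5.0" would map to (3, 5)

    # Versions outside the table clamp to the nearest entry, mirroring the
    # min/max branches above:
    clamped = max(min(openshift_version, max(openshift_to_docker_version)),
                  min(openshift_to_docker_version))
    print(openshift_to_docker_version[clamped])  # "1.12"
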