Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_storage.py             179
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_traffic.py                 47
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/kibana.py                2
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging.py              17
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py  132
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_version.py               4
6 files changed, 337 insertions, 44 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index 8d0fbcc9c..d2227d244 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,5 +1,6 @@
"""Check Docker storage driver and usage."""
import json
+import os.path
import re
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
from openshift_checks.mixins import DockerHostMixin
@@ -17,13 +18,30 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
tags = ["pre-install", "health", "preflight"]
dependencies = ["python-docker-py"]
- storage_drivers = ["devicemapper", "overlay2"]
+ storage_drivers = ["devicemapper", "overlay", "overlay2"]
max_thinpool_data_usage_percent = 90.0
max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
+
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
- # pylint: disable=too-many-return-statements
- # Reason: permanent stylistic exception;
- # it is clearer to return on failures and there are just many ways to fail here.
def run(self, tmp, task_vars):
msg, failed, changed = self.ensure_dependencies(task_vars)
if failed:
@@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
}
# attempt to get the docker info hash from the API
- info = self.execute_module("docker_info", {}, task_vars=task_vars)
- if info.get("failed"):
+ docker_info = self.execute_module("docker_info", {}, task_vars=task_vars)
+ if docker_info.get("failed"):
return {"failed": True, "changed": changed,
"msg": "Failed to query Docker API. Is docker running on this host?"}
- if not info.get("info"): # this would be very strange
+ if not docker_info.get("info"): # this would be very strange
return {"failed": True, "changed": changed,
- "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
- info = info["info"]
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
# check if the storage driver we saw is valid
- driver = info.get("Driver", "[NONE]")
+ driver = docker_info.get("Driver", "[NONE]")
if driver not in self.storage_drivers:
msg = (
"Detected unsupported Docker storage driver '{driver}'.\n"
@@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return {"failed": True, "changed": changed, "msg": msg}
# driver status info is a list of tuples; convert to dict and validate based on driver
- driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
if driver == "devicemapper":
- if driver_status.get("Data loop file"):
- msg = (
- "Use of loopback devices with the Docker devicemapper storage driver\n"
- "(the default storage configuration) is unsupported in production.\n"
- "Please use docker-storage-setup to configure a backing storage volume.\n"
- "See http://red.ht/2rNperO for further information."
- )
- return {"failed": True, "changed": changed, "msg": msg}
- result = self._check_dm_usage(driver_status, task_vars)
- result['changed'] = result.get('changed', False) or changed
- return result
+ result = self.check_devicemapper_support(driver_status, task_vars)
- # TODO(lmeyer): determine how to check usage for overlay2
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status, task_vars)
- return {"changed": changed}
+ result['changed'] = result.get('changed', False) or changed
+ return result
- def _check_dm_usage(self, driver_status, task_vars):
- """
+ def check_devicemapper_support(self, driver_status, task_vars):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status, task_vars)
+ return result
+
+ def check_dm_usage(self, driver_status, task_vars):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
devicemapper storage. The LV is "thin" because it does not use all available storage
@@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
could run out of space first; so we check both.
"""
vals = dict(
- vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+ vg_free=self.get_vg_free(driver_status.get("Pool Name"), task_vars),
data_used=driver_status.get("Data Space Used"),
data_total=driver_status.get("Data Space Total"),
metadata_used=driver_status.get("Metadata Space Used"),
@@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
# convert all human-readable strings to bytes
for key, value in vals.copy().items():
try:
- vals[key + "_bytes"] = self._convert_to_bytes(value)
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
except ValueError as err: # unlikely to hit this from API info, but just to be safe
return {
"failed": True,
@@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
return vals
- def _get_vg_free(self, pool, task_vars):
- # Determine which VG to examine according to the pool name, the only indicator currently
- # available from the Docker API driver info. We assume a name that looks like
- # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+ def get_vg_free(self, pool, task_vars):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
if not match: # unlikely, but... be clear if we assumed wrong
raise OpenShiftCheckException(
@@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return size
@staticmethod
- def _convert_to_bytes(string):
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
units = dict(
b=1,
k=1024,
@@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
raise ValueError("Cannot convert to a byte size: " + string)
return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status, task_vars):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info, task_vars)
+
+ def check_overlay_usage(self, docker_info, task_vars):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = get_var(task_vars, "max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path, get_var(task_vars, "ansible_mounts"))
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
+
+ # TODO(lmeyer): migrate to base class
+ @staticmethod
+ def find_ansible_mount(path, ansible_mounts):
+ """Return the mount point for path from ansible_mounts."""
+
+ mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
+ mount_point = path
+ while mount_point not in mount_for_path:
+ if mount_point in ["/", ""]: # "/" not in ansible_mounts???
+ break
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ return mount_for_path[mount_point]
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
+ msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
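
For reference, a minimal standalone sketch (not part of the patch) of the mount-resolution logic that check_overlay_usage relies on: walk up from the Docker overlay directory until a path that appears in ansible_mounts is found, then compute usage from that mount's size fields. The sample mount data below is illustrative.

import os.path

def find_mount(path, ansible_mounts):
    # Index mounts by mount point, then walk up the path until one matches.
    mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
    while path not in mount_for_path:
        if path in ("/", ""):
            break
        path = os.path.dirname(path)
    return mount_for_path.get(path)

# Illustrative values; the real data comes from the ansible_mounts fact.
mounts = [{'mount': '/', 'size_available': 5 * 1024**3, 'size_total': 10 * 1024**3}]
mount = find_mount('/var/lib/docker/overlay2', mounts)
usage = 100.0 * (mount['size_total'] - mount['size_available']) / mount['size_total']
print('{:.1f}% used'.format(usage))  # 50.0% used
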
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..40c87873d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,47 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck, get_var
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = get_var(task_vars, "group_names", default=[])
+ valid_group_names = "etcd" in group_names
+
+ version = get_var(task_vars, "openshift", "common", "short_version")
+ valid_version = version in ("3.4", "3.5", "1.4", "1.5")
+
+ return super(EtcdTraffic, cls).is_active(task_vars) and valid_group_names and valid_version
+
+ def run(self, tmp, task_vars):
+ is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {
+ "log_matchers": log_matchers,
+ }, task_vars)
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
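
A quick standalone sketch (not part of the patch) of the journal pattern this check hands to search_journalctl, run against a sample message; the sample log line is illustrative.

import re

regexp = r"etcd: sync duration of [^,]+, expected less than 1s"
sample = "etcd: sync duration of 1.54651s, expected less than 1s"
print(bool(re.search(regexp, sample)))  # True
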
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 442f407b1..551e8dfa0 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -62,7 +62,7 @@ class Kibana(LoggingCheck):
# TODO(lmeyer): give users option to validate certs
status_code=302,
)
- result = self.execute_module('uri', args, task_vars)
+ result = self.execute_module('uri', args, None, task_vars)
if result.get('failed'):
return result['msg']
return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 05b4d300c..02a094007 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -12,20 +12,21 @@ class LoggingCheck(OpenShiftCheck):
"""Base class for logging component checks"""
name = "logging"
+ logging_namespace = "logging"
@classmethod
def is_active(cls, task_vars):
- return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+ logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=False)
+ return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars) and logging_deployed
@staticmethod
def is_first_master(task_vars):
- """Run only on first master and only when logging is configured. Returns: bool"""
- logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+ """Run only on first master. Returns: bool"""
# Note: It would be nice to use membership in oo_first_master group, however for now it
# seems best to avoid requiring that setup and just check this is the first master.
hostname = get_var(task_vars, "ansible_ssh_host") or [None]
masters = get_var(task_vars, "groups", "masters", default=None) or [None]
- return logging_deployed and masters[0] == hostname
+ return masters and masters[0] == hostname
def run(self, tmp, task_vars):
pass
@@ -45,7 +46,7 @@ class LoggingCheck(OpenShiftCheck):
raise ValueError()
except ValueError:
# successful run but non-parsing data generally means there were no pods in the namespace
- return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+ return None, 'No pods were found for the "{}" logging component.'.format(logging_component)
return pods['items'], None
@@ -54,12 +55,12 @@ class LoggingCheck(OpenShiftCheck):
"""Returns: list of pods not in a ready and running state"""
return [
pod for pod in pods
- if any(
+ if not pod.get("status", {}).get("containerStatuses") or any(
container['ready'] is False
for container in pod['status']['containerStatuses']
) or not any(
condition['type'] == 'Ready' and condition['status'] == 'True'
- for condition in pod['status']['conditions']
+ for condition in pod['status'].get('conditions', [])
)
]
@@ -78,7 +79,7 @@ class LoggingCheck(OpenShiftCheck):
"extra_args": list(extra_args) if extra_args else [],
}
- result = execute_module("ocutil", args, task_vars)
+ result = execute_module("ocutil", args, None, task_vars)
if result.get("failed"):
msg = (
'Unexpected error using `oc` to validate the logging stack components.\n'
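
As a standalone sketch (not part of the patch), the tightened pod filter above can be exercised directly; a pod whose status carries no containerStatuses is now reported as not running. The sample pod dicts are illustrative.

def not_running_pods(pods):
    # Pods missing containerStatuses, with an unready container, or without a Ready=True condition.
    return [
        pod for pod in pods
        if not pod.get("status", {}).get("containerStatuses") or any(
            container['ready'] is False
            for container in pod['status']['containerStatuses']
        ) or not any(
            condition['type'] == 'Ready' and condition['status'] == 'True'
            for condition in pod['status'].get('conditions', [])
        )
    ]

pods = [
    {"status": {}},  # no containerStatuses: counted as not running
    {"status": {"containerStatuses": [{"ready": True}],
                "conditions": [{"type": "Ready", "status": "True"}]}},
]
print(len(not_running_pods(pods)))  # 1
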
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..2ddd7549d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,132 @@
+"""
+Check that logs from pods can be queried within a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import get_var, OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+    """Check that pod logs are aggregated and indexed in Elasticsearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ logging_namespace = "logging"
+
+ def run(self, tmp, task_vars):
+        """Add a log entry by making a unique request to Kibana, then check that the entry is indexed in Elasticsearch."""
+ try:
+ log_index_timeout = int(
+ get_var(task_vars, "openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'),
+ }
+
+ running_component_pods = dict()
+
+ # get all component pods
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default=self.logging_namespace)
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods, error = self.get_pods_for_component(
+ self.execute_module, self.logging_namespace, component, task_vars,
+ )
+
+ if error:
+ msg = 'Unable to retrieve pods for the {} logging component: {}'
+ return {"failed": True, "changed": False, "msg": msg.format(name, error)}
+
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+                msg = ('No {} pods in the "Running" state were found. '
+ 'At least one pod is required in order to perform this check.')
+ return {"failed": True, "changed": False, "msg": msg.format(name)}
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0], task_vars)
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout, task_vars)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs, task_vars):
+ """Retry an Elasticsearch query every second until query success, or a defined
+ length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid, task_vars):
+ if time.time() + interval > deadline:
+ msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s."
+ raise OpenShiftCheckException(msg.format(uuid, timeout_secs))
+ time.sleep(interval)
+
+ def curl_kibana_with_uuid(self, kibana_pod, task_vars):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(self.execute_module, self.logging_namespace, exec_cmd, [], task_vars)
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except KeyError:
+ msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n'
+ 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
+ raise OpenShiftCheckException(msg)
+ except ValueError:
+ msg = ('invalid response returned from Kibana request (Non-JSON output):\n'
+ 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
+ raise OpenShiftCheckException(msg)
+
+ if error_code != 404:
+ msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.'
+ raise OpenShiftCheckException(msg.format(error_code))
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid, task_vars):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid)
+ result = self.exec_oc(self.execute_module, self.logging_namespace, exec_cmd, [], task_vars)
+
+ try:
+ count = json.loads(result)["count"]
+ except KeyError:
+ msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}'
+ raise OpenShiftCheckException(msg.format(exec_cmd, result))
+ except ValueError:
+ msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}'
+ raise OpenShiftCheckException(msg.format(exec_cmd, result))
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
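
A minimal standalone sketch (not part of the patch) of the retry/deadline pattern wait_until_cmd_or_err uses, with the Elasticsearch query replaced by an illustrative stub.

import time

def wait_until(predicate, timeout_secs, interval=1):
    # Poll once per interval until predicate is truthy or the deadline passes.
    deadline = time.time() + timeout_secs
    while not predicate():
        if time.time() + interval > deadline:
            raise RuntimeError("timed out after {}s".format(timeout_secs))
        time.sleep(interval)

attempts = []  # stub standing in for query_es_from_es(); succeeds on the third poll
wait_until(lambda: attempts.append(1) or len(attempts) >= 3, timeout_secs=10)
print(len(attempts))  # 3
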
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 6a76bb93d..204752bd0 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -10,8 +10,8 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
tags = ["preflight"]
openshift_to_ovs_version = {
- "3.6": "2.6",
- "3.5": "2.6",
+ "3.6": ["2.6", "2.7"],
+ "3.5": ["2.6", "2.7"],
"3.4": "2.4",
}