diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
4 files changed, 208 insertions, 161 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 4588ed634..27e6fe383 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -1,8 +1,9 @@ # pylint: disable=missing-docstring from openshift_checks import OpenShiftCheck, get_var +from openshift_checks.mixins import DockerHostMixin -class DockerImageAvailability(OpenShiftCheck): +class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): """Check that required Docker images are available. This check attempts to ensure that required docker images are @@ -36,19 +37,11 @@ class DockerImageAvailability(OpenShiftCheck): def run(self, tmp, task_vars): msg, failed, changed = self.ensure_dependencies(task_vars) - - # exit early if Skopeo update fails if failed: - if "No package matching" in msg: - msg = "Ensure that all required dependencies can be installed via `yum`.\n" return { "failed": True, "changed": changed, - "msg": ( - "Unable to update or install required dependency packages on this host;\n" - "These are required in order to check Docker image availability:" - "\n {deps}\n{msg}" - ).format(deps=',\n '.join(self.dependencies), msg=msg), + "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg } required_images = self.required_images(task_vars) @@ -168,12 +161,3 @@ class DockerImageAvailability(OpenShiftCheck): args = {"_raw_params": cmd_str} result = self.module_executor("command", args, task_vars) return not result.get("failed", False) and result.get("rc", 0) == 0 - - # ensures that the skopeo and python-docker-py packages exist - # check is skipped on atomic installations - def ensure_dependencies(self, task_vars): - if get_var(task_vars, "openshift", "common", "is_atomic"): - return "", False, False - - result = 
class DockerStorage(DockerHostMixin, OpenShiftCheck):
    """Check Docker storage driver compatibility.

    This check ensures that Docker is using a supported storage driver,
    and that loopback is not being used (if using devicemapper).
    Also that storage usage is not above threshold.
    """

    name = "docker_storage"
    tags = ["pre-install", "health", "preflight"]

    # packages that ensure_dependencies() (from DockerHostMixin) installs before the check runs
    dependencies = ["python-docker-py"]
    # storage drivers accepted by this check
    storage_drivers = ["devicemapper", "overlay2"]
    # default usage limits; overridable through the task vars
    # "max_thinpool_data_usage_percent" and "max_thinpool_metadata_usage_percent"
    max_thinpool_data_usage_percent = 90.0
    max_thinpool_meta_usage_percent = 90.0

    # pylint: disable=too-many-return-statements
    # Reason: permanent stylistic exception;
    #         it is clearer to return on failures and there are just many ways to fail here.
    def run(self, tmp, task_vars):
        """Run the check.

        Returns a result dict that always carries "changed" and, on failure,
        "failed": True together with an explanatory "msg".
        """
        msg, failed, changed = self.ensure_dependencies(task_vars)
        if failed:
            return {
                "failed": True,
                "changed": changed,
                "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
            }

        # attempt to get the docker info hash from the API
        info = self.execute_module("docker_info", {}, task_vars)
        if info.get("failed"):
            return {"failed": True, "changed": changed,
                    "msg": "Failed to query Docker API. Is docker running on this host?"}
        if not info.get("info"):  # this would be very strange
            return {"failed": True, "changed": changed,
                    "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
        info = info["info"]

        # check if the storage driver we saw is valid
        driver = info.get("Driver", "[NONE]")
        if driver not in self.storage_drivers:
            msg = (
                "Detected unsupported Docker storage driver '{driver}'.\n"
                "Supported storage drivers are: {drivers}"
            ).format(driver=driver, drivers=', '.join(self.storage_drivers))
            return {"failed": True, "changed": changed, "msg": msg}

        # driver status info is a list of tuples; convert to dict and validate based on driver.
        # "or []" also covers a DriverStatus key that is present but null.
        driver_status = {item[0]: item[1] for item in info.get("DriverStatus") or []}
        if driver == "devicemapper":
            if driver_status.get("Data loop file"):
                msg = (
                    "Use of loopback devices with the Docker devicemapper storage driver\n"
                    "(the default storage configuration) is unsupported in production.\n"
                    "Please use docker-storage-setup to configure a backing storage volume.\n"
                    "See http://red.ht/2rNperO for further information."
                )
                return {"failed": True, "changed": changed, "msg": msg}
            result = self._check_dm_usage(driver_status, task_vars)
            result["changed"] = changed
            return result

        # TODO(lmeyer): determine how to check usage for overlay2

        return {"changed": changed}

    def _check_dm_usage(self, driver_status, task_vars):
        """
        Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
        implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
        devicemapper storage. The LV is "thin" because it does not use all available storage
        from its VG, instead expanding as needed; so to determine available space, we gather
        current usage as the Docker API reports for the driver as well as space available for
        expansion in the pool's VG.
        Usage within the LV is divided into pools allocated to data and metadata, either of which
        could run out of space first; so we check both.

        Returns a result dict: on success or threshold violation it is the dict of gathered
        values plus "msg" (and "failed" when a threshold is exceeded); when a value cannot be
        interpreted it is a smaller dict with "failed" and "msg".
        """
        vals = dict(
            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
            data_used=driver_status.get("Data Space Used"),
            data_total=driver_status.get("Data Space Total"),
            metadata_used=driver_status.get("Metadata Space Used"),
            metadata_total=driver_status.get("Metadata Space Total"),
        )

        # convert all human-readable strings to bytes
        # (iterate over a copy because the loop adds "<key>_bytes" entries to vals)
        for key, value in vals.copy().items():
            try:
                vals[key + "_bytes"] = self._convert_to_bytes(value)
            except ValueError as err:  # unlikely to hit this from API info, but just to be safe
                return {
                    "failed": True,
                    "values": vals,
                    "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
                }

        # determine the threshold percentages which usage should not exceed
        for name, default in [("data", self.max_thinpool_data_usage_percent),
                              ("metadata", self.max_thinpool_meta_usage_percent)]:
            percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
            try:
                vals[name + "_threshold"] = float(percent)
            except (TypeError, ValueError):  # TypeError: e.g. the variable was set to null or a list
                return {
                    "failed": True,
                    "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
                }

        # test whether the thresholds are exceeded
        messages = []
        for name in ["data", "metadata"]:
            # capacity = what the pool has now plus what it could still grow into
            capacity = vals[name + "_total_bytes"] + vals["vg_free_bytes"]
            if capacity <= 0:
                # avoid a ZeroDivisionError and report the nonsensical figures instead
                return {
                    "failed": True,
                    "values": vals,
                    "msg": "Docker thinpool {} capacity was reported as zero; "
                           "cannot compute usage percentage.".format(name)
                }
            vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / capacity
            if vals[name + "_pct_used"] > vals[name + "_threshold"]:
                messages.append(
                    "Docker thinpool {name} usage percentage {pct:.1f} "
                    "is higher than threshold {thresh:.1f}.".format(
                        name=name,
                        pct=vals[name + "_pct_used"],
                        thresh=vals[name + "_threshold"],
                    ))
                vals["failed"] = True

        vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
        return vals

    def _get_vg_free(self, pool, task_vars):
        """Return the free space string reported by /sbin/vgs for the VG backing `pool`.

        Raises OpenShiftCheckException when the pool name cannot be parsed, when
        /sbin/vgs fails, or when it prints nothing for the derived VG name.
        """
        # Determine which VG to examine according to the pool name, the only indicator currently
        # available from the Docker API driver info. We assume a name that looks like
        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
        # A missing pool name (None) is matched as "" so it is reported below rather than
        # crashing re.match with a TypeError.
        match = re.match(r'((?:[^-]|--)+)-(?!-)', pool or "")  # matches up to the first single hyphen
        if not match:  # unlikely, but... be clear if we assumed wrong
            raise OpenShiftCheckException(
                "This host's Docker reports it is using a storage pool named '{}'.\n"
                "However this name does not have the expected format of 'vgname-lvname'\n"
                "so the available storage in the VG cannot be determined.".format(pool)
            )
        vg_name = match.groups()[0].replace("--", "-")
        vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
        # should return free space like "  12.00g" if the VG exists; empty if it does not

        ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars)
        if ret.get("failed") or ret.get("rc", 0) != 0:
            raise OpenShiftCheckException(
                "Is LVM installed? Failed to run /sbin/vgs "
                "to determine docker storage usage:\n" + (ret.get("msg") or "")
            )
        size = ret.get("stdout", "").strip()
        if not size:
            raise OpenShiftCheckException(
                "This host's Docker reports it is using a storage pool named '{pool}',\n"
                "which we expect to come from local VG '{vg}'.\n"
                "However, /sbin/vgs did not find this VG. Is Docker for this host "
                "running and using the storage on the host?".format(pool=pool, vg=vg_name)
            )
        return size

    @staticmethod
    def _convert_to_bytes(string):
        """Convert a size string such as "10.3 G" or "12.00g" to a float number of bytes.

        Only the first character after the number is read as the unit (binary multiples:
        b, k, m, g, t, p; case-insensitive); no unit means bytes. Raises ValueError when
        the string does not start with a number or the unit is not recognized.
        """
        units = dict(
            b=1,
            k=1024,
            m=1024**2,
            g=1024**3,
            t=1024**4,
            p=1024**5,
        )
        string = string or ""
        # float followed by optional unit. Leading whitespace and a leading "<" are tolerated:
        # NOTE(review): newer LVM releases appear to prefix rounded sizes with "<" (e.g.
        # "<12.00g") -- confirm against the vgs versions in use.
        match = re.match(r'\s*<?\s*(\d+(?:\.\d+)?)\s*(\w)?', string)
        if not match:
            raise ValueError("Cannot convert to a byte size: " + string)

        number, unit = match.groups()
        multiplier = 1 if not unit else units.get(unit.lower())
        if not multiplier:
            raise ValueError("Cannot convert to a byte size: " + string)

        return float(number) * multiplier
a/roles/openshift_health_checker/openshift_checks/docker_storage_driver.py +++ /dev/null @@ -1,50 +0,0 @@ -# pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, get_var - - -class DockerStorageDriver(OpenShiftCheck): - """Check Docker storage driver compatibility. - - This check ensures that Docker is using a supported storage driver, - and that Loopback is not being used (if using devicemapper). - """ - - name = "docker_storage_driver" - tags = ["preflight"] - - storage_drivers = ["devicemapper", "overlay2"] - - @classmethod - def is_active(cls, task_vars): - """Skip non-containerized installations.""" - is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") - return super(DockerStorageDriver, cls).is_active(task_vars) and is_containerized - - def run(self, tmp, task_vars): - info = self.execute_module("docker_info", {}, task_vars).get("info", {}) - - if not self.is_supported_storage_driver(info): - msg = "Unsupported Docker storage driver detected. Supported storage drivers: {drivers}" - return {"failed": True, "msg": msg.format(drivers=', '.join(self.storage_drivers))} - - if self.is_using_loopback_device(info): - msg = "Use of loopback devices is discouraged. Try running Docker with `--storage-opt dm.thinpooldev`" - return {"failed": True, "msg": msg} - - return {} - - def is_supported_storage_driver(self, docker_info): - return docker_info.get("Driver", "") in self.storage_drivers - - @staticmethod - def is_using_loopback_device(docker_info): - # Loopback device usage is only an issue if using devicemapper. - # Skip this check if using any other storage driver. 
class NotContainerizedMixin(object):
    """Mixin for checks that are only active when not in containerized mode."""
    # permanent # pylint: disable=too-few-public-methods
    # Reason: The mixin is not intended to stand on its own as a class.

    @classmethod
    def is_active(cls, task_vars):
        """Only run on non-containerized hosts."""
        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
        return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized


class DockerHostMixin(object):
    """Mixin for checks that are only active on hosts that require Docker."""

    # package names ensure_dependencies() installs; checks using the mixin override this
    dependencies = []

    @classmethod
    def is_active(cls, task_vars):
        """Only run on hosts that depend on Docker."""
        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
        is_node = "nodes" in get_var(task_vars, "group_names", default=[])
        return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)

    def ensure_dependencies(self, task_vars):
        """
        Ensure that docker-related packages exist, but not on atomic hosts
        (which would not be able to install but should already have them).
        Returns: msg, failed, changed
        """
        if get_var(task_vars, "openshift", "common", "is_atomic"):
            return "", False, False

        if not self.dependencies:
            # nothing to install; do not invoke the package manager with an empty name list
            return "", False, False

        # NOTE: we would use the "package" module but it's actually an action plugin
        # and it's not clear how to invoke one of those. This is about the same anyway:
        pkg_manager = get_var(task_vars, "ansible_pkg_mgr", default="yum")
        result = self.module_executor(pkg_manager, {"name": self.dependencies, "state": "present"}, task_vars)
        msg = result.get("msg") or ""
        # decide failure once, so the explanatory message below is built for every
        # outcome that is reported to the caller as a failure (including rc != 0)
        failed = bool(result.get("failed")) or result.get("rc", 0) != 0
        if failed:
            if "No package matching" in msg:
                # name the package manager that was actually used rather than assuming yum
                msg = "Ensure that all required dependencies can be installed via `{}`.\n".format(pkg_manager)
            msg = (
                "Unable to install required packages on this host:\n"
                "    {deps}\n{msg}"
            ).format(deps=',\n    '.join(self.dependencies), msg=msg)
        return msg, failed, bool(result.get("changed"))