diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
17 files changed, 386 insertions, 118 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index 07ec6f7ef..b7b16e0ea 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -2,14 +2,18 @@ Health checks for OpenShift clusters. """ +import json import operator import os +import time +import collections from abc import ABCMeta, abstractmethod, abstractproperty from importlib import import_module from ansible.module_utils import six from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin +from ansible.module_utils.six import string_types from ansible.plugins.filter.core import to_bool as ansible_to_bool @@ -27,7 +31,7 @@ class OpenShiftCheckException(Exception): class OpenShiftCheckExceptionList(OpenShiftCheckException): - """A container for multiple logging errors that may be detected in one check.""" + """A container for multiple errors that may be detected in one check.""" def __init__(self, errors): self.errors = errors super(OpenShiftCheckExceptionList, self).__init__( @@ -40,26 +44,56 @@ class OpenShiftCheckExceptionList(OpenShiftCheckException): return self.errors[index] +FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename") + + +# pylint: disable=too-many-instance-attributes; all represent significantly different state. +# Arguably they could be separated into two hashes, one for storing parameters, and one for +# storing result state; but that smells more like clutter than clarity. @six.add_metaclass(ABCMeta) class OpenShiftCheck(object): - """ - A base class for defining checks for an OpenShift cluster environment. + """A base class for defining checks for an OpenShift cluster environment. - Expect optional params: method execute_module, dict task_vars, and string tmp. + Optional init params: method execute_module, dict task_vars, and string tmp execute_module is expected to have a signature compatible with _execute_module from ansible plugins/action/__init__.py, e.g.: def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args): This is stored so that it can be invoked in subclasses via check.execute_module("name", args) which provides the check's stored task_vars and tmp. - """ - def __init__(self, execute_module=None, task_vars=None, tmp=None): + Optional init param: want_full_results + If the check can gather logs, tarballs, etc., do so when True; but no need to spend + the time if they're not wanted (won't be written to output directory). + """ + # pylint: disable=too-many-arguments + def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False, + templar=None): + # store a method for executing ansible modules from the check self._execute_module = execute_module + # the task variables and tmpdir passed into the health checker task self.task_vars = task_vars or {} + # We may need to template some task_vars + self._templar = templar self.tmp = tmp + # a boolean for disabling the gathering of results (files, computations) that won't + # actually be recorded/used + self.want_full_results = want_full_results + + # mainly for testing purposes; see execute_module_with_retries + self._module_retries = 3 + self._module_retry_interval = 5 # seconds + # state to be recorded for inspection after the check runs: + # # set to True when the check changes the host, for accurate total "changed" count self.changed = False + # list of OpenShiftCheckException for check to report (alternative to returning a failed result) + self.failures = [] + # list of FileToSave - files the check specifies to be written locally if so configured + self.files_to_save = [] + # log messages for the check - tuples of (description, msg) where msg is serializable. + # These are intended to be a sequential record of what the check observed and determined. + self.logs = [] @abstractproperty def name(self): @@ -80,9 +114,20 @@ class OpenShiftCheck(object): """Returns true if this check applies to the ansible-playbook run.""" return True + def is_first_master(self): + """Determine if running on first master. Returns: bool""" + masters = self.get_var("groups", "oo_first_master", default=None) or [None] + return masters[0] == self.get_var("ansible_host") + @abstractmethod def run(self): - """Executes a check, normally implemented as a module.""" + """Executes a check against a host and returns a result hash similar to Ansible modules. + + Actually the direction ahead is to record state in the attributes and + not bother building a result hash. Instead, return an empty hash and let + the action plugin fill it in. Or raise an OpenShiftCheckException. + Returning a hash may become deprecated if it does not prove necessary. + """ return {} @classmethod @@ -94,7 +139,43 @@ class OpenShiftCheck(object): for subclass in subclass.subclasses(): yield subclass - def execute_module(self, module_name=None, module_args=None): + def register_failure(self, error): + """Record in the check that a failure occurred. + + Recorded failures are merged into the result hash for now. They are also saved to output directory + (if provided) <check>.failures.json and registered as a log entry for context <check>.log.json. + """ + # It should be an exception; make it one if not + if not isinstance(error, OpenShiftCheckException): + error = OpenShiftCheckException(str(error)) + self.failures.append(error) + # duplicate it in the logs so it can be seen in the context of any + # information that led to the failure + self.register_log("failure: " + error.name, str(error)) + + def register_log(self, context, msg): + """Record an entry for the check log. + + Notes are intended to serve as context of the whole sequence of what the check observed. + They are be saved as an ordered list in a local check log file. + They are not to included in the result or in the ansible log; it's just for the record. + """ + self.logs.append([context, msg]) + + def register_file(self, filename, contents=None, remote_filename=""): + """Record a file that a check makes available to be saved individually to output directory. + + Either file contents should be passed in, or a file to be copied from the remote host + should be specified. Contents that are not a string are to be serialized as JSON. + + NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning + you should avoid using this on huge files (more than say 10M). + """ + if contents is None and not remote_filename: + raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.") + self.files_to_save.append(FileToSave(filename, contents, remote_filename)) + + def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True): """Invoke an Ansible module from a check. Invoke stored _execute_module, normally copied from the action @@ -106,6 +187,12 @@ class OpenShiftCheck(object): Ansible version). So e.g. check.execute_module("foo", dict(arg1=...)) + + save_as_name specifies a file name for saving the result to an output directory, + if needed, and is intended to uniquely identify the result of invoking execute_module. + If not provided, the module name will be used. + If register is set False, then the result won't be registered in logs or files to save. + Return: result hash from module execution. """ if self._execute_module is None: @@ -113,7 +200,33 @@ class OpenShiftCheck(object): self.__class__.__name__ + " invoked execute_module without providing the method at initialization." ) - return self._execute_module(module_name, module_args, self.tmp, self.task_vars) + result = self._execute_module(module_name, module_args, self.tmp, self.task_vars) + if result.get("changed"): + self.changed = True + for output in ["result", "stdout"]: + # output is often JSON; attempt to decode + try: + result[output + "_json"] = json.loads(result[output]) + except (KeyError, ValueError): + pass + + if register: + self.register_log("execute_module: " + module_name, result) + self.register_file(save_as_name or module_name + ".json", result) + return result + + def execute_module_with_retries(self, module_name, module_args): + """Run execute_module and retry on failure.""" + result = {} + tries = 0 + while True: + res = self.execute_module(module_name, module_args) + if tries > self._module_retries or not res.get("failed"): + result.update(res) + return result + result["last_failed"] = res + tries += 1 + time.sleep(self._module_retry_interval) def get_var(self, *keys, **kwargs): """Get deeply nested values from task_vars. @@ -171,8 +284,23 @@ class OpenShiftCheck(object): 'There is a bug in this check. While trying to convert variable \n' ' "{var}={value}"\n' 'the given converter cannot be used or failed unexpectedly:\n' - '{error}'.format(var=".".join(keys), value=value, error=error) - ) + '{type}: {error}'.format( + var=".".join(keys), + value=value, + type=error.__class__.__name__, + error=error + )) + + @staticmethod + def normalize(name_list): + """Return a clean list of names. + + The input may be a comma-separated string or a sequence. Leading and + trailing whitespace characters are removed. Empty items are discarded. + """ + if isinstance(name_list, string_types): + name_list = name_list.split(',') + return [name.strip() for name in name_list if name.strip()] @staticmethod def get_major_minor_version(openshift_image_tag): @@ -214,7 +342,9 @@ class OpenShiftCheck(object): mount_point = os.path.dirname(mount_point) try: - return mount_for_path[mount_point] + mount = mount_for_path[mount_point] + self.register_log("mount point for " + path, mount) + return mount except KeyError: known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) raise OpenShiftCheckException( diff --git a/roles/openshift_health_checker/openshift_checks/diagnostics.py b/roles/openshift_health_checker/openshift_checks/diagnostics.py new file mode 100644 index 000000000..1cfdc1129 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/diagnostics.py @@ -0,0 +1,62 @@ +""" +A check to run relevant diagnostics via `oc adm diagnostics`. +""" + +import os + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +DIAGNOSTIC_LIST = ( + "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles " + "ClusterRouter DiagnosticPod NetworkCheck" +).split() + + +class DiagnosticCheck(OpenShiftCheck): + """A check to run relevant diagnostics via `oc adm diagnostics`.""" + + name = "diagnostics" + tags = ["health"] + + def is_active(self): + return super(DiagnosticCheck, self).is_active() and self.is_first_master() + + def run(self): + if self.exec_diagnostic("ConfigContexts"): + # only run the other diagnostics if that one succeeds (otherwise, all will fail) + diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST) + for diagnostic in self.normalize(diagnostics): + self.exec_diagnostic(diagnostic) + return {} + + def exec_diagnostic(self, diagnostic): + """ + Execute an 'oc adm diagnostics' command on the remote host. + Raises OcNotFound or registers OcDiagFailed. + Returns True on success or False on failure (non-zero rc). + """ + config_base = self.get_var("openshift.common.config_base") + args = { + "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), + "cmd": "adm diagnostics", + "extra_args": [diagnostic], + } + + result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json") + self.register_file(diagnostic + ".txt", result['result']) + if result.get("failed"): + if result['result'] == '[Errno 2] No such file or directory': + raise OpenShiftCheckException( + "OcNotFound", + "This host is supposed to be a master but does not have the `oc` command where expected.\n" + "Has an installation been run on this host yet?" + ) + + self.register_failure(OpenShiftCheckException( + 'OcDiagFailed', + 'The {diag} diagnostic reported an error:\n' + '{error}'.format(diag=diagnostic, error=result['result']) + )) + return False + return True diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py index 6d1dea9ce..87e6146d4 100644 --- a/roles/openshift_health_checker/openshift_checks/disk_availability.py +++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py @@ -1,6 +1,7 @@ """Check that there is enough disk space in predefined paths.""" import tempfile +import os.path from openshift_checks import OpenShiftCheck, OpenShiftCheckException @@ -15,31 +16,31 @@ class DiskAvailability(OpenShiftCheck): # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements recommended_disk_space_bytes = { '/var': { - 'masters': 40 * 10**9, - 'nodes': 15 * 10**9, - 'etcd': 20 * 10**9, + 'oo_masters_to_config': 40 * 10**9, + 'oo_nodes_to_config': 15 * 10**9, + 'oo_etcd_to_config': 20 * 10**9, }, # Used to copy client binaries into, # see roles/openshift_cli/library/openshift_container_binary_sync.py. '/usr/local/bin': { - 'masters': 1 * 10**9, - 'nodes': 1 * 10**9, - 'etcd': 1 * 10**9, + 'oo_masters_to_config': 1 * 10**9, + 'oo_nodes_to_config': 1 * 10**9, + 'oo_etcd_to_config': 1 * 10**9, }, # Used as temporary storage in several cases. tempfile.gettempdir(): { - 'masters': 1 * 10**9, - 'nodes': 1 * 10**9, - 'etcd': 1 * 10**9, + 'oo_masters_to_config': 1 * 10**9, + 'oo_nodes_to_config': 1 * 10**9, + 'oo_etcd_to_config': 1 * 10**9, }, } # recommended disk space for each location under an upgrade context recommended_disk_upgrade_bytes = { '/var': { - 'masters': 10 * 10**9, - 'nodes': 5 * 10 ** 9, - 'etcd': 5 * 10 ** 9, + 'oo_masters_to_config': 10 * 10**9, + 'oo_nodes_to_config': 5 * 10 ** 9, + 'oo_etcd_to_config': 5 * 10 ** 9, }, } @@ -61,15 +62,19 @@ class DiskAvailability(OpenShiftCheck): number = float(user_config) user_config = { '/var': { - 'masters': number, - 'nodes': number, - 'etcd': number, + 'oo_masters_to_config': number, + 'oo_nodes_to_config': number, + 'oo_etcd_to_config': number, }, } except TypeError: # If it is not a number, then it should be a nested dict. pass + self.register_log("recommended thresholds", self.recommended_disk_space_bytes) + if user_config: + self.register_log("user-configured thresholds", user_config) + # TODO: as suggested in # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021, # maybe we could support checking disk availability in paths that are @@ -113,21 +118,25 @@ class DiskAvailability(OpenShiftCheck): 'in your Ansible inventory, and lower the recommended disk space availability\n' 'if necessary for this upgrade.').format(config_bytes) - return { - 'failed': True, - 'msg': ( - 'Available disk space in "{}" ({:.1f} GB) ' - 'is below minimum recommended ({:.1f} GB)' - ).format(path, free_gb, recommended_gb) - } + self.register_failure(msg) return {} + def find_ansible_submounts(self, path): + """Return a list of ansible_mounts that are below the given path.""" + base = os.path.join(path, "") + return [ + mount + for mount in self.get_var("ansible_mounts") + if mount["mount"].startswith(base) + ] + def free_bytes(self, path): """Return the size available in path based on ansible_mounts.""" + submounts = sum(mnt.get('size_available', 0) for mnt in self.find_ansible_submounts(path)) mount = self.find_ansible_mount(path) try: - return mount['size_available'] + return mount['size_available'] + submounts except KeyError: raise OpenShiftCheckException( 'Unable to retrieve disk availability for "{path}".\n' diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 85a922f86..5beb20503 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -1,5 +1,7 @@ """Check that required Docker images are available.""" +from pipes import quote +from ansible.module_utils import six from openshift_checks import OpenShiftCheck from openshift_checks.mixins import DockerHostMixin @@ -32,6 +34,46 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): # we use python-docker-py to check local docker for images, and skopeo # to look for images available remotely without waiting to pull them. dependencies = ["python-docker-py", "skopeo"] + # command for checking if remote registries have an image, without docker pull + skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}" + skopeo_example_command = "skopeo inspect [--tls-verify=false] [--creds=<user>:<pass>] docker://<registry>/<image>" + + def __init__(self, *args, **kwargs): + super(DockerImageAvailability, self).__init__(*args, **kwargs) + + self.registries = dict( + # set of registries that need to be checked insecurely (note: not accounting for CIDR entries) + insecure=set(self.ensure_list("openshift_docker_insecure_registries")), + # set of registries that should never be queried even if given in the image + blocked=set(self.ensure_list("openshift_docker_blocked_registries")), + ) + + # ordered list of registries (according to inventory vars) that docker will try for unscoped images + regs = self.ensure_list("openshift_docker_additional_registries") + # currently one of these registries is added whether the user wants it or not. + deployment_type = self.get_var("openshift_deployment_type") + if deployment_type == "origin" and "docker.io" not in regs: + regs.append("docker.io") + elif deployment_type == 'openshift-enterprise' and "registry.access.redhat.com" not in regs: + regs.append("registry.access.redhat.com") + self.registries["configured"] = regs + + # for the oreg_url registry there may be credentials specified + components = self.get_var("oreg_url", default="").split('/') + self.registries["oreg"] = "" if len(components) < 3 else components[0] + + # Retrieve and template registry credentials, if provided + self.skopeo_command_creds = "" + oreg_auth_user = self.get_var('oreg_auth_user', default='') + oreg_auth_password = self.get_var('oreg_auth_password', default='') + if oreg_auth_user != '' and oreg_auth_password != '': + if self._templar is not None: + oreg_auth_user = self._templar.template(oreg_auth_user) + oreg_auth_password = self._templar.template(oreg_auth_password) + self.skopeo_command_creds = "--creds={}:{}".format(quote(oreg_auth_user), quote(oreg_auth_password)) + + # record whether we could reach a registry or not (and remember results) + self.reachable_registries = {} def is_active(self): """Skip hosts with unsupported deployment types.""" @@ -55,21 +97,28 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): if not missing_images: return {} - registries = self.known_docker_registries() - if not registries: - return {"failed": True, "msg": "Unable to retrieve any docker registries."} - - available_images = self.available_images(missing_images, registries) + available_images = self.available_images(missing_images) unavailable_images = set(missing_images) - set(available_images) if unavailable_images: - return { - "failed": True, - "msg": ( - "One or more required Docker images are not available:\n {}\n" - "Configured registries: {}" - ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)), - } + unreachable = [reg for reg, reachable in self.reachable_registries.items() if not reachable] + unreachable_msg = "Failed connecting to: {}\n".format(", ".join(unreachable)) + blocked_msg = "Blocked registries: {}\n".format(", ".join(self.registries["blocked"])) + msg = ( + "One or more required container images are not available:\n {missing}\n" + "Checked with: {cmd}\n" + "Default registries searched: {registries}\n" + "{blocked}" + "{unreachable}" + ).format( + missing=",\n ".join(sorted(unavailable_images)), + cmd=self.skopeo_example_command, + registries=", ".join(self.registries["configured"]), + blocked=blocked_msg if self.registries["blocked"] else "", + unreachable=unreachable_msg if unreachable else "", + ) + + return dict(failed=True, msg=msg) return {} @@ -95,13 +144,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): # containerized etcd may not have openshift_image_tag, see bz 1466622 image_tag = self.get_var("openshift_image_tag", default="latest") image_info = DEPLOYMENT_IMAGE_INFO[deployment_type] - if not image_info: - return required # template for images that run on top of OpenShift image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}") image_url = self.get_var("oreg_url", default="") or image_url - if 'nodes' in host_groups: + if 'oo_nodes_to_config' in host_groups: for suffix in NODE_IMAGE_SUFFIXES: required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag)) # The registry-console is for some reason not prefixed with ose- like the other components. @@ -112,65 +159,87 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): # images for containerized components if self.get_var("openshift", "common", "is_containerized"): components = set() - if 'nodes' in host_groups: + if 'oo_nodes_to_config' in host_groups: components.update(["node", "openvswitch"]) - if 'masters' in host_groups: # name is "origin" or "ose" + if 'oo_masters_to_config' in host_groups: # name is "origin" or "ose" components.add(image_info["name"]) for component in components: required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag)) - if 'etcd' in host_groups: # special case, note it is the same for origin/enterprise + if 'oo_etcd_to_config' in host_groups: # special case, note it is the same for origin/enterprise required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag return required def local_images(self, images): """Filter a list of images and return those available locally.""" - return [ - image for image in images - if self.is_image_local(image) - ] + found_images = [] + for image in images: + # docker could have the image name as-is or prefixed with any registry + imglist = [image] + [reg + "/" + image for reg in self.registries["configured"]] + if self.is_image_local(imglist): + found_images.append(image) + return found_images def is_image_local(self, image): """Check if image is already in local docker index.""" result = self.execute_module("docker_image_facts", {"name": image}) - if result.get("failed", False): - return False - - return bool(result.get("images", [])) - - def known_docker_registries(self): - """Build a list of docker registries available according to inventory vars.""" - docker_facts = self.get_var("openshift", "docker") - regs = set(docker_facts["additional_registries"]) - - deployment_type = self.get_var("openshift_deployment_type") - if deployment_type == "origin": - regs.update(["docker.io"]) - elif "enterprise" in deployment_type: - regs.update(["registry.access.redhat.com"]) - - return list(regs) - - def available_images(self, images, default_registries): + return bool(result.get("images")) and not result.get("failed") + + def ensure_list(self, registry_param): + """Return the task var as a list.""" + # https://bugzilla.redhat.com/show_bug.cgi?id=1497274 + # If the result was a string type, place it into a list. We must do this + # as using list() on a string will split the string into its characters. + # Otherwise cast to a list as was done previously. + registry = self.get_var(registry_param, default=[]) + if not isinstance(registry, six.string_types): + return list(registry) + return self.normalize(registry) + + def available_images(self, images): """Search remotely for images. Returns: list of images found.""" return [ image for image in images - if self.is_available_skopeo_image(image, default_registries) + if self.is_available_skopeo_image(image) ] - def is_available_skopeo_image(self, image, default_registries): + def is_available_skopeo_image(self, image): """Use Skopeo to determine if required image exists in known registry(s).""" - registries = default_registries - - # if image already includes a registry, only use that + registries = self.registries["configured"] + # If image already includes a registry, only use that. + # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g. + # registry.access.redhat.com/rhel7 as if the registry were a namespace. + # It's not clear that there's any way to distinguish them, but fortunately + # the current set of images all look like [registry/]namespace/name[:version]. if image.count("/") > 1: registry, image = image.split("/", 1) registries = [registry] for registry in registries: - args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)} - result = self.execute_module("command", args) + if registry in self.registries["blocked"]: + continue # blocked will never be consulted + if registry not in self.reachable_registries: + self.reachable_registries[registry] = self.connect_to_registry(registry) + if not self.reachable_registries[registry]: + continue # do not keep trying unreachable registries + + args = dict(registry=registry, image=image) + args["tls"] = "false" if registry in self.registries["insecure"] else "true" + args["creds"] = self.skopeo_command_creds if registry == self.registries["oreg"] else "" + + result = self.execute_module_with_retries("command", {"_raw_params": self.skopeo_command.format(**args)}) if result.get("rc", 0) == 0 and not result.get("failed"): return True + if result.get("rc") == 124: # RC 124 == timed out; mark unreachable + self.reachable_registries[registry] = False return False + + def connect_to_registry(self, registry): + """Use ansible wait_for module to test connectivity from host to registry. Returns bool.""" + # test a simple TCP connection + host, _, port = registry.partition(":") + port = port or 443 + args = dict(host=host, port=port, state="started", timeout=30) + result = self.execute_module("wait_for", args) + return result.get("rc", 0) == 0 and not result.get("failed") diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py index 0558ddf14..6808d8b2f 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_storage.py +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -14,7 +14,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): """ name = "docker_storage" - tags = ["pre-install", "health", "preflight"] + tags = ["health", "preflight"] dependencies = ["python-docker-py"] storage_drivers = ["devicemapper", "overlay", "overlay2"] diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py index b4c8957e9..8b20ccb49 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py @@ -12,7 +12,7 @@ class EtcdTraffic(OpenShiftCheck): def is_active(self): """Skip hosts that do not have etcd in their group names.""" group_names = self.get_var("group_names", default=[]) - valid_group_names = "etcd" in group_names + valid_group_names = "oo_etcd_to_config" in group_names version = self.get_major_minor_version(self.get_var("openshift_image_tag")) valid_version = version in ((3, 4), (3, 5)) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index e5d93ff3f..3d75da6f9 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -15,8 +15,12 @@ class EtcdVolume(OpenShiftCheck): etcd_mount_path = "/var/lib/etcd" def is_active(self): - etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or [] - is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts + etcd_hosts = ( + self.get_var("groups", "oo_etcd_to_config", default=[]) or + self.get_var("groups", "oo_masters_to_config", default=[]) or + [] + ) + is_etcd_host = self.get_var("ansible_host") in etcd_hosts return super(EtcdVolume, self).is_active() and is_etcd_host def run(self): diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index 7fc843fd7..986a01f38 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -72,7 +72,7 @@ class Elasticsearch(LoggingCheck): for pod_name in pods_by_name.keys(): # Compare what each ES node reports as master and compare for split brain get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") - master_name_str = self.exec_oc(get_master_cmd, []) + master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json") master_names = (master_name_str or '').split(' ') if len(master_names) > 1: es_master_names.add(master_names[1]) @@ -113,7 +113,7 @@ class Elasticsearch(LoggingCheck): # get ES cluster nodes node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') - cluster_node_data = self.exec_oc(node_cmd, []) + cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json") try: cluster_nodes = json.loads(cluster_node_data)['nodes'] except (ValueError, KeyError): @@ -142,7 +142,7 @@ class Elasticsearch(LoggingCheck): errors = [] for pod_name in pods_by_name.keys(): cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') - cluster_health_data = self.exec_oc(cluster_health_cmd, []) + cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json') try: health_res = json.loads(cluster_health_data) if not health_res or not health_res.get('status'): @@ -171,7 +171,7 @@ class Elasticsearch(LoggingCheck): errors = [] for pod_name in pods_by_name.keys(): df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) - disk_output = self.exec_oc(df_cmd, []) + disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json') lines = disk_output.splitlines() # expecting one header looking like 'IUse% Use%' and one body line body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py index d783e6760..e93cc9028 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -46,7 +46,7 @@ class FluentdConfig(LoggingCheck): # if check is running on a master, retrieve all running pods # and check any pod's container for the env var "USE_JOURNAL" group_names = self.get_var("group_names") - if "masters" in group_names: + if "oo_masters_to_config" in group_names: use_journald = self.check_fluentd_env_var() docker_info = self.execute_module("docker_info", {}) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index ecd8adb64..05ba73ca1 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -30,14 +30,6 @@ class LoggingCheck(OpenShiftCheck): logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False) return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master() - def is_first_master(self): - """Determine if running on first master. Returns: bool""" - # Note: It would be nice to use membership in oo_first_master group, however for now it - # seems best to avoid requiring that setup and just check this is the first master. - hostname = self.get_var("ansible_ssh_host") or [None] - masters = self.get_var("groups", "masters", default=None) or [None] - return masters[0] == hostname - def run(self): return {} @@ -78,7 +70,7 @@ class LoggingCheck(OpenShiftCheck): """Returns the namespace in which logging is configured to deploy.""" return self.get_var("openshift_logging_namespace", default="logging") - def exec_oc(self, cmd_str="", extra_args=None): + def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None): """ Execute an 'oc' command in the remote host. Returns: output of command and namespace, @@ -92,7 +84,7 @@ class LoggingCheck(OpenShiftCheck): "extra_args": list(extra_args) if extra_args else [], } - result = self.execute_module("ocutil", args) + result = self.execute_module("ocutil", args, save_as_name=save_as_name) if result.get("failed"): if result['result'] == '[Errno 2] No such file or directory': raise CouldNotUseOc( diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py index d781db649..cacdf4213 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -104,7 +104,7 @@ class LoggingIndexTime(LoggingCheck): "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" ) exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) - result = self.exec_oc(exec_cmd, []) + result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json") try: count = json.loads(result)["count"] diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py index 765ba072d..e7a8ec976 100644 --- a/roles/openshift_health_checker/openshift_checks/memory_availability.py +++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py @@ -14,9 +14,9 @@ class MemoryAvailability(OpenShiftCheck): # Values taken from the official installation documentation: # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements recommended_memory_bytes = { - "masters": 16 * GIB, - "nodes": 8 * GIB, - "etcd": 8 * GIB, + "oo_masters_to_config": 16 * GIB, + "oo_nodes_to_config": 8 * GIB, + "oo_etcd_to_config": 8 * GIB, } # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal memtotal_adjustment = 1 * GIB diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index e9bae60a3..cfbdea303 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -21,9 +21,11 @@ class DockerHostMixin(object): def is_active(self): """Only run on hosts that depend on Docker.""" - is_containerized = self.get_var("openshift", "common", "is_containerized") - is_node = "nodes" in self.get_var("group_names", default=[]) - return super(DockerHostMixin, self).is_active() and (is_containerized or is_node) + group_names = set(self.get_var("group_names", default=[])) + needs_docker = set(["oo_nodes_to_config"]) + if self.get_var("openshift.common.is_containerized"): + needs_docker.update(["oo_masters_to_config", "oo_etcd_to_config"]) + return super(DockerHostMixin, self).is_active() and bool(group_names.intersection(needs_docker)) def ensure_dependencies(self): """ @@ -36,7 +38,7 @@ class DockerHostMixin(object): # NOTE: we would use the "package" module but it's actually an action plugin # and it's not clear how to invoke one of those. This is about the same anyway: - result = self.execute_module( + result = self.execute_module_with_retries( self.get_var("ansible_pkg_mgr", default="yum"), {"name": self.dependencies, "state": "present"}, ) @@ -49,5 +51,4 @@ class DockerHostMixin(object): " {deps}\n{msg}" ).format(deps=',\n '.join(self.dependencies), msg=msg) failed = result.get("failed", False) or result.get("rc", 0) != 0 - self.changed = result.get("changed", False) return msg, failed diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py index 363c12def..416805c4d 100644 --- a/roles/openshift_health_checker/openshift_checks/ovs_version.py +++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py @@ -24,7 +24,7 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck): def is_active(self): """Skip hosts that do not have package requirements.""" group_names = self.get_var("group_names", default=[]) - master_or_node = 'masters' in group_names or 'nodes' in group_names + master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names return super(OvsVersion, self).is_active() and master_or_node def run(self): diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py index a86180b00..090e438ff 100644 --- a/roles/openshift_health_checker/openshift_checks/package_availability.py +++ b/roles/openshift_health_checker/openshift_checks/package_availability.py @@ -20,13 +20,13 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): packages = set() - if "masters" in group_names: + if "oo_masters_to_config" in group_names: packages.update(self.master_packages(rpm_prefix)) - if "nodes" in group_names: + if "oo_nodes_to_config" in group_names: packages.update(self.node_packages(rpm_prefix)) args = {"packages": sorted(set(packages))} - return self.execute_module("check_yum_update", args) + return self.execute_module_with_retries("check_yum_update", args) @staticmethod def master_packages(rpm_prefix): diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py index 1e9aecbe0..8464e8a5e 100644 --- a/roles/openshift_health_checker/openshift_checks/package_update.py +++ b/roles/openshift_health_checker/openshift_checks/package_update.py @@ -11,4 +11,4 @@ class PackageUpdate(NotContainerizedMixin, OpenShiftCheck): def run(self): args = {"packages": []} - return self.execute_module("check_yum_update", args) + return self.execute_module_with_retries("check_yum_update", args) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py index 8b780114f..2f09b22fc 100644 --- a/roles/openshift_health_checker/openshift_checks/package_version.py +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -36,7 +36,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): def is_active(self): """Skip hosts that do not have package requirements.""" group_names = self.get_var("group_names", default=[]) - master_or_node = 'masters' in group_names or 'nodes' in group_names + master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names return super(PackageVersion, self).is_active() and master_or_node def run(self): @@ -46,6 +46,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): check_multi_minor_release = deployment_type in ['openshift-enterprise'] args = { + "package_mgr": self.get_var("ansible_pkg_mgr"), "package_list": [ { "name": "openvswitch", @@ -75,7 +76,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): ], } - return self.execute_module("aos_version", args) + return self.execute_module_with_retries("aos_version", args) def get_required_ovs_version(self): """Return the correct Open vSwitch version(s) for the current OpenShift version.""" |