From 06a6fb9642a2cc70b1ca65f403b853fe8ce9d4b2 Mon Sep 17 00:00:00 2001 From: Luke Meyer Date: Thu, 20 Jul 2017 23:39:47 -0400 Subject: openshift_checks: refactor logging checks Turn failure messages into exceptions that tests can look for without depending on text meant for humans. Turn logging_namespace property into a method. Get rid of _exec_oc and just use logging.exec_oc. --- .../openshift_checks/logging/curator.py | 32 ++-- .../openshift_checks/logging/elasticsearch.py | 166 ++++++++-------- .../openshift_checks/logging/fluentd.py | 191 +++++++++---------- .../openshift_checks/logging/fluentd_config.py | 17 +- .../openshift_checks/logging/kibana.py | 208 +++++++++++---------- .../openshift_checks/logging/logging.py | 54 ++++-- .../openshift_checks/logging/logging_index_time.py | 75 ++++---- 7 files changed, 375 insertions(+), 368 deletions(-) (limited to 'roles/openshift_health_checker/openshift_checks/logging') diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index 32a92c909..b27f97172 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -1,6 +1,6 @@ """Check for an aggregated logging Curator deployment""" -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck class Curator(LoggingCheck): @@ -12,27 +12,17 @@ class Curator(LoggingCheck): def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - curator_pods, error = self.get_pods_for_component( - self.logging_namespace, - "curator", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_curator(curator_pods) - - if check_error: - msg = ("The following Curator deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + curator_pods = self.get_pods_for_component("curator") + self.check_curator(curator_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Curator deployment.'} + + return {} def check_curator(self, pods): """Check to see if curator is up and working. Returns: error string""" if not pods: - return ( + raise OpenShiftCheckException( + "MissingComponentPods", "There are no Curator pods for the logging stack,\n" "so nothing will prune Elasticsearch indexes.\n" "Is Curator correctly deployed?" @@ -40,14 +30,14 @@ class Curator(LoggingCheck): not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return ( + raise OpenShiftCheckException( + "CuratorNotRunning", "The Curator pod is not currently in a running state,\n" "so Elasticsearch indexes may increase without bound." ) if len(pods) - len(not_running) > 1: - return ( + raise OpenShiftCheckException( + "TooManyCurators", "There is more than one Curator pod running. This should not normally happen.\n" "Although this doesn't cause any problems, you may want to investigate." ) - - return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index b2e9a8f49..7fc843fd7 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -3,6 +3,7 @@ import json import re +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck @@ -15,168 +16,178 @@ class Elasticsearch(LoggingCheck): def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - es_pods, error = self.get_pods_for_component( - self.logging_namespace, - "es", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_elasticsearch(es_pods) - - if check_error: - msg = ("The following Elasticsearch deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + es_pods = self.get_pods_for_component("es") + self.check_elasticsearch(es_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Elasticsearch deployment.'} - def _not_running_elasticsearch_pods(self, es_pods): - """Returns: list of pods that are not running, list of errors about non-running pods""" - not_running = self.not_running_pods(es_pods) - if not_running: - return not_running, [( - 'The following Elasticsearch pods are not running:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - ))] - return not_running, [] + return {} def check_elasticsearch(self, es_pods): - """Various checks for elasticsearch. Returns: error string""" - not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) - running_pods = [pod for pod in es_pods if pod not in not_running_pods] + """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors.""" + running_pods, errors = self.running_elasticsearch_pods(es_pods) pods_by_name = { pod['metadata']['name']: pod for pod in running_pods # Filter out pods that are not members of a DC if pod['metadata'].get('labels', {}).get('deploymentconfig') } if not pods_by_name: - return 'No logging Elasticsearch pods were found. Is logging deployed?' - error_msgs += self._check_elasticsearch_masters(pods_by_name) - error_msgs += self._check_elasticsearch_node_list(pods_by_name) - error_msgs += self._check_es_cluster_health(pods_by_name) - error_msgs += self._check_elasticsearch_diskspace(pods_by_name) - return '\n'.join(error_msgs) + # nothing running, cannot run the rest of the check + errors.append(OpenShiftCheckException( + 'NoRunningPods', + 'No logging Elasticsearch pods were found running, so no logs are being aggregated.' + )) + raise OpenShiftCheckExceptionList(errors) + + errors += self.check_elasticsearch_masters(pods_by_name) + errors += self.check_elasticsearch_node_list(pods_by_name) + errors += self.check_es_cluster_health(pods_by_name) + errors += self.check_elasticsearch_diskspace(pods_by_name) + if errors: + raise OpenShiftCheckExceptionList(errors) + + def running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = self.not_running_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running] + if not_running: + return running_pods, [OpenShiftCheckException( + 'PodNotRunning', + 'The following Elasticsearch pods are defined but not running:\n' + '{pods}'.format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + )] + return running_pods, [] @staticmethod def _build_es_curl_cmd(pod_name, url): base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) - def _check_elasticsearch_masters(self, pods_by_name): - """Check that Elasticsearch masters are sane. Returns: list of error strings""" + def check_elasticsearch_masters(self, pods_by_name): + """Check that Elasticsearch masters are sane. Returns: list of errors""" es_master_names = set() - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): # Compare what each ES node reports as master and compare for split brain get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") - master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, []) + master_name_str = self.exec_oc(get_master_cmd, []) master_names = (master_name_str or '').split(' ') if len(master_names) > 1: es_master_names.add(master_names[1]) else: - error_msgs.append( - 'No master? Elasticsearch {pod} returned bad string when asked master name:\n' + errors.append(OpenShiftCheckException( + 'NoMasterName', + 'Elasticsearch {pod} gave unexpected response when asked master name:\n' ' {response}'.format(pod=pod_name, response=master_name_str) - ) + )) if not es_master_names: - error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?') - return '\n'.join(error_msgs) + errors.append(OpenShiftCheckException( + 'NoMasterFound', + 'No logging Elasticsearch masters were found.' + )) + return errors if len(es_master_names) > 1: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'SplitBrainMasters', 'Found multiple Elasticsearch masters according to the pods:\n' '{master_list}\n' 'This implies that the masters have "split brain" and are not correctly\n' 'replicating data for the logging cluster. Log loss is likely to occur.' .format(master_list='\n'.join(' ' + master for master in es_master_names)) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_node_list(self, pods_by_name): - """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" + def check_elasticsearch_node_list(self, pods_by_name): + """Check that reported ES masters are accounted for by pods. Returns: list of errors""" if not pods_by_name: - return ['No logging Elasticsearch masters were found. Is logging deployed?'] + return [OpenShiftCheckException( + 'MissingComponentPods', + 'No logging Elasticsearch pods were found.' + )] # get ES cluster nodes node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') - cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, []) + cluster_node_data = self.exec_oc(node_cmd, []) try: cluster_nodes = json.loads(cluster_node_data)['nodes'] except (ValueError, KeyError): - return [ + return [OpenShiftCheckException( + 'MissingNodeList', 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + cluster_node_data - ] + )] # Try to match all ES-reported node hosts to known pods. - error_msgs = [] + errors = [] for node in cluster_nodes.values(): # Note that with 1.4/3.4 the pod IP may be used as the master name if not any(node['host'] in (pod_name, pod['status'].get('podIP')) for pod_name, pod in pods_by_name.items()): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsPodNodeMismatch', 'The Elasticsearch cluster reports a member node "{node}"\n' 'that does not correspond to any known ES pod.'.format(node=node['host']) - ) + )) - return error_msgs + return errors - def _check_es_cluster_health(self, pods_by_name): + def check_es_cluster_health(self, pods_by_name): """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') - cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, []) + cluster_health_data = self.exec_oc(cluster_health_cmd, []) try: health_res = json.loads(cluster_health_data) if not health_res or not health_res.get('status'): raise ValueError() except ValueError: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadEsResponse', 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) - ) + )) continue if health_res['status'] not in ['green', 'yellow']: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsClusterHealthRed', 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_diskspace(self, pods_by_name): + def check_elasticsearch_diskspace(self, pods_by_name): """ Exec into an ES pod and query the diskspace on the persistent volume. Returns: list of errors """ - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) - disk_output = self.exec_oc(self.logging_namespace, df_cmd, []) + disk_output = self.exec_oc(df_cmd, []) lines = disk_output.splitlines() # expecting one header looking like 'IUse% Use%' and one body line body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadDfResponse', 'Could not retrieve storage usage from logging ES pod "{pod}".\n' 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) - ) + )) continue inode_pct, disk_pct = re.match(body_re, lines[1]).groups() inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90') if int(inode_pct) >= int(inode_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'InodeUsageTooHigh', 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -184,10 +195,11 @@ class Elasticsearch(LoggingCheck): pct=str(inode_pct), limit=str(inode_pct_thresh), param='openshift_check_efk_es_inode_pct', - )) + ))) disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80') if int(disk_pct) >= int(disk_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'DiskUsageTooHigh', 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -195,6 +207,6 @@ class Elasticsearch(LoggingCheck): pct=str(disk_pct), limit=str(disk_pct_thresh), param='openshift_check_efk_es_storage_pct', - )) + ))) - return error_msgs + return errors diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index 69c7b4392..3b192a281 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -2,6 +2,8 @@ import json + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck @@ -12,57 +14,96 @@ class Fluentd(LoggingCheck): tags = ["health", "logging"] def run(self): - """Check various things and gather errors. Returns: result as hash""" + """Check the Fluentd deployment and raise an error if any problems are found.""" + + fluentd_pods = self.get_pods_for_component("fluentd") + self.check_fluentd(fluentd_pods) + return {} + + def check_fluentd(self, pods): + """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found.""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - fluentd_pods, error = super(Fluentd, self).get_pods_for_component( - self.logging_namespace, - "fluentd", + node_selector = self.get_var( + 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true' ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_fluentd(fluentd_pods) - if check_error: - msg = ("The following Fluentd deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} + nodes_by_name = self.get_nodes_by_name() + fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector) - # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Fluentd deployment.'} + errors = [] + errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) + errors += self.check_nodes_have_fluentd(pods, fluentd_nodes) + errors += self.check_fluentd_pods_running(pods) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + errors.append(OpenShiftCheckException( + 'TooManyFluentdPods', + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + )) + + if errors: + raise OpenShiftCheckExceptionList(errors) + + def get_nodes_by_name(self): + """Retrieve all the node definitions. Returns: dict(name: node)""" + nodes_json = self.exec_oc("get nodes -o json", []) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + raise OpenShiftCheckException( + "BadOcNodeList", + "Could not obtain a list of nodes to validate fluentd.\n" + "Output from oc get:\n" + nodes_json + ) + if not nodes or not nodes.get('items'): # also should not happen + raise OpenShiftCheckException( + "NoNodesDefined", + "No nodes appear to be defined according to the API." + ) + return { + node['metadata']['name']: node + for node in nodes['items'] + } @staticmethod - def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): - """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" + def filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node)""" label, value = node_selector.split('=', 1) fluentd_nodes = { name: node for name, node in nodes_by_name.items() if node['metadata']['labels'].get(label) == value } if not fluentd_nodes: - return None, ( + raise OpenShiftCheckException( + 'NoNodesLabeled', 'There are no nodes with the fluentd label {label}.\n' - 'This means no logs will be aggregated from the nodes.' - ).format(label=node_selector) - return fluentd_nodes, None + 'This means no logs will be aggregated from the nodes.'.format(label=node_selector) + ) + return fluentd_nodes - def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): - """Note if nodes are not labeled as expected. Returns: error string""" + def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): + """Note if nodes are not labeled as expected. Returns: error list""" intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all']) if not intended_nodes or '--all' in intended_nodes: intended_nodes = nodes_by_name.keys() nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) if nodes_missing_labels: - return ( + return [OpenShiftCheckException( + 'NodesUnlabeled', 'The following nodes are supposed to be labeled with {label} but are not:\n' ' {nodes}\n' - 'Fluentd will not aggregate logs from these nodes.' - ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) - return None + 'Fluentd will not aggregate logs from these nodes.'.format( + label=node_selector, nodes=', '.join(nodes_missing_labels) + ))] + + return [] @staticmethod - def _check_nodes_have_fluentd(pods, fluentd_nodes): - """Make sure fluentd is on all the labeled nodes. Returns: error string""" + def check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error list""" unmatched_nodes = fluentd_nodes.copy() node_names_by_label = { node['metadata']['labels']['kubernetes.io/hostname']: name @@ -82,80 +123,32 @@ class Fluentd(LoggingCheck): ]: unmatched_nodes.pop(name, None) if unmatched_nodes: - return ( + return [OpenShiftCheckException( + 'MissingFluentdPod', 'The following nodes are supposed to have a Fluentd pod but do not:\n' - '{nodes}' - 'These nodes will not have their logs aggregated.' - ).format(nodes=''.join( - " {}\n".format(name) - for name in unmatched_nodes.keys() - )) - return None + ' {nodes}\n' + 'These nodes will not have their logs aggregated.'.format( + nodes='\n '.join(unmatched_nodes.keys()) + ))] + + return [] - def _check_fluentd_pods_running(self, pods): + def check_fluentd_pods_running(self, pods): """Make sure all fluentd pods are running. Returns: error string""" not_running = super(Fluentd, self).not_running_pods(pods) if not_running: - return ( + return [OpenShiftCheckException( + 'FluentdNotRunning', 'The following Fluentd pods are supposed to be running but are not:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - )) - return None - - def check_fluentd(self, pods): - """Verify fluentd is running everywhere. Returns: error string""" - - node_selector = self.get_var( - 'openshift_logging_fluentd_nodeselector', - default='logging-infra-fluentd=true' - ) - - nodes_by_name, error = self.get_nodes_by_name() - - if error: - return error - fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) - if error: - return error - - error_msgs = [] - error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) - if error: - error_msgs.append(error) - error = self._check_nodes_have_fluentd(pods, fluentd_nodes) - if error: - error_msgs.append(error) - error = self._check_fluentd_pods_running(pods) - if error: - error_msgs.append(error) - - # Make sure there are no extra fluentd pods - if len(pods) > len(fluentd_nodes): - error_msgs.append( - 'There are more Fluentd pods running than nodes labeled.\n' - 'This may not cause problems with logging but it likely indicates something wrong.' - ) - - return '\n'.join(error_msgs) - - def get_nodes_by_name(self): - """Retrieve all the node definitions. Returns: dict(name: node), error string""" - nodes_json = self.exec_oc( - self.logging_namespace, - "get nodes -o json", - [] - ) - try: - nodes = json.loads(nodes_json) - except ValueError: # no valid json - should not happen - return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json - if not nodes or not nodes.get('items'): # also should not happen - return None, "No nodes appear to be defined according to the API." - return { - node['metadata']['name']: node - for node in nodes['items'] - }, None + ' {pods}\n' + 'These pods will not aggregate logs from their nodes.'.format( + pods='\n'.join( + " {name} ({host})".format( + name=pod['metadata']['name'], + host=pod['spec'].get('host', 'None') + ) + for pod in not_running + ) + ))] + + return [] diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py index 0970f0a63..d783e6760 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -24,7 +24,6 @@ class FluentdConfig(LoggingCheck): def run(self): """Check that Fluentd has running pods, and that its logging config matches Docker's logging config.""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) config_error = self.check_logging_config() if config_error: msg = ("The following Fluentd logging configuration problem was found:" @@ -120,19 +119,13 @@ class FluentdConfig(LoggingCheck): def running_fluentd_pods(self): """Return a list of running fluentd pods.""" - fluentd_pods, error = self.get_pods_for_component( - self.logging_namespace, - "fluentd", - ) - if error: - msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error) - raise OpenShiftCheckException(msg) + fluentd_pods = self.get_pods_for_component("fluentd") running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running'] if not running_fluentd_pods: - msg = ('No Fluentd pods were found to be in the "Running" state. ' - 'At least one Fluentd pod is required in order to perform this check.') - - raise OpenShiftCheckException(msg) + raise OpenShiftCheckException( + 'No Fluentd pods were found to be in the "Running" state. ' + 'At least one Fluentd pod is required in order to perform this check.' + ) return running_fluentd_pods diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index c600bb47e..3b1cf8baa 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -12,7 +12,7 @@ except ImportError: from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException class Kibana(LoggingCheck): @@ -24,25 +24,12 @@ class Kibana(LoggingCheck): def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - kibana_pods, error = self.get_pods_for_component( - self.logging_namespace, - "kibana", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_kibana(kibana_pods) - - if not check_error: - check_error = self._check_kibana_route() - - if check_error: - msg = ("The following Kibana deployment issue was found:" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + kibana_pods = self.get_pods_for_component("kibana") + self.check_kibana(kibana_pods) + self.check_kibana_route() # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Kibana deployment.'} + + return {} def _verify_url_internal(self, url): """ @@ -65,7 +52,7 @@ class Kibana(LoggingCheck): def _verify_url_external(url): """ Try to reach a URL from ansible control host. - Returns: success (bool), reason (for failure) + Raise an OpenShiftCheckException if anything goes wrong. """ # This actually checks from the ansible control host, which may or may not # really be "external" to the cluster. @@ -91,130 +78,149 @@ class Kibana(LoggingCheck): return None def check_kibana(self, pods): - """Check to see if Kibana is up and working. Returns: error string.""" + """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not.""" if not pods: - return "There are no Kibana pods deployed, so no access to the logging UI." + raise OpenShiftCheckException( + "MissingComponentPods", + "There are no Kibana pods deployed, so no access to the logging UI." + ) not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return "No Kibana pod is in a running state, so there is no access to the logging UI." + raise OpenShiftCheckException( + "NoRunningPods", + "No Kibana pod is in a running state, so there is no access to the logging UI." + ) elif not_running: - return ( + raise OpenShiftCheckException( + "PodNotRunning", "The following Kibana pods are not currently in a running state:\n" - "{pods}" - "However at least one is, so service may not be impacted." - ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running)) - - return None + " {pods}\n" + "However at least one is, so service may not be impacted.".format( + pods="\n ".join(pod['metadata']['name'] for pod in not_running) + ) + ) def _get_kibana_url(self): """ Get kibana route or report error. - Returns: url (or empty), reason for failure + Returns: url """ # Get logging url - get_route = self.exec_oc( - self.logging_namespace, - "get route logging-kibana -o json", - [], - ) + get_route = self.exec_oc("get route logging-kibana -o json", []) if not get_route: - return None, 'no_route_exists' + raise OpenShiftCheckException( + 'no_route_exists', + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ) - route = json.loads(get_route) + try: + route = json.loads(get_route) + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + except (ValueError, KeyError): + raise OpenShiftCheckException( + 'get_route_failed', + '"oc get route" returned an unexpected response:\n' + get_route + ) - # check that the route has been accepted by a router - ingress = route["status"]["ingress"] # ingress can be null if there is no router, or empty if not routed if not ingress or not ingress[0]: - return None, 'route_not_accepted' + raise OpenShiftCheckException( + 'route_not_accepted', + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ) host = route.get("spec", {}).get("host") if not host: - return None, 'route_missing_host' + raise OpenShiftCheckException( + 'route_missing_host', + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ) - return 'https://{}/'.format(host), None + return 'https://{}/'.format(host) - def _check_kibana_route(self): + def check_kibana_route(self): """ Check to see if kibana route is up and working. - Returns: error string + Raises exception if not. """ - known_errors = dict( - no_route_exists=( - 'No route is defined for Kibana in the logging namespace,\n' - 'so the logging stack is not accessible. Is logging deployed?\n' - 'Did something remove the logging-kibana route?' - ), - route_not_accepted=( - 'The logging-kibana route is not being routed by any router.\n' - 'Is the router deployed and working?' - ), - route_missing_host=( - 'The logging-kibana route has no hostname defined,\n' - 'which should never happen. Did something alter its definition?' - ), - ) - kibana_url, error = self._get_kibana_url() - if not kibana_url: - return known_errors.get(error, error) + kibana_url = self._get_kibana_url() # first, check that kibana is reachable from the master. error = self._verify_url_internal(kibana_url) if error: if 'urlopen error [Errno 111] Connection refused' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToConnectInternal', 'Failed to connect from this master to Kibana URL {url}\n' - 'Is kibana running, and is at least one router routing to it?' - ).format(url=kibana_url) + 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url) + ) elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToResolveInternal', 'Failed to connect from this master to Kibana URL {url}\n' 'because the hostname does not resolve.\n' - 'Is DNS configured for the Kibana hostname?' - ).format(url=kibana_url) + 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url) + ) elif 'Status code was not' in error: - error = ( + raise OpenShiftCheckException( + 'WrongReturnCodeInternal', 'A request from this master to the Kibana URL {url}\n' 'did not return the correct status code (302).\n' 'This could mean that Kibana is malfunctioning, the hostname is\n' 'resolving incorrectly, or other network issues. The output was:\n' - ' {error}' - ).format(url=kibana_url, error=error) - return 'Error validating the logging Kibana route:\n' + error + ' {error}'.format(url=kibana_url, error=error) + ) + raise OpenShiftCheckException( + 'MiscRouteErrorInternal', + 'Error validating the logging Kibana route internally:\n' + error + ) # in production we would like the kibana route to work from outside the # cluster too; but that may not be the case, so allow disabling just this part. - if not self.get_var("openshift_check_efk_kibana_external", default=True): - return None + if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true": + return error = self._verify_url_external(kibana_url) - if error: - if 'urlopen error [Errno 111] Connection refused' in error: - error = ( - 'Failed to connect from the Ansible control host to Kibana URL {url}\n' - 'Is the router for the Kibana hostname exposed externally?' - ).format(url=kibana_url) - elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( - 'Failed to resolve the Kibana hostname in {url}\n' - 'from the Ansible control host.\n' - 'Is DNS configured to resolve this Kibana hostname externally?' - ).format(url=kibana_url) - elif 'Expected success (200)' in error: - error = ( - 'A request to Kibana at {url}\n' - 'returned the wrong error code:\n' - ' {error}\n' - 'This could mean that Kibana is malfunctioning, the hostname is\n' - 'resolving incorrectly, or other network issues.' - ).format(url=kibana_url, error=error) - error = ( - 'Error validating the logging Kibana route:\n{error}\n' - 'To disable external Kibana route validation, set in your inventory:\n' - ' openshift_check_efk_kibana_external=False' - ).format(error=error) - return error - return None + + if not error: + return + + error_fmt = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set the variable:\n' + ' openshift_check_efk_kibana_external=False' + ) + if 'urlopen error [Errno 111] Connection refused' in error: + msg = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg)) + elif 'urlopen error [Errno -2] Name or service not known' in error: + msg = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg)) + elif 'Expected success (200)' in error: + msg = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg)) + raise OpenShiftCheckException( + 'MiscRouteError', + 'Error validating the logging Kibana route externally:\n' + error + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 43ba6c406..3b7c39760 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -8,6 +8,16 @@ import os from openshift_checks import OpenShiftCheck, OpenShiftCheckException +class MissingComponentPods(OpenShiftCheckException): + """Raised when a component has no pods in the namespace.""" + pass + + +class CouldNotUseOc(OpenShiftCheckException): + """Raised when ocutil has a failure running oc.""" + pass + + class LoggingCheck(OpenShiftCheck): """Base class for OpenShift aggregated logging component checks""" @@ -15,7 +25,6 @@ class LoggingCheck(OpenShiftCheck): # run by itself. name = "logging" - logging_namespace = "logging" def is_active(self): logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) @@ -32,22 +41,24 @@ class LoggingCheck(OpenShiftCheck): def run(self): return {} - def get_pods_for_component(self, namespace, logging_component): - """Get all pods for a given component. Returns: list of pods for component, error string""" + def get_pods_for_component(self, logging_component): + """Get all pods for a given component. Returns: list of pods.""" pod_output = self.exec_oc( - namespace, "get pods -l component={} -o json".format(logging_component), [], ) try: - pods = json.loads(pod_output) - if not pods or not pods.get('items'): + pods = json.loads(pod_output) # raises ValueError if deserialize fails + if not pods or not pods.get('items'): # also a broken response, treat the same raise ValueError() except ValueError: - # successful run but non-parsing data generally means there were no pods in the namespace - return None, 'No pods were found for the "{}" logging component.'.format(logging_component) + # successful run but non-parsing data generally means there were no pods to be found + raise MissingComponentPods( + 'There are no "{}" component pods in the "{}" namespace.\n' + 'Is logging deployed?'.format(logging_component, self.logging_namespace()) + ) - return pods['items'], None + return pods['items'] @staticmethod def not_running_pods(pods): @@ -63,15 +74,19 @@ class LoggingCheck(OpenShiftCheck): ) ] - def exec_oc(self, namespace="logging", cmd_str="", extra_args=None): + def logging_namespace(self): + """Returns the namespace in which logging is configured to deploy.""" + return self.get_var("openshift_logging_namespace", default="logging") + + def exec_oc(self, cmd_str="", extra_args=None): """ Execute an 'oc' command in the remote host. Returns: output of command and namespace, - or raises OpenShiftCheckException on error + or raises CouldNotUseOc on error """ config_base = self.get_var("openshift", "common", "config_base") args = { - "namespace": namespace, + "namespace": self.logging_namespace(), "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), "cmd": cmd_str, "extra_args": list(extra_args) if extra_args else [], @@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck): result = self.execute_module("ocutil", args) if result.get("failed"): - msg = ( - 'Unexpected error using `oc` to validate the logging stack components.\n' - 'Error executing `oc {cmd}`:\n' - '{error}' - ).format(cmd=args['cmd'], error=result['result']) - if result['result'] == '[Errno 2] No such file or directory': - msg = ( + raise CouldNotUseOc( "This host is supposed to be a master but does not have the `oc` command where expected.\n" "Has an installation been run on this host yet?" ) - raise OpenShiftCheckException(msg) + + raise CouldNotUseOc( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}'.format(cmd=args['cmd'], error=result['result']) + ) return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py index b24e88e05..d781db649 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck): name = "logging_index_time" tags = ["health", "logging"] - logging_namespace = "logging" - def run(self): """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.""" try: @@ -28,29 +26,25 @@ class LoggingIndexTime(LoggingCheck): self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS) ) except ValueError: - return { - "failed": True, - "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' - 'Value must be an integer representing an amount in seconds.'), - } + raise OpenShiftCheckException( + 'InvalidTimeout', + 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' + 'Value must be an integer representing an amount in seconds.' + ) running_component_pods = dict() # get all component pods - self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): - pods, error = self.get_pods_for_component(self.logging_namespace, component) - - if error: - msg = 'Unable to retrieve pods for the {} logging component: {}' - return {"failed": True, "changed": False, "msg": msg.format(name, error)} - + pods = self.get_pods_for_component(component) running_pods = self.running_pods(pods) if not running_pods: - msg = ('No {} pods in the "Running" state were found.' - 'At least one pod is required in order to perform this check.') - return {"failed": True, "changed": False, "msg": msg.format(name)} + raise OpenShiftCheckException( + component + 'NoRunningPods', + 'No {} pods in the "Running" state were found.' + 'At least one pod is required in order to perform this check.'.format(name) + ) running_component_pods[component] = running_pods @@ -65,8 +59,11 @@ class LoggingIndexTime(LoggingCheck): interval = 1 while not self.query_es_from_es(es_pod, uuid): if time.time() + interval > deadline: - msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s." - raise OpenShiftCheckException(msg.format(uuid, timeout_secs)) + raise OpenShiftCheckException( + "NoMatchFound", + "expecting match in Elasticsearch for message with uuid {}, " + "but no matches were found after {}s.".format(uuid, timeout_secs) + ) time.sleep(interval) def curl_kibana_with_uuid(self, kibana_pod): @@ -76,22 +73,23 @@ class LoggingIndexTime(LoggingCheck): exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) - error_str = self.exec_oc(self.logging_namespace, exec_cmd, []) + error_str = self.exec_oc(exec_cmd, []) try: error_code = json.loads(error_str)["statusCode"] - except KeyError: - msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) - except ValueError: - msg = ('invalid response returned from Kibana request (Non-JSON output):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'kibanaInvalidResponse', + 'invalid response returned from Kibana request:\n' + 'Command: {}\nResponse: {}'.format(exec_cmd, error_str) + ) if error_code != 404: - msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.' - raise OpenShiftCheckException(msg.format(error_code)) + raise OpenShiftCheckException( + 'kibanaInvalidReturnCode', + 'invalid error code returned from Kibana request.\n' + 'Expecting error code "404", but got "{}" instead.'.format(error_code) + ) return uuid @@ -105,17 +103,18 @@ class LoggingIndexTime(LoggingCheck): "--key /etc/elasticsearch/secret/admin-key " "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" ) - exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid) - result = self.exec_oc(self.logging_namespace, exec_cmd, []) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) + result = self.exec_oc(exec_cmd, []) try: count = json.loads(result)["count"] - except KeyError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) - except ValueError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'esInvalidResponse', + 'Invalid response from Elasticsearch query:\n' + ' {}\n' + 'Response was:\n{}'.format(exec_cmd, result) + ) return count -- cgit v1.2.3