Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/logging')
8 files changed, 996 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..b27f97172
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,43 @@
+"""Check for an aggregated logging Curator deployment"""
+
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
+
+
+class Curator(LoggingCheck):
+    """Check for an aggregated logging Curator deployment"""
+
+    name = "curator"
+    tags = ["health", "logging"]
+
+    def run(self):
+        """Check various things and gather errors. Returns: result as dict"""
+
+        curator_pods = self.get_pods_for_component("curator")
+        self.check_curator(curator_pods)
+        # TODO(lmeyer): run it all again for the ops cluster
+
+        return {}
+
+    def check_curator(self, pods):
+        """Check to see if curator is up and working. Raises OpenShiftCheckException if not."""
+        if not pods:
+            raise OpenShiftCheckException(
+                "MissingComponentPods",
+                "There are no Curator pods for the logging stack,\n"
+                "so nothing will prune Elasticsearch indexes.\n"
+                "Is Curator correctly deployed?"
+            )
+
+        not_running = self.not_running_pods(pods)
+        if len(not_running) == len(pods):
+            raise OpenShiftCheckException(
+                "CuratorNotRunning",
+                "The Curator pod is not currently in a running state,\n"
+                "so Elasticsearch indexes may increase without bound."
+            )
+        if len(pods) - len(not_running) > 1:
+            raise OpenShiftCheckException(
+                "TooManyCurators",
+                "There is more than one Curator pod running. This should not normally happen.\n"
+                "Although this doesn't cause any problems, you may want to investigate."
+            )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..986a01f38
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,212 @@
+"""Check for an aggregated logging Elasticsearch deployment"""
+
+import json
+import re
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+    """Check for an aggregated logging Elasticsearch deployment"""
+
+    name = "elasticsearch"
+    tags = ["health", "logging"]
+
+    def run(self):
+        """Check various things and gather errors. Returns: result as dict"""
+
+        es_pods = self.get_pods_for_component("es")
+        self.check_elasticsearch(es_pods)
+        # TODO(lmeyer): run it all again for the ops cluster
+
+        return {}
+
+    def check_elasticsearch(self, es_pods):
+        """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+        running_pods, errors = self.running_elasticsearch_pods(es_pods)
+        pods_by_name = {
+            pod['metadata']['name']: pod for pod in running_pods
+            # Filter out pods that are not members of a DC
+            if pod['metadata'].get('labels', {}).get('deploymentconfig')
+        }
+        if not pods_by_name:
+            # nothing running, cannot run the rest of the check
+            errors.append(OpenShiftCheckException(
+                'NoRunningPods',
+                'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+            ))
+            raise OpenShiftCheckExceptionList(errors)
+
+        errors += self.check_elasticsearch_masters(pods_by_name)
+        errors += self.check_elasticsearch_node_list(pods_by_name)
+        errors += self.check_es_cluster_health(pods_by_name)
+        errors += self.check_elasticsearch_diskspace(pods_by_name)
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def running_elasticsearch_pods(self, es_pods):
+        """Returns: list of running pods, list of errors about non-running pods"""
+        not_running = self.not_running_pods(es_pods)
+        running_pods = [pod for pod in es_pods if pod not in not_running]
+        if not_running:
+            return running_pods, [OpenShiftCheckException(
+                'PodNotRunning',
+                'The following Elasticsearch pods are defined but not running:\n'
+                '{pods}'.format(pods=''.join(
+                    "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                    for pod in not_running
+                ))
+            )]
+        return running_pods, []
+
+    @staticmethod
+    def _build_es_curl_cmd(pod_name, url):
+        base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+        return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+    def check_elasticsearch_masters(self, pods_by_name):
+        """Check that Elasticsearch masters are sane. Returns: list of errors"""
+        es_master_names = set()
+        errors = []
+        for pod_name in pods_by_name.keys():
+            # Ask each ES node which node it reports as master, to detect split brain
+            get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+            master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
+            master_names = (master_name_str or '').split(' ')
+            if len(master_names) > 1:
+                es_master_names.add(master_names[1])
+            else:
+                errors.append(OpenShiftCheckException(
+                    'NoMasterName',
+                    'Elasticsearch {pod} gave unexpected response when asked for its master name:\n'
+                    '  {response}'.format(pod=pod_name, response=master_name_str)
+                ))
+
+        if not es_master_names:
+            errors.append(OpenShiftCheckException(
+                'NoMasterFound',
+                'No logging Elasticsearch masters were found.'
+            ))
+            return errors
+
+        if len(es_master_names) > 1:
+            errors.append(OpenShiftCheckException(
+                'SplitBrainMasters',
+                'Found multiple Elasticsearch masters according to the pods:\n'
+                '{master_list}\n'
+                'This implies that the masters have "split brain" and are not correctly\n'
+                'replicating data for the logging cluster. Log loss is likely to occur.'
+                .format(master_list='\n'.join('  ' + master for master in es_master_names))
+            ))
+
+        return errors
+
+    def check_elasticsearch_node_list(self, pods_by_name):
+        """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
+
+        if not pods_by_name:
+            return [OpenShiftCheckException(
+                'MissingComponentPods',
+                'No logging Elasticsearch pods were found.'
+            )]
+
+        # get ES cluster nodes
+        node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+        cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
+        try:
+            cluster_nodes = json.loads(cluster_node_data)['nodes']
+        except (ValueError, KeyError):
+            return [OpenShiftCheckException(
+                'MissingNodeList',
+                'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+                cluster_node_data
+            )]
+
+        # Try to match all ES-reported node hosts to known pods.
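+        # Illustrative examples (hypothetical values): a cluster node reporting
+        # host "10.128.0.12" matches a pod whose status.podIP is "10.128.0.12";
+        # a node reporting "logging-es-abc123" matches the pod of that name.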
+        errors = []
+        for node in cluster_nodes.values():
+            # Note that with 1.4/3.4 the pod IP may be used as the master name
+            if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+                       for pod_name, pod in pods_by_name.items()):
+                errors.append(OpenShiftCheckException(
+                    'EsPodNodeMismatch',
+                    'The Elasticsearch cluster reports a member node "{node}"\n'
+                    'that does not correspond to any known ES pod.'.format(node=node['host'])
+                ))
+
+        return errors
+
+    def check_es_cluster_health(self, pods_by_name):
+        """Exec into the Elasticsearch pods and check the cluster health. Returns: list of errors"""
+        errors = []
+        for pod_name in pods_by_name.keys():
+            cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+            cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
+            try:
+                health_res = json.loads(cluster_health_data)
+                if not health_res or not health_res.get('status'):
+                    raise ValueError()
+            except ValueError:
+                errors.append(OpenShiftCheckException(
+                    'BadEsResponse',
+                    'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+                    'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+                ))
+                continue
+
+            if health_res['status'] not in ['green', 'yellow']:
+                errors.append(OpenShiftCheckException(
+                    'EsClusterHealthRed',
+                    'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+                ))
+
+        return errors
+
+    def check_elasticsearch_diskspace(self, pods_by_name):
+        """
+        Exec into an ES pod and query the diskspace on the persistent volume.
+        Returns: list of errors
+        """
+        errors = []
+        for pod_name in pods_by_name.keys():
+            df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+            disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
+            lines = disk_output.splitlines()
+            # expecting one header line looking like 'IUse% Use%' and one body line
+            body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+            if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+                errors.append(OpenShiftCheckException(
+                    'BadDfResponse',
+                    'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+                    'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+                ))
+                continue
+            inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+            inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
+            if int(inode_pct) >= int(inode_pct_thresh):
+                errors.append(OpenShiftCheckException(
+                    'InodeUsageTooHigh',
+                    'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(inode_pct),
+                        limit=str(inode_pct_thresh),
+                        param='openshift_check_efk_es_inode_pct',
+                    )))
+            disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
+            if int(disk_pct) >= int(disk_pct_thresh):
+                errors.append(OpenShiftCheckException(
+                    'DiskUsageTooHigh',
+                    'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(disk_pct),
+                        limit=str(disk_pct_thresh),
+                        param='openshift_check_efk_es_storage_pct',
+                    )))
+
+        return errors
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..3b192a281
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,154 @@
+"""Check for an aggregated logging Fluentd deployment"""
+
+import json
+
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+    """Check for an aggregated logging Fluentd deployment"""
+
+    name = "fluentd"
+    tags = ["health", "logging"]
+
+    def run(self):
+        """Check the Fluentd deployment and raise an error if any problems are found."""
+
+        fluentd_pods = self.get_pods_for_component("fluentd")
+        self.check_fluentd(fluentd_pods)
+        return {}
+
+    def check_fluentd(self, pods):
+        """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
+
+        node_selector = self.get_var(
+            'openshift_logging_fluentd_nodeselector',
+            default='logging-infra-fluentd=true'
+        )
+
+        nodes_by_name = self.get_nodes_by_name()
+        fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+
+        errors = []
+        errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+        errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+        errors += self.check_fluentd_pods_running(pods)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            errors.append(OpenShiftCheckException(
+                'TooManyFluentdPods',
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            ))
+
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def get_nodes_by_name(self):
+        """Retrieve all the node definitions. Returns: dict(name: node)"""
+        nodes_json = self.exec_oc("get nodes -o json", [])
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            raise OpenShiftCheckException(
+                "BadOcNodeList",
+                "Could not obtain a list of nodes to validate fluentd.\n"
+                "Output from oc get:\n" + nodes_json
+            )
+        if not nodes or not nodes.get('items'):  # also should not happen
+            raise OpenShiftCheckException(
+                "NoNodesDefined",
+                "No nodes appear to be defined according to the API."
+            )
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }
+
+    @staticmethod
+    def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns: dict(name: node)"""
+        label, value = node_selector.split('=', 1)
+        fluentd_nodes = {
+            name: node for name, node in nodes_by_name.items()
+            if node['metadata']['labels'].get(label) == value
+        }
+        if not fluentd_nodes:
+            raise OpenShiftCheckException(
+                'NoNodesLabeled',
+                'There are no nodes with the fluentd label {label}.\n'
+                'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+            )
+        return fluentd_nodes
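+    # Example for filter_fluentd_labeled_nodes (illustrative): with the default
+    # selector "logging-infra-fluentd=true", a node whose metadata.labels contain
+    # {"logging-infra-fluentd": "true"} is kept; all other nodes are filtered out.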
+
+    def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+        """Note if nodes are not labeled as expected. Returns: error list"""
+        intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
+        if not intended_nodes or '--all' in intended_nodes:
+            intended_nodes = nodes_by_name.keys()
+        nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+        if nodes_missing_labels:
+            return [OpenShiftCheckException(
+                'NodesUnlabeled',
+                'The following nodes are supposed to be labeled with {label} but are not:\n'
+                '  {nodes}\n'
+                'Fluentd will not aggregate logs from these nodes.'.format(
+                    label=node_selector, nodes=', '.join(nodes_missing_labels)
+                ))]
+
+        return []
+
+    @staticmethod
+    def check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error list"""
+        unmatched_nodes = fluentd_nodes.copy()
+        node_names_by_label = {
+            node['metadata']['labels']['kubernetes.io/hostname']: name
+            for name, node in fluentd_nodes.items()
+        }
+        node_names_by_internal_ip = {
+            address['address']: name
+            for name, node in fluentd_nodes.items()
+            for address in node['status']['addresses']
+            if address['type'] == "InternalIP"
+        }
+        for pod in pods:
+            for name in [
+                    pod['spec']['nodeName'],
+                    node_names_by_internal_ip.get(pod['spec']['nodeName']),
+                    node_names_by_label.get(pod.get('spec', {}).get('host')),
+            ]:
+                unmatched_nodes.pop(name, None)
+        if unmatched_nodes:
+            return [OpenShiftCheckException(
+                'MissingFluentdPod',
+                'The following nodes are supposed to have a Fluentd pod but do not:\n'
+                '  {nodes}\n'
+                'These nodes will not have their logs aggregated.'.format(
+                    nodes='\n  '.join(unmatched_nodes.keys())
+                ))]
+
+        return []
+
+    def check_fluentd_pods_running(self, pods):
+        """Make sure all fluentd pods are running. Returns: error list"""
+        not_running = super(Fluentd, self).not_running_pods(pods)
+        if not_running:
+            return [OpenShiftCheckException(
+                'FluentdNotRunning',
+                'The following Fluentd pods are supposed to be running but are not:\n'
+                '  {pods}\n'
+                'These pods will not aggregate logs from their nodes.'.format(
+                    pods='\n'.join(
+                        "  {name} ({host})".format(
+                            name=pod['metadata']['name'],
+                            host=pod['spec'].get('host', 'None')
+                        )
+                        for pod in not_running
+                    )
+                ))]
+
+        return []
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..e93cc9028
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,131 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+    """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+    name = "fluentd_config"
+    tags = ["health"]
+
+    def is_active(self):
+        logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+        try:
+            version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+        except ValueError:
+            # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+            return logging_deployed
+
+        return logging_deployed and version < (3, 6)
+
+    def run(self):
+        """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+        config_error = self.check_logging_config()
+        if config_error:
+            msg = ("The following Fluentd logging configuration problem was found:"
+                   "\n{}".format(config_error))
+            return {"failed": True, "msg": msg}
+
+        return {}
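+    # Pairing enforced by check_logging_config below, per its docstring:
+    #   openshift_logging_fluentd_use_journal=True  -> Docker log driver "journald"
+    #   openshift_logging_fluentd_use_journal=False -> Docker log driver "json-file"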
"\n{}".format(config_error)) + return {"failed": True, "msg": msg} + + return {} + + def check_logging_config(self): + """Ensure that the configured Docker logging driver matches fluentd settings. + This means that, at least for now, if the following condition is met: + + openshift_logging_fluentd_use_journal == True + + then the value of the configured Docker logging driver should be "journald". + Otherwise, the value of the Docker logging driver should be "json-file". + Returns an error string if the above condition is not met, or None otherwise.""" + use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True) + + # if check is running on a master, retrieve all running pods + # and check any pod's container for the env var "USE_JOURNAL" + group_names = self.get_var("group_names") + if "oo_masters_to_config" in group_names: + use_journald = self.check_fluentd_env_var() + + docker_info = self.execute_module("docker_info", {}) + try: + logging_driver = docker_info["info"]["LoggingDriver"] + except KeyError: + return "Unable to determine Docker logging driver." + + logging_driver = docker_info["info"]["LoggingDriver"] + recommended_logging_driver = "journald" + error = None + + # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver` + # option as an inventory file variable, or adding the log driver value as part of the + # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that + # can be passed to the Docker binary; the only other recommendation that can be made, would be + # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running + # individual containers. + if use_journald and logging_driver != "journald": + error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n' + 'This differs from your Docker configuration, which has been set to use "{driver}" ' + 'as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + elif not use_journald and logging_driver != "json-file": + recommended_logging_driver = "json-file" + error = ('Your Fluentd configuration is set to aggregate Docker container logs from ' + 'individual json log files per container.\n ' + 'This differs from your Docker configuration, which has been set to use ' + '"{driver}" as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + + if error: + error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n' + ' openshift_docker_options="--log-driver={driver}"\n\n' + 'Alternatively, you can add the following option to your Docker configuration, located in' + '"/etc/docker/daemon.json":\n\n' + '{{ "log-driver": "{driver}" }}\n\n' + 'See https://docs.docker.com/engine/admin/logging/json-file ' + 'for more information.').format(driver=recommended_logging_driver) + + return error + + def check_fluentd_env_var(self): + """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod.""" + running_pods = self.running_fluentd_pods() + + try: + pod_containers = running_pods[0]["spec"]["containers"] + except KeyError: + return "Unable to detect running containers on selected Fluentd pod." 
+
+        if not pod_containers:
+            msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+                   'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+            raise OpenShiftCheckException(msg)
+
+        pod_env = pod_containers[0].get("env")
+        if not pod_env:
+            msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+                   'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+            raise OpenShiftCheckException(msg)
+
+        for env in pod_env:
+            if env["name"] == "USE_JOURNAL":
+                return env.get("value", "false") != "false"
+
+        return False
+
+    def running_fluentd_pods(self):
+        """Return a list of running fluentd pods."""
+        fluentd_pods = self.get_pods_for_component("fluentd")
+
+        running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+        if not running_fluentd_pods:
+            raise OpenShiftCheckException(
+                'No Fluentd pods were found to be in the "Running" state. '
+                'At least one Fluentd pod is required in order to perform this check.'
+            )
+
+        return running_fluentd_pods
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..3b1cf8baa
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,226 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+    from urllib2 import HTTPError, URLError
+    import urllib2
+except ImportError:
+    from urllib.error import HTTPError, URLError
+    import urllib.request as urllib2
+
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+
+
+class Kibana(LoggingCheck):
+    """Module that checks an integrated logging Kibana deployment"""
+
+    name = "kibana"
+    tags = ["health", "logging"]
+
+    def run(self):
+        """Check various things and gather errors. Returns: result as dict"""
+
+        kibana_pods = self.get_pods_for_component("kibana")
+        self.check_kibana(kibana_pods)
+        self.check_kibana_route()
+        # TODO(lmeyer): run it all again for the ops cluster
+
+        return {}
+
+    def _verify_url_internal(self, url):
+        """
+        Try to reach a URL from the host.
+        Returns: error string on failure, None on success
+        """
+        args = dict(
+            url=url,
+            follow_redirects='none',
+            validate_certs='no',  # likely to be signed with internal CA
+            # TODO(lmeyer): give users option to validate certs
+            status_code=302,
+        )
+        result = self.execute_module('uri', args)
+        if result.get('failed'):
+            return result['msg']
+        return None
+
+    @staticmethod
+    def _verify_url_external(url):
+        """
+        Try to reach a URL from the ansible control host.
+        Returns: error string on failure, None on success
+        """
+        # This actually checks from the ansible control host, which may or may not
+        # really be "external" to the cluster.
+
+        # Disable SSL cert validation to work around internally signed certs
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False  # or setting CERT_NONE is refused
+        ctx.verify_mode = ssl.CERT_NONE
+
+        # Verify that the url is returning a valid response
+        try:
+            # We only care if the url connects and responds
+            return_code = urllib2.urlopen(url, context=ctx).getcode()
+        except HTTPError as httperr:
+            return httperr.reason
+        except URLError as urlerr:
+            return str(urlerr)
+
+        # there appears to be no way to prevent urlopen from following redirects
+        if return_code != 200:
+            return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+        return None
+
+    def check_kibana(self, pods):
+        """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
+
+        if not pods:
+            raise OpenShiftCheckException(
+                "MissingComponentPods",
+                "There are no Kibana pods deployed, so no access to the logging UI."
+            )
+
+        not_running = self.not_running_pods(pods)
+        if len(not_running) == len(pods):
+            raise OpenShiftCheckException(
+                "NoRunningPods",
+                "No Kibana pod is in a running state, so there is no access to the logging UI."
+            )
+        elif not_running:
+            raise OpenShiftCheckException(
+                "PodNotRunning",
+                "The following Kibana pods are not currently in a running state:\n"
+                "  {pods}\n"
+                "However at least one is, so service may not be impacted.".format(
+                    pods="\n  ".join(pod['metadata']['name'] for pod in not_running)
+                )
+            )
+
+    def _get_kibana_url(self):
+        """
+        Get kibana route or report error.
+        Returns: url
+        """
+
+        # Get logging url
+        get_route = self.exec_oc("get route logging-kibana -o json", [])
+        if not get_route:
+            raise OpenShiftCheckException(
+                'no_route_exists',
+                'No route is defined for Kibana in the logging namespace,\n'
+                'so the logging stack is not accessible. Is logging deployed?\n'
+                'Did something remove the logging-kibana route?'
+            )
+
+        try:
+            route = json.loads(get_route)
+            # check that the route has been accepted by a router
+            ingress = route["status"]["ingress"]
+        except (ValueError, KeyError):
+            raise OpenShiftCheckException(
+                'get_route_failed',
+                '"oc get route" returned an unexpected response:\n' + get_route
+            )
+
+        # ingress can be null if there is no router, or empty if not routed
+        if not ingress or not ingress[0]:
+            raise OpenShiftCheckException(
+                'route_not_accepted',
+                'The logging-kibana route is not being routed by any router.\n'
+                'Is the router deployed and working?'
+            )
+
+        host = route.get("spec", {}).get("host")
+        if not host:
+            raise OpenShiftCheckException(
+                'route_missing_host',
+                'The logging-kibana route has no hostname defined,\n'
+                'which should never happen. Did something alter its definition?'
+            )
+
+        return 'https://{}/'.format(host)
+
+    def check_kibana_route(self):
+        """
+        Check to see if kibana route is up and working.
+        Raises exception if not.
+        """
+
+        kibana_url = self._get_kibana_url()
+
+        # first, check that kibana is reachable from the master.
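+        # A plain GET of the route is expected to return 302 here (see
+        # status_code in _verify_url_internal above), typically a redirect to
+        # the login flow rather than a 200.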
+        error = self._verify_url_internal(kibana_url)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                raise OpenShiftCheckException(
+                    'FailedToConnectInternal',
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+                )
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                raise OpenShiftCheckException(
+                    'FailedToResolveInternal',
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'because the hostname does not resolve.\n'
+                    'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+                )
+            elif 'Status code was not' in error:
+                raise OpenShiftCheckException(
+                    'WrongReturnCodeInternal',
+                    'A request from this master to the Kibana URL {url}\n'
+                    'did not return the correct status code (302).\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues. The output was:\n'
+                    '  {error}'.format(url=kibana_url, error=error)
+                )
+            raise OpenShiftCheckException(
+                'MiscRouteErrorInternal',
+                'Error validating the logging Kibana route internally:\n' + error
+            )
+
+        # in production we would like the kibana route to work from outside the
+        # cluster too; but that may not be the case, so allow disabling just this part.
+        if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+            return
+        error = self._verify_url_external(kibana_url)
+
+        if not error:
+            return
+
+        error_fmt = (
+            'Error validating the logging Kibana route:\n{error}\n'
+            'To disable external Kibana route validation, set the variable:\n'
+            '  openshift_check_efk_kibana_external=False'
+        )
+        if 'urlopen error [Errno 111] Connection refused' in error:
+            msg = (
+                'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+                'Is the router for the Kibana hostname exposed externally?'
+            ).format(url=kibana_url)
+            raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+        elif 'urlopen error [Errno -2] Name or service not known' in error:
+            msg = (
+                'Failed to resolve the Kibana hostname in {url}\n'
+                'from the Ansible control host.\n'
+                'Is DNS configured to resolve this Kibana hostname externally?'
+            ).format(url=kibana_url)
+            raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+        elif 'Expected success (200)' in error:
+            msg = (
+                'A request to Kibana at {url}\n'
+                'returned the wrong status code:\n'
+                '  {error}\n'
+                'This could mean that Kibana is malfunctioning, the hostname is\n'
+                'resolving incorrectly, or other network issues.'
+            ).format(url=kibana_url, error=error)
+            raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+        raise OpenShiftCheckException(
+            'MiscRouteError',
+            'Error validating the logging Kibana route externally:\n' + error
+        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05ba73ca1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,101 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class MissingComponentPods(OpenShiftCheckException):
+    """Raised when a component has no pods in the namespace."""
+    pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+    """Raised when ocutil has a failure running oc."""
+    pass
+
+
+class LoggingCheck(OpenShiftCheck):
+    """Base class for OpenShift aggregated logging component checks"""
+
+    # FIXME: this should not be listed as a check, since it is not meant to be
+    # run by itself.
+
+    name = "logging"
+
+    def is_active(self):
+        logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
+        return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
+
+    def run(self):
+        return {}
+
+    def get_pods_for_component(self, logging_component):
+        """Get all pods for a given component. Returns: list of pods."""
+        pod_output = self.exec_oc(
+            "get pods -l component={} -o json".format(logging_component),
+            [],
+        )
+        try:
+            pods = json.loads(pod_output)  # raises ValueError if deserialize fails
+            if not pods or not pods.get('items'):  # also a broken response, treat the same
+                raise ValueError()
+        except ValueError:
+            # successful run but non-parsing data generally means there were no pods to be found
+            raise MissingComponentPods(
+                'There are no "{}" component pods in the "{}" namespace.\n'
+                'Is logging deployed?'.format(logging_component, self.logging_namespace())
+            )
+
+        return pods['items']
+
+    @staticmethod
+    def not_running_pods(pods):
+        """Returns: list of pods not in a ready and running state"""
+        return [
+            pod for pod in pods
+            if not pod.get("status", {}).get("containerStatuses") or any(
+                container['ready'] is False
+                for container in pod['status']['containerStatuses']
+            ) or not any(
+                condition['type'] == 'Ready' and condition['status'] == 'True'
+                for condition in pod['status'].get('conditions', [])
+            )
+        ]
+
+    def logging_namespace(self):
+        """Returns the namespace in which logging is configured to deploy."""
+        return self.get_var("openshift_logging_namespace", default="logging")
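+    # Example (illustrative): exec_oc("get pods -o json", []) asks the ocutil
+    # module to run roughly
+    #   oc --config <config_base>/master/admin.kubeconfig -n logging get pods -o json
+    # and returns the command output as a string.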
+
+    def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
+        """
+        Execute an `oc` command on the remote host in the logging namespace.
+        Returns: output of the command,
+        or raises CouldNotUseOc on error
+        """
+        config_base = self.get_var("openshift", "common", "config_base")
+        args = {
+            "namespace": self.logging_namespace(),
+            "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+            "cmd": cmd_str,
+            "extra_args": list(extra_args) if extra_args else [],
+        }
+
+        result = self.execute_module("ocutil", args, save_as_name=save_as_name)
+        if result.get("failed"):
+            if result['result'] == '[Errno 2] No such file or directory':
+                raise CouldNotUseOc(
+                    "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+                    "Has an installation been run on this host yet?"
+                )
+
+            raise CouldNotUseOc(
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'.format(cmd=args['cmd'], error=result['result'])
+            )
+
+        return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..cacdf4213
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,129 @@
+"""
+Check for ensuring logs from pods can be queried in a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+    """Check that pod logs are aggregated and indexed in Elasticsearch within a reasonable amount of time."""
+    name = "logging_index_time"
+    tags = ["health", "logging"]
+
+    def run(self):
+        """Add a log entry by making a unique request to Kibana. Check for the unique entry in the Elasticsearch pod logs."""
+        try:
+            log_index_timeout = int(
+                self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+            )
+        except ValueError:
+            raise OpenShiftCheckException(
+                'InvalidTimeout',
+                'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+                'Value must be an integer representing an amount in seconds.'
+            )
+
+        running_component_pods = dict()
+
+        # get all component pods
+        for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+            pods = self.get_pods_for_component(component)
+            running_pods = self.running_pods(pods)
+
+            if not running_pods:
+                raise OpenShiftCheckException(
+                    component + 'NoRunningPods',
+                    'No {} pods in the "Running" state were found. '
+                    'At least one pod is required in order to perform this check.'.format(name)
+                )
+
+            running_component_pods[component] = running_pods
+
+        uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+        self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+        return {}
+
+    def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+        """Retry an Elasticsearch query every second until query success, or a defined
+        length of time has passed."""
+        deadline = time.time() + timeout_secs
+        interval = 1
+        while not self.query_es_from_es(es_pod, uuid):
+            if time.time() + interval > deadline:
+                raise OpenShiftCheckException(
+                    "NoMatchFound",
+                    "expecting match in Elasticsearch for message with uuid {}, "
+                    "but no matches were found after {}s.".format(uuid, timeout_secs)
+                )
+            time.sleep(interval)
+
+    def curl_kibana_with_uuid(self, kibana_pod):
+        """curl Kibana with a unique uuid."""
+        uuid = self.generate_uuid()
+        pod_name = kibana_pod["metadata"]["name"]
+        exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+        exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+        error_str = self.exec_oc(exec_cmd, [])
+
+        try:
+            error_code = json.loads(error_str)["statusCode"]
+        except (KeyError, ValueError):
+            raise OpenShiftCheckException(
+                'kibanaInvalidResponse',
+                'invalid response returned from Kibana request:\n'
+                'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+            )
+
+        if error_code != 404:
+            raise OpenShiftCheckException(
+                'kibanaInvalidReturnCode',
+                'invalid error code returned from Kibana request.\n'
+                'Expecting error code "404", but got "{}" instead.'.format(error_code)
+            )
+
+        return uuid
+
+    def query_es_from_es(self, es_pod, uuid):
+        """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+        pod_name = es_pod["metadata"]["name"]
+        exec_cmd = (
+            "exec {pod_name} -- curl --max-time 30 -s -f "
+            "--cacert /etc/elasticsearch/secret/admin-ca "
+            "--cert /etc/elasticsearch/secret/admin-cert "
+            "--key /etc/elasticsearch/secret/admin-key "
+            "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+        )
+        exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+        result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
+
+        try:
+            count = json.loads(result)["count"]
+        except (KeyError, ValueError):
+            raise OpenShiftCheckException(
+                'esInvalidResponse',
+                'Invalid response from Elasticsearch query:\n'
+                '  {}\n'
+                'Response was:\n{}'.format(exec_cmd, result)
+            )
+
+        return count
+
+    @staticmethod
+    def running_pods(pods):
+        """Filter pods that are running."""
+        return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+    @staticmethod
+    def generate_uuid():
+        """Wrap uuid generator. Allows for testing with expected values."""
+        return str(uuid4())