diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/logging')
7 files changed, 357 insertions, 133 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index c9fc59896..32d853d57 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -1,28 +1,21 @@ -""" -Module for performing checks on an Curator logging deployment -""" +"""Check for an aggregated logging Curator deployment""" -from openshift_checks import get_var from openshift_checks.logging.logging import LoggingCheck class Curator(LoggingCheck): - """Module that checks an integrated logging Curator deployment""" + """Check for an aggregated logging Curator deployment""" name = "curator" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - curator_pods, error = super(Curator, self).get_pods_for_component( - self.module_executor, + self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") + curator_pods, error = self.get_pods_for_component( self.logging_namespace, "curator", - task_vars ) if error: return {"failed": True, "changed": False, "msg": error} @@ -30,7 +23,6 @@ class Curator(LoggingCheck): if check_error: msg = ("The following Curator deployment issue was found:" - "\n-------\n" "{}".format(check_error)) return {"failed": True, "changed": False, "msg": msg} @@ -46,7 +38,7 @@ class Curator(LoggingCheck): "Is Curator correctly deployed?" ) - not_running = super(Curator, self).not_running_pods(pods) + not_running = self.not_running_pods(pods) if len(not_running) == len(pods): return ( "The Curator pod is not currently in a running state,\n" diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index 01cb35b81..8bdda1f32 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -1,39 +1,31 @@ -""" -Module for performing checks on an Elasticsearch logging deployment -""" +"""Check for an aggregated logging Elasticsearch deployment""" import json import re -from openshift_checks import get_var from openshift_checks.logging.logging import LoggingCheck class Elasticsearch(LoggingCheck): - """Module that checks an integrated logging Elasticsearch deployment""" + """Check for an aggregated logging Elasticsearch deployment""" name = "elasticsearch" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - es_pods, error = super(Elasticsearch, self).get_pods_for_component( - self.execute_module, + self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") + es_pods, error = self.get_pods_for_component( self.logging_namespace, "es", - task_vars, ) if error: return {"failed": True, "changed": False, "msg": error} - check_error = self.check_elasticsearch(es_pods, task_vars) + check_error = self.check_elasticsearch(es_pods) if check_error: msg = ("The following Elasticsearch deployment issue was found:" - "\n-------\n" "{}".format(check_error)) return {"failed": True, "changed": False, "msg": msg} @@ -41,8 +33,8 @@ class Elasticsearch(LoggingCheck): return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'} def _not_running_elasticsearch_pods(self, es_pods): - """Returns: list of running pods, list of errors about non-running pods""" - not_running = super(Elasticsearch, self).not_running_pods(es_pods) + """Returns: list of pods that are not running, list of errors about non-running pods""" + not_running = self.not_running_pods(es_pods) if not_running: return not_running, [( 'The following Elasticsearch pods are not running:\n' @@ -54,7 +46,7 @@ class Elasticsearch(LoggingCheck): ))] return not_running, [] - def check_elasticsearch(self, es_pods, task_vars): + def check_elasticsearch(self, es_pods): """Various checks for elasticsearch. Returns: error string""" not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) running_pods = [pod for pod in es_pods if pod not in not_running_pods] @@ -65,10 +57,10 @@ class Elasticsearch(LoggingCheck): } if not pods_by_name: return 'No logging Elasticsearch pods were found. Is logging deployed?' - error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars) - error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars) - error_msgs += self._check_es_cluster_health(pods_by_name, task_vars) - error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars) + error_msgs += self._check_elasticsearch_masters(pods_by_name) + error_msgs += self._check_elasticsearch_node_list(pods_by_name) + error_msgs += self._check_es_cluster_health(pods_by_name) + error_msgs += self._check_elasticsearch_diskspace(pods_by_name) return '\n'.join(error_msgs) @staticmethod @@ -76,14 +68,14 @@ class Elasticsearch(LoggingCheck): base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) - def _check_elasticsearch_masters(self, pods_by_name, task_vars): + def _check_elasticsearch_masters(self, pods_by_name): """Check that Elasticsearch masters are sane. Returns: list of error strings""" es_master_names = set() error_msgs = [] for pod_name in pods_by_name.keys(): # Compare what each ES node reports as master and compare for split brain get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") - master_name_str = self._exec_oc(get_master_cmd, [], task_vars) + master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, []) master_names = (master_name_str or '').split(' ') if len(master_names) > 1: es_master_names.add(master_names[1]) @@ -108,7 +100,7 @@ class Elasticsearch(LoggingCheck): return error_msgs - def _check_elasticsearch_node_list(self, pods_by_name, task_vars): + def _check_elasticsearch_node_list(self, pods_by_name): """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" if not pods_by_name: @@ -116,7 +108,7 @@ class Elasticsearch(LoggingCheck): # get ES cluster nodes node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') - cluster_node_data = self._exec_oc(node_cmd, [], task_vars) + cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, []) try: cluster_nodes = json.loads(cluster_node_data)['nodes'] except (ValueError, KeyError): @@ -138,12 +130,12 @@ class Elasticsearch(LoggingCheck): return error_msgs - def _check_es_cluster_health(self, pods_by_name, task_vars): + def _check_es_cluster_health(self, pods_by_name): """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" error_msgs = [] for pod_name in pods_by_name.keys(): cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') - cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars) + cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, []) try: health_res = json.loads(cluster_health_data) if not health_res or not health_res.get('status'): @@ -162,7 +154,7 @@ class Elasticsearch(LoggingCheck): return error_msgs - def _check_elasticsearch_diskspace(self, pods_by_name, task_vars): + def _check_elasticsearch_diskspace(self, pods_by_name): """ Exec into an ES pod and query the diskspace on the persistent volume. Returns: list of errors @@ -170,7 +162,7 @@ class Elasticsearch(LoggingCheck): error_msgs = [] for pod_name in pods_by_name.keys(): df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) - disk_output = self._exec_oc(df_cmd, [], task_vars) + disk_output = self.exec_oc(self.logging_namespace, df_cmd, []) lines = disk_output.splitlines() # expecting one header looking like 'IUse% Use%' and one body line body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' @@ -182,7 +174,7 @@ class Elasticsearch(LoggingCheck): continue inode_pct, disk_pct = re.match(body_re, lines[1]).groups() - inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90') + inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90') if int(inode_pct) >= int(inode_pct_thresh): error_msgs.append( 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' @@ -193,7 +185,7 @@ class Elasticsearch(LoggingCheck): limit=str(inode_pct_thresh), param='openshift_check_efk_es_inode_pct', )) - disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80') + disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80') if int(disk_pct) >= int(disk_pct_thresh): error_msgs.append( 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' @@ -206,12 +198,3 @@ class Elasticsearch(LoggingCheck): )) return error_msgs - - def _exec_oc(self, cmd_str, extra_args, task_vars): - return super(Elasticsearch, self).exec_oc( - self.execute_module, - self.logging_namespace, - cmd_str, - extra_args, - task_vars, - ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index 627567293..b3485bf44 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -1,37 +1,30 @@ -""" -Module for performing checks on an Fluentd logging deployment -""" +"""Check for an aggregated logging Fluentd deployment""" import json -from openshift_checks import get_var from openshift_checks.logging.logging import LoggingCheck class Fluentd(LoggingCheck): - """Module that checks an integrated logging Fluentd deployment""" + """Check for an aggregated logging Fluentd deployment""" + name = "fluentd" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") fluentd_pods, error = super(Fluentd, self).get_pods_for_component( - self.execute_module, self.logging_namespace, "fluentd", - task_vars, ) if error: return {"failed": True, "changed": False, "msg": error} - check_error = self.check_fluentd(fluentd_pods, task_vars) + check_error = self.check_fluentd(fluentd_pods) if check_error: msg = ("The following Fluentd deployment issue was found:" - "\n-------\n" "{}".format(check_error)) return {"failed": True, "changed": False, "msg": msg} @@ -53,10 +46,9 @@ class Fluentd(LoggingCheck): ).format(label=node_selector) return fluentd_nodes, None - @staticmethod - def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars): + def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): """Note if nodes are not labeled as expected. Returns: error string""" - intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all']) + intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all']) if not intended_nodes or '--all' in intended_nodes: intended_nodes = nodes_by_name.keys() nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) @@ -114,13 +106,15 @@ class Fluentd(LoggingCheck): )) return None - def check_fluentd(self, pods, task_vars): + def check_fluentd(self, pods): """Verify fluentd is running everywhere. Returns: error string""" - node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector', - default='logging-infra-fluentd=true') + node_selector = self.get_var( + 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true' + ) - nodes_by_name, error = self.get_nodes_by_name(task_vars) + nodes_by_name, error = self.get_nodes_by_name() if error: return error @@ -129,7 +123,7 @@ class Fluentd(LoggingCheck): return error error_msgs = [] - error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars) + error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) if error: error_msgs.append(error) error = self._check_nodes_have_fluentd(pods, fluentd_nodes) @@ -148,9 +142,13 @@ class Fluentd(LoggingCheck): return '\n'.join(error_msgs) - def get_nodes_by_name(self, task_vars): + def get_nodes_by_name(self): """Retrieve all the node definitions. Returns: dict(name: node), error string""" - nodes_json = self._exec_oc("get nodes -o json", [], task_vars) + nodes_json = self.exec_oc( + self.logging_namespace, + "get nodes -o json", + [] + ) try: nodes = json.loads(nodes_json) except ValueError: # no valid json - should not happen @@ -161,10 +159,3 @@ class Fluentd(LoggingCheck): node['metadata']['name']: node for node in nodes['items'] }, None - - def _exec_oc(self, cmd_str, extra_args, task_vars): - return super(Fluentd, self).exec_oc(self.execute_module, - self.logging_namespace, - cmd_str, - extra_args, - task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py new file mode 100644 index 000000000..0970f0a63 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -0,0 +1,138 @@ +""" +Module for performing checks on a Fluentd logging deployment configuration +""" + +from openshift_checks import OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck + + +class FluentdConfig(LoggingCheck): + """Module that checks logging configuration of an integrated logging Fluentd deployment""" + name = "fluentd_config" + tags = ["health"] + + def is_active(self): + logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) + + try: + version = self.get_major_minor_version(self.get_var("openshift_image_tag")) + except ValueError: + # if failed to parse OpenShift version, perform check anyway (if logging enabled) + return logging_deployed + + return logging_deployed and version < (3, 6) + + def run(self): + """Check that Fluentd has running pods, and that its logging config matches Docker's logging config.""" + self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) + config_error = self.check_logging_config() + if config_error: + msg = ("The following Fluentd logging configuration problem was found:" + "\n{}".format(config_error)) + return {"failed": True, "msg": msg} + + return {} + + def check_logging_config(self): + """Ensure that the configured Docker logging driver matches fluentd settings. + This means that, at least for now, if the following condition is met: + + openshift_logging_fluentd_use_journal == True + + then the value of the configured Docker logging driver should be "journald". + Otherwise, the value of the Docker logging driver should be "json-file". + Returns an error string if the above condition is not met, or None otherwise.""" + use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True) + + # if check is running on a master, retrieve all running pods + # and check any pod's container for the env var "USE_JOURNAL" + group_names = self.get_var("group_names") + if "masters" in group_names: + use_journald = self.check_fluentd_env_var() + + docker_info = self.execute_module("docker_info", {}) + try: + logging_driver = docker_info["info"]["LoggingDriver"] + except KeyError: + return "Unable to determine Docker logging driver." + + logging_driver = docker_info["info"]["LoggingDriver"] + recommended_logging_driver = "journald" + error = None + + # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver` + # option as an inventory file variable, or adding the log driver value as part of the + # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that + # can be passed to the Docker binary; the only other recommendation that can be made, would be + # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running + # individual containers. + if use_journald and logging_driver != "journald": + error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n' + 'This differs from your Docker configuration, which has been set to use "{driver}" ' + 'as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + elif not use_journald and logging_driver != "json-file": + recommended_logging_driver = "json-file" + error = ('Your Fluentd configuration is set to aggregate Docker container logs from ' + 'individual json log files per container.\n ' + 'This differs from your Docker configuration, which has been set to use ' + '"{driver}" as the default method of storing logs.\n' + 'This discrepancy in configuration will prevent Fluentd from receiving any logs' + 'from your Docker containers.').format(driver=logging_driver) + + if error: + error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n' + ' openshift_docker_options="--log-driver={driver}"\n\n' + 'Alternatively, you can add the following option to your Docker configuration, located in' + '"/etc/docker/daemon.json":\n\n' + '{{ "log-driver": "{driver}" }}\n\n' + 'See https://docs.docker.com/engine/admin/logging/json-file ' + 'for more information.').format(driver=recommended_logging_driver) + + return error + + def check_fluentd_env_var(self): + """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod.""" + running_pods = self.running_fluentd_pods() + + try: + pod_containers = running_pods[0]["spec"]["containers"] + except KeyError: + return "Unable to detect running containers on selected Fluentd pod." + + if not pod_containers: + msg = ('There are no running containers on selected Fluentd pod "{}".\n' + 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", "")) + raise OpenShiftCheckException(msg) + + pod_env = pod_containers[0].get("env") + if not pod_env: + msg = ('There are no environment variables set on the Fluentd container "{}".\n' + 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name")) + raise OpenShiftCheckException(msg) + + for env in pod_env: + if env["name"] == "USE_JOURNAL": + return env.get("value", "false") != "false" + + return False + + def running_fluentd_pods(self): + """Return a list of running fluentd pods.""" + fluentd_pods, error = self.get_pods_for_component( + self.logging_namespace, + "fluentd", + ) + if error: + msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error) + raise OpenShiftCheckException(msg) + + running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running'] + if not running_fluentd_pods: + msg = ('No Fluentd pods were found to be in the "Running" state. ' + 'At least one Fluentd pod is required in order to perform this check.') + + raise OpenShiftCheckException(msg) + + return running_fluentd_pods diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index 442f407b1..efb14ab42 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -12,7 +12,6 @@ except ImportError: from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks import get_var from openshift_checks.logging.logging import LoggingCheck @@ -22,35 +21,30 @@ class Kibana(LoggingCheck): name = "kibana" tags = ["health", "logging"] - logging_namespace = None - - def run(self, tmp, task_vars): + def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") - kibana_pods, error = super(Kibana, self).get_pods_for_component( - self.execute_module, + self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") + kibana_pods, error = self.get_pods_for_component( self.logging_namespace, "kibana", - task_vars, ) if error: return {"failed": True, "changed": False, "msg": error} check_error = self.check_kibana(kibana_pods) if not check_error: - check_error = self._check_kibana_route(task_vars) + check_error = self._check_kibana_route() if check_error: msg = ("The following Kibana deployment issue was found:" - "\n-------\n" "{}".format(check_error)) return {"failed": True, "changed": False, "msg": msg} # TODO(lmeyer): run it all again for the ops cluster return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'} - def _verify_url_internal(self, url, task_vars): + def _verify_url_internal(self, url): """ Try to reach a URL from the host. Returns: success (bool), reason (for failure) @@ -62,7 +56,7 @@ class Kibana(LoggingCheck): # TODO(lmeyer): give users option to validate certs status_code=302, ) - result = self.execute_module('uri', args, task_vars) + result = self.execute_module('uri', args) if result.get('failed'): return result['msg'] return None @@ -114,14 +108,18 @@ class Kibana(LoggingCheck): return None - def _get_kibana_url(self, task_vars): + def _get_kibana_url(self): """ Get kibana route or report error. Returns: url (or empty), reason for failure """ # Get logging url - get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars) + get_route = self.exec_oc( + self.logging_namespace, + "get route logging-kibana -o json", + [], + ) if not get_route: return None, 'no_route_exists' @@ -139,7 +137,7 @@ class Kibana(LoggingCheck): return 'https://{}/'.format(host), None - def _check_kibana_route(self, task_vars): + def _check_kibana_route(self): """ Check to see if kibana route is up and working. Returns: error string @@ -160,12 +158,12 @@ class Kibana(LoggingCheck): ), ) - kibana_url, error = self._get_kibana_url(task_vars) + kibana_url, error = self._get_kibana_url() if not kibana_url: return known_errors.get(error, error) # first, check that kibana is reachable from the master. - error = self._verify_url_internal(kibana_url, task_vars) + error = self._verify_url_internal(kibana_url) if error: if 'urlopen error [Errno 111] Connection refused' in error: error = ( @@ -190,7 +188,7 @@ class Kibana(LoggingCheck): # in production we would like the kibana route to work from outside the # cluster too; but that may not be the case, so allow disabling just this part. - if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True): + if not self.get_var("openshift_check_efk_kibana_external", default=True): return None error = self._verify_url_external(kibana_url) if error: @@ -220,10 +218,3 @@ class Kibana(LoggingCheck): ).format(error=error) return error return None - - def _exec_oc(self, cmd_str, extra_args, task_vars): - return super(Kibana, self).exec_oc(self.execute_module, - self.logging_namespace, - cmd_str, - extra_args, - task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 05b4d300c..43ba6c406 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -5,39 +5,39 @@ Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana st import json import os -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks import OpenShiftCheck, OpenShiftCheckException class LoggingCheck(OpenShiftCheck): - """Base class for logging component checks""" + """Base class for OpenShift aggregated logging component checks""" + + # FIXME: this should not be listed as a check, since it is not meant to be + # run by itself. name = "logging" + logging_namespace = "logging" - @classmethod - def is_active(cls, task_vars): - return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars) + def is_active(self): + logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) + return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master() - @staticmethod - def is_first_master(task_vars): - """Run only on first master and only when logging is configured. Returns: bool""" - logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True) + def is_first_master(self): + """Determine if running on first master. Returns: bool""" # Note: It would be nice to use membership in oo_first_master group, however for now it # seems best to avoid requiring that setup and just check this is the first master. - hostname = get_var(task_vars, "ansible_ssh_host") or [None] - masters = get_var(task_vars, "groups", "masters", default=None) or [None] - return logging_deployed and masters[0] == hostname + hostname = self.get_var("ansible_ssh_host") or [None] + masters = self.get_var("groups", "masters", default=None) or [None] + return masters[0] == hostname - def run(self, tmp, task_vars): - pass + def run(self): + return {} - def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars): + def get_pods_for_component(self, namespace, logging_component): """Get all pods for a given component. Returns: list of pods for component, error string""" pod_output = self.exec_oc( - execute_module, namespace, "get pods -l component={} -o json".format(logging_component), [], - task_vars ) try: pods = json.loads(pod_output) @@ -45,7 +45,7 @@ class LoggingCheck(OpenShiftCheck): raise ValueError() except ValueError: # successful run but non-parsing data generally means there were no pods in the namespace - return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace) + return None, 'No pods were found for the "{}" logging component.'.format(logging_component) return pods['items'], None @@ -54,23 +54,22 @@ class LoggingCheck(OpenShiftCheck): """Returns: list of pods not in a ready and running state""" return [ pod for pod in pods - if any( + if not pod.get("status", {}).get("containerStatuses") or any( container['ready'] is False for container in pod['status']['containerStatuses'] ) or not any( condition['type'] == 'Ready' and condition['status'] == 'True' - for condition in pod['status']['conditions'] + for condition in pod['status'].get('conditions', []) ) ] - @staticmethod - def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None): + def exec_oc(self, namespace="logging", cmd_str="", extra_args=None): """ Execute an 'oc' command in the remote host. Returns: output of command and namespace, or raises OpenShiftCheckException on error """ - config_base = get_var(task_vars, "openshift", "common", "config_base") + config_base = self.get_var("openshift", "common", "config_base") args = { "namespace": namespace, "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), @@ -78,7 +77,7 @@ class LoggingCheck(OpenShiftCheck): "extra_args": list(extra_args) if extra_args else [], } - result = execute_module("ocutil", args, task_vars) + result = self.execute_module("ocutil", args) if result.get("failed"): msg = ( 'Unexpected error using `oc` to validate the logging stack components.\n' diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py new file mode 100644 index 000000000..b24e88e05 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -0,0 +1,130 @@ +""" +Check for ensuring logs from pods can be queried in a reasonable amount of time. +""" + +import json +import time + +from uuid import uuid4 + +from openshift_checks import OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck + + +ES_CMD_TIMEOUT_SECONDS = 30 + + +class LoggingIndexTime(LoggingCheck): + """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time.""" + name = "logging_index_time" + tags = ["health", "logging"] + + logging_namespace = "logging" + + def run(self): + """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.""" + try: + log_index_timeout = int( + self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS) + ) + except ValueError: + return { + "failed": True, + "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' + 'Value must be an integer representing an amount in seconds.'), + } + + running_component_pods = dict() + + # get all component pods + self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) + for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): + pods, error = self.get_pods_for_component(self.logging_namespace, component) + + if error: + msg = 'Unable to retrieve pods for the {} logging component: {}' + return {"failed": True, "changed": False, "msg": msg.format(name, error)} + + running_pods = self.running_pods(pods) + + if not running_pods: + msg = ('No {} pods in the "Running" state were found.' + 'At least one pod is required in order to perform this check.') + return {"failed": True, "changed": False, "msg": msg.format(name)} + + running_component_pods[component] = running_pods + + uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0]) + self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout) + return {} + + def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs): + """Retry an Elasticsearch query every second until query success, or a defined + length of time has passed.""" + deadline = time.time() + timeout_secs + interval = 1 + while not self.query_es_from_es(es_pod, uuid): + if time.time() + interval > deadline: + msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s." + raise OpenShiftCheckException(msg.format(uuid, timeout_secs)) + time.sleep(interval) + + def curl_kibana_with_uuid(self, kibana_pod): + """curl Kibana with a unique uuid.""" + uuid = self.generate_uuid() + pod_name = kibana_pod["metadata"]["name"] + exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" + exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) + + error_str = self.exec_oc(self.logging_namespace, exec_cmd, []) + + try: + error_code = json.loads(error_str)["statusCode"] + except KeyError: + msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n' + 'Command: {}\nResponse: {}').format(exec_cmd, error_str) + raise OpenShiftCheckException(msg) + except ValueError: + msg = ('invalid response returned from Kibana request (Non-JSON output):\n' + 'Command: {}\nResponse: {}').format(exec_cmd, error_str) + raise OpenShiftCheckException(msg) + + if error_code != 404: + msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.' + raise OpenShiftCheckException(msg.format(error_code)) + + return uuid + + def query_es_from_es(self, es_pod, uuid): + """curl the Elasticsearch pod and look for a unique uuid in its logs.""" + pod_name = es_pod["metadata"]["name"] + exec_cmd = ( + "exec {pod_name} -- curl --max-time 30 -s -f " + "--cacert /etc/elasticsearch/secret/admin-ca " + "--cert /etc/elasticsearch/secret/admin-cert " + "--key /etc/elasticsearch/secret/admin-key " + "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" + ) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid) + result = self.exec_oc(self.logging_namespace, exec_cmd, []) + + try: + count = json.loads(result)["count"] + except KeyError: + msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}' + raise OpenShiftCheckException(msg.format(exec_cmd, result)) + except ValueError: + msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}' + raise OpenShiftCheckException(msg.format(exec_cmd, result)) + + return count + + @staticmethod + def running_pods(pods): + """Filter pods that are running.""" + return [pod for pod in pods if pod['status']['phase'] == 'Running'] + + @staticmethod + def generate_uuid(): + """Wrap uuid generator. Allows for testing with expected values.""" + return str(uuid4()) |