Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/logging')
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/__init__.py             0
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/curator.py             43
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py      212
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd.py            154
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py     130
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/kibana.py             226
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging.py            101
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py 129
8 files changed, 995 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..b27f97172
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,43 @@
+"""Check for an aggregated logging Curator deployment"""
+
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
+
+
+class Curator(LoggingCheck):
+ """Check for an aggregated logging Curator deployment"""
+
+ name = "curator"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ curator_pods = self.get_pods_for_component("curator")
+ self.check_curator(curator_pods)
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def check_curator(self, pods):
+ """Check to see if curator is up and working. Returns: error string"""
+ if not pods:
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Curator pods for the logging stack,\n"
+ "so nothing will prune Elasticsearch indexes.\n"
+ "Is Curator correctly deployed?"
+ )
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ raise OpenShiftCheckException(
+ "CuratorNotRunning",
+ "The Curator pod is not currently in a running state,\n"
+ "so Elasticsearch indexes may increase without bound."
+ )
+ if len(pods) - len(not_running) > 1:
+ raise OpenShiftCheckException(
+ "TooManyCurators",
+ "There is more than one Curator pod running. This should not normally happen.\n"
+ "Although this doesn't cause any problems, you may want to investigate."
+ )
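check_curator above distinguishes three failure modes purely from pod counts. A minimal standalone sketch of that decision logic, with hypothetical pod fixtures shaped like `oc get pods -o json` items (the not-running filter here is simplified to the pod phase; the real check uses LoggingCheck.not_running_pods):

    # Hypothetical pod objects, shaped like items from `oc get pods -o json`.
    running = {"metadata": {"name": "curator-1"}, "status": {"phase": "Running"}}
    stopped = {"metadata": {"name": "curator-2"}, "status": {"phase": "Error"}}

    def classify(pods):
        """Mirror the three failure modes of Curator.check_curator (simplified)."""
        if not pods:
            return "MissingComponentPods"
        not_running = [p for p in pods if p["status"]["phase"] != "Running"]
        if len(not_running) == len(pods):
            return "CuratorNotRunning"
        if len(pods) - len(not_running) > 1:
            return "TooManyCurators"
        return "ok"

    assert classify([]) == "MissingComponentPods"
    assert classify([stopped]) == "CuratorNotRunning"
    assert classify([running, running]) == "TooManyCurators"
    assert classify([running, stopped]) == "ok"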
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..986a01f38
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,212 @@
+"""Check for an aggregated logging Elasticsearch deployment"""
+
+import json
+import re
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+ """Check for an aggregated logging Elasticsearch deployment"""
+
+ name = "elasticsearch"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ es_pods = self.get_pods_for_component("es")
+ self.check_elasticsearch(es_pods)
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def check_elasticsearch(self, es_pods):
+ """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+ running_pods, errors = self.running_elasticsearch_pods(es_pods)
+ pods_by_name = {
+ pod['metadata']['name']: pod for pod in running_pods
+ # Filter out pods that are not members of a DC
+ if pod['metadata'].get('labels', {}).get('deploymentconfig')
+ }
+ if not pods_by_name:
+ # nothing running, cannot run the rest of the check
+ errors.append(OpenShiftCheckException(
+ 'NoRunningPods',
+ 'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+ ))
+ raise OpenShiftCheckExceptionList(errors)
+
+ errors += self.check_elasticsearch_masters(pods_by_name)
+ errors += self.check_elasticsearch_node_list(pods_by_name)
+ errors += self.check_es_cluster_health(pods_by_name)
+ errors += self.check_elasticsearch_diskspace(pods_by_name)
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = self.not_running_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running]
+ if not_running:
+ return running_pods, [OpenShiftCheckException(
+ 'PodNotRunning',
+ 'The following Elasticsearch pods are defined but not running:\n'
+ '{pods}'.format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ )]
+ return running_pods, []
+
+ @staticmethod
+ def _build_es_curl_cmd(pod_name, url):
+ base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+ return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+ def check_elasticsearch_masters(self, pods_by_name):
+ """Check that Elasticsearch masters are sane. Returns: list of errors"""
+ es_master_names = set()
+ errors = []
+ for pod_name in pods_by_name.keys():
+ # Ask each ES node which node it sees as master, to detect split brain
+ get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+ master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
+ master_names = (master_name_str or '').split(' ')
+ if len(master_names) > 1:
+ es_master_names.add(master_names[1])
+ else:
+ errors.append(OpenShiftCheckException(
+ 'NoMasterName',
+ 'Elasticsearch {pod} gave an unexpected response when asked for its master name:\n'
+ ' {response}'.format(pod=pod_name, response=master_name_str)
+ ))
+
+ if not es_master_names:
+ errors.append(OpenShiftCheckException(
+ 'NoMasterFound',
+ 'No logging Elasticsearch masters were found.'
+ ))
+ return errors
+
+ if len(es_master_names) > 1:
+ errors.append(OpenShiftCheckException(
+ 'SplitBrainMasters',
+ 'Found multiple Elasticsearch masters according to the pods:\n'
+ '{master_list}\n'
+ 'This implies that the masters have "split brain" and are not correctly\n'
+ 'replicating data for the logging cluster. Log loss is likely to occur.'
+ .format(master_list='\n'.join(' ' + master for master in es_master_names))
+ ))
+
+ return errors
+
+ def check_elasticsearch_node_list(self, pods_by_name):
+ """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
+
+ if not pods_by_name:
+ return [OpenShiftCheckException(
+ 'MissingComponentPods',
+ 'No logging Elasticsearch pods were found.'
+ )]
+
+ # get ES cluster nodes
+ node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+ cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
+ try:
+ cluster_nodes = json.loads(cluster_node_data)['nodes']
+ except (ValueError, KeyError):
+ return [OpenShiftCheckException(
+ 'MissingNodeList',
+ 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+ cluster_node_data
+ )]
+
+ # Try to match all ES-reported node hosts to known pods.
+ errors = []
+ for node in cluster_nodes.values():
+ # Note that with 1.4/3.4 the pod IP may be used as the master name
+ if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+ for pod_name, pod in pods_by_name.items()):
+ errors.append(OpenShiftCheckException(
+ 'EsPodNodeMismatch',
+ 'The Elasticsearch cluster reports a member node "{node}"\n'
+ 'that does not correspond to any known ES pod.'.format(node=node['host'])
+ ))
+
+ return errors
+
+ def check_es_cluster_health(self, pods_by_name):
+ """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+ errors = []
+ for pod_name in pods_by_name.keys():
+ cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
+ try:
+ health_res = json.loads(cluster_health_data)
+ if not health_res or not health_res.get('status'):
+ raise ValueError()
+ except ValueError:
+ errors.append(OpenShiftCheckException(
+ 'BadEsResponse',
+ 'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+ 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+ ))
+ continue
+
+ if health_res['status'] not in ['green', 'yellow']:
+ errors.append(OpenShiftCheckException(
+ 'EsClusterHealthRed',
+ 'Elasticsearch cluster health status is "{}" (not green or yellow) according to pod "{}"'.format(health_res['status'], pod_name)
+ ))
+
+ return errors
+
+ def check_elasticsearch_diskspace(self, pods_by_name):
+ """
+ Exec into each ES pod and query the disk space usage on the persistent volume.
+ Returns: list of errors
+ """
+ errors = []
+ for pod_name in pods_by_name.keys():
+ df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+ disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
+ lines = disk_output.splitlines()
+ # expecting one header looking like 'IUse% Use%' and one body line
+ body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+ if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+ errors.append(OpenShiftCheckException(
+ 'BadDfResponse',
+ 'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+ 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+ ))
+ continue
+ inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+ inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
+ if int(inode_pct) >= int(inode_pct_thresh):
+ errors.append(OpenShiftCheckException(
+ 'InodeUsageTooHigh',
+ 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}%, greater than the threshold of {limit}%.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(inode_pct),
+ limit=str(inode_pct_thresh),
+ param='openshift_check_efk_es_inode_pct',
+ )))
+ disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
+ if int(disk_pct) >= int(disk_pct_thresh):
+ errors.append(OpenShiftCheckException(
+ 'DiskUsageTooHigh',
+ 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}%, greater than the threshold of {limit}%.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(disk_pct),
+ limit=str(disk_pct_thresh),
+ param='openshift_check_efk_es_storage_pct',
+ )))
+
+ return errors
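The disk-space check above expects `df --output=ipcent,pcent` to print exactly one two-column header and one body line. A quick sketch of the same parsing against illustrative output (the sample text is made up, not captured from a real pod):

    import re

    # Illustrative `df --output=ipcent,pcent` output, not from a real pod.
    disk_output = "IUse% Use%\n  4%  71%\n"
    lines = disk_output.splitlines()
    body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'

    # Same shape validation as check_elasticsearch_diskspace.
    assert len(lines) == 2 and len(lines[0].split()) == 2
    inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
    assert (inode_pct, disk_pct) == ('4', '71')
    assert int(inode_pct) < 90 and int(disk_pct) < 80  # default thresholds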
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..3b192a281
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,154 @@
+"""Check for an aggregated logging Fluentd deployment"""
+
+import json
+
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+ """Check for an aggregated logging Fluentd deployment"""
+
+ name = "fluentd"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check the Fluentd deployment and raise an error if any problems are found."""
+
+ fluentd_pods = self.get_pods_for_component("fluentd")
+ self.check_fluentd(fluentd_pods)
+ return {}
+
+ def check_fluentd(self, pods):
+ """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
+
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
+ )
+
+ nodes_by_name = self.get_nodes_by_name()
+ fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+
+ errors = []
+ errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+ errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+ errors += self.check_fluentd_pods_running(pods)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ errors.append(OpenShiftCheckException(
+ 'TooManyFluentdPods',
+ 'There are more Fluentd pods running than labeled nodes.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+ ))
+
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def get_nodes_by_name(self):
+ """Retrieve all the node definitions. Returns: dict(name: node)"""
+ nodes_json = self.exec_oc("get nodes -o json", [])
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ raise OpenShiftCheckException(
+ "BadOcNodeList",
+ "Could not obtain a list of nodes to validate fluentd.\n"
+ "Output from oc get:\n" + nodes_json
+ )
+ if not nodes or not nodes.get('items'): # also should not happen
+ raise OpenShiftCheckException(
+ "NoNodesDefined",
+ "No nodes appear to be defined according to the API."
+ )
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }
+
+ @staticmethod
+ def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node)"""
+ label, value = node_selector.split('=', 1)
+ fluentd_nodes = {
+ name: node for name, node in nodes_by_name.items()
+ if node['metadata']['labels'].get(label) == value
+ }
+ if not fluentd_nodes:
+ raise OpenShiftCheckException(
+ 'NoNodesLabeled',
+ 'There are no nodes with the fluentd label {label}.\n'
+ 'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+ )
+ return fluentd_nodes
+
+ def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+ """Note if nodes are not labeled as expected. Returns: error list"""
+ intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
+ if not intended_nodes or '--all' in intended_nodes:
+ intended_nodes = nodes_by_name.keys()
+ nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+ if nodes_missing_labels:
+ return [OpenShiftCheckException(
+ 'NodesUnlabeled',
+ 'The following nodes are supposed to be labeled with {label} but are not:\n'
+ ' {nodes}\n'
+ 'Fluentd will not aggregate logs from these nodes.'.format(
+ label=node_selector, nodes=', '.join(nodes_missing_labels)
+ ))]
+
+ return []
+
+ @staticmethod
+ def check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error list"""
+ unmatched_nodes = fluentd_nodes.copy()
+ node_names_by_label = {
+ node['metadata']['labels']['kubernetes.io/hostname']: name
+ for name, node in fluentd_nodes.items()
+ }
+ node_names_by_internal_ip = {
+ address['address']: name
+ for name, node in fluentd_nodes.items()
+ for address in node['status']['addresses']
+ if address['type'] == "InternalIP"
+ }
+ for pod in pods:
+ for name in [
+ pod['spec']['nodeName'],
+ node_names_by_internal_ip.get(pod['spec']['nodeName']),
+ node_names_by_label.get(pod.get('spec', {}).get('host')),
+ ]:
+ unmatched_nodes.pop(name, None)
+ if unmatched_nodes:
+ return [OpenShiftCheckException(
+ 'MissingFluentdPod',
+ 'The following nodes are supposed to have a Fluentd pod but do not:\n'
+ ' {nodes}\n'
+ 'These nodes will not have their logs aggregated.'.format(
+ nodes='\n '.join(unmatched_nodes.keys())
+ ))]
+
+ return []
+
+ def check_fluentd_pods_running(self, pods):
+ """Make sure all fluentd pods are running. Returns: error string"""
+ not_running = super(Fluentd, self).not_running_pods(pods)
+ if not_running:
+ return [OpenShiftCheckException(
+ 'FluentdNotRunning',
+ 'The following Fluentd pods are supposed to be running but are not:\n'
+ ' {pods}\n'
+ 'These pods will not aggregate logs from their nodes.'.format(
+ pods='\n'.join(
+ " {name} ({host})".format(
+ name=pod['metadata']['name'],
+ host=pod['spec'].get('host', 'None')
+ )
+ for pod in not_running
+ )
+ ))]
+
+ return []
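check_nodes_have_fluentd matches pods to labeled nodes by node name, by the kubernetes.io/hostname label, and by InternalIP, because a pod spec may name its node by any of these. A condensed sketch with hypothetical fixtures showing the InternalIP case:

    # Hypothetical node and pod fixtures; real data comes from `oc get ... -o json`.
    fluentd_nodes = {
        "node-1": {
            "metadata": {"labels": {"kubernetes.io/hostname": "node-1.example.com"}},
            "status": {"addresses": [{"type": "InternalIP", "address": "10.0.0.1"}]},
        },
    }
    pods = [{"spec": {"nodeName": "10.0.0.1"}}]  # pod names its node by internal IP

    node_names_by_internal_ip = {
        addr["address"]: name
        for name, node in fluentd_nodes.items()
        for addr in node["status"]["addresses"]
        if addr["type"] == "InternalIP"
    }
    unmatched_nodes = fluentd_nodes.copy()
    for pod in pods:
        unmatched_nodes.pop(pod["spec"]["nodeName"], None)  # direct node-name match
        unmatched_nodes.pop(node_names_by_internal_ip.get(pod["spec"]["nodeName"]), None)
    assert not unmatched_nodes  # every labeled node is covered by a fluentd pod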
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..e93cc9028
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,130 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+ """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+ name = "fluentd_config"
+ tags = ["health"]
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+ try:
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ except ValueError:
+ # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+ return logging_deployed
+
+ return logging_deployed and version < (3, 6)
+
+ def run(self):
+ """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+ config_error = self.check_logging_config()
+ if config_error:
+ msg = ("The following Fluentd logging configuration problem was found:"
+ "\n{}".format(config_error))
+ return {"failed": True, "msg": msg}
+
+ return {}
+
+ def check_logging_config(self):
+ """Ensure that the configured Docker logging driver matches fluentd settings.
+ This means that, at least for now, if the following condition is met:
+
+ openshift_logging_fluentd_use_journal == True
+
+ then the value of the configured Docker logging driver should be "journald".
+ Otherwise, the value of the Docker logging driver should be "json-file".
+ Returns an error string if the above condition is not met, or None otherwise."""
+ use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+ # if check is running on a master, retrieve all running pods
+ # and check any pod's container for the env var "USE_JOURNAL"
+ group_names = self.get_var("group_names")
+ if "oo_masters_to_config" in group_names:
+ use_journald = self.check_fluentd_env_var()
+
+ docker_info = self.execute_module("docker_info", {})
+ try:
+ logging_driver = docker_info["info"]["LoggingDriver"]
+ except KeyError:
+ return "Unable to determine Docker logging driver."
+
+ recommended_logging_driver = "journald"
+ error = None
+
+ # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+ # option as an inventory file variable, or adding the log driver value as part of the
+ # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+ # can be passed to the Docker binary; the only other recommendation that can be made, would be
+ # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+ # individual containers.
+ if use_journald and logging_driver != "journald":
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+ 'This differs from your Docker configuration, which has been set to use "{driver}" '
+ 'as the default method of storing logs.\n'
+ 'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+ elif not use_journald and logging_driver != "json-file":
+ recommended_logging_driver = "json-file"
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+ 'individual JSON log files per container.\n'
+ 'This differs from your Docker configuration, which has been set to use '
+ '"{driver}" as the default method of storing logs.\n'
+ 'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+
+ if error:
+ error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+ ' openshift_docker_options="--log-driver={driver}"\n\n'
+ 'Alternatively, you can add the following option to your Docker configuration, located in '
+ '"/etc/docker/daemon.json":\n\n'
+ '{{ "log-driver": "{driver}" }}\n\n'
+ 'See https://docs.docker.com/engine/admin/logging/json-file '
+ 'for more information.').format(driver=recommended_logging_driver)
+
+ return error
+
+ def check_fluentd_env_var(self):
+ """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+ running_pods = self.running_fluentd_pods()
+
+ try:
+ pod_containers = running_pods[0]["spec"]["containers"]
+ except KeyError:
+ return "Unable to detect running containers on selected Fluentd pod."
+
+ if not pod_containers:
+ msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+ 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+ raise OpenShiftCheckException(msg)
+
+ pod_env = pod_containers[0].get("env")
+ if not pod_env:
+ msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+ 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+ raise OpenShiftCheckException(msg)
+
+ for env in pod_env:
+ if env["name"] == "USE_JOURNAL":
+ return env.get("value", "false") != "false"
+
+ return False
+
+ def running_fluentd_pods(self):
+ """Return a list of running fluentd pods."""
+ fluentd_pods = self.get_pods_for_component("fluentd")
+
+ running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+ if not running_fluentd_pods:
+ raise OpenShiftCheckException(
+ 'No Fluentd pods were found to be in the "Running" state. '
+ 'At least one Fluentd pod is required in order to perform this check.'
+ )
+
+ return running_fluentd_pods
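The driver comparison in check_logging_config reduces to one rule: journald when openshift_logging_fluentd_use_journal is true, json-file otherwise. A tiny truth-table sketch of that rule (the helper functions are illustrative, not part of the module):

    def expected_docker_driver(use_journald):
        """The Docker logging driver that Fluentd's configuration implies."""
        return "journald" if use_journald else "json-file"

    def driver_mismatch(use_journald, logging_driver):
        """Return the recommended driver if Docker disagrees with Fluentd, else None."""
        expected = expected_docker_driver(use_journald)
        return expected if logging_driver != expected else None

    assert driver_mismatch(True, "json-file") == "journald"
    assert driver_mismatch(False, "journald") == "json-file"
    assert driver_mismatch(True, "journald") is None
    assert driver_mismatch(False, "json-file") is None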
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..3b1cf8baa
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,226 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+ from urllib2 import HTTPError, URLError
+ import urllib2
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+
+
+class Kibana(LoggingCheck):
+ """Module that checks an integrated logging Kibana deployment"""
+
+ name = "kibana"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ kibana_pods = self.get_pods_for_component("kibana")
+ self.check_kibana(kibana_pods)
+ self.check_kibana_route()
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def _verify_url_internal(self, url):
+ """
+ Try to reach a URL from the master host.
+ Returns: error message on failure, or None on success
+ """
+ args = dict(
+ url=url,
+ follow_redirects='none',
+ validate_certs='no', # likely to be signed with internal CA
+ # TODO(lmeyer): give users option to validate certs
+ status_code=302,
+ )
+ result = self.execute_module('uri', args)
+ if result.get('failed'):
+ return result['msg']
+ return None
+
+ @staticmethod
+ def _verify_url_external(url):
+ """
+ Try to reach a URL from the Ansible control host.
+ Returns: error message on failure, or None on success
+ """
+ # This actually checks from the ansible control host, which may or may not
+ # really be "external" to the cluster.
+
+ # Disable SSL cert validation to work around internally signed certs
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False # or setting CERT_NONE is refused
+ ctx.verify_mode = ssl.CERT_NONE
+
+ # Verify that the url is returning a valid response
+ try:
+ # We only care if the url connects and responds
+ return_code = urllib2.urlopen(url, context=ctx).getcode()
+ except HTTPError as httperr:
+ return httperr.reason
+ except URLError as urlerr:
+ return str(urlerr)
+
+ # there appears to be no way to prevent urlopen from following redirects
+ if return_code != 200:
+ return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+ return None
+
+ def check_kibana(self, pods):
+ """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
+
+ if not pods:
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Kibana pods deployed, so no access to the logging UI."
+ )
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ raise OpenShiftCheckException(
+ "NoRunningPods",
+ "No Kibana pod is in a running state, so there is no access to the logging UI."
+ )
+ elif not_running:
+ raise OpenShiftCheckException(
+ "PodNotRunning",
+ "The following Kibana pods are not currently in a running state:\n"
+ " {pods}\n"
+ "However at least one is, so service may not be impacted.".format(
+ pods="\n ".join(pod['metadata']['name'] for pod in not_running)
+ )
+ )
+
+ def _get_kibana_url(self):
+ """
+ Get kibana route or report error.
+ Returns: url
+ """
+
+ # Get logging url
+ get_route = self.exec_oc("get route logging-kibana -o json", [])
+ if not get_route:
+ raise OpenShiftCheckException(
+ 'no_route_exists',
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ )
+
+ try:
+ route = json.loads(get_route)
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ except (ValueError, KeyError):
+ raise OpenShiftCheckException(
+ 'get_route_failed',
+ '"oc get route" returned an unexpected response:\n' + get_route
+ )
+
+ # ingress can be null if there is no router, or empty if not routed
+ if not ingress or not ingress[0]:
+ raise OpenShiftCheckException(
+ 'route_not_accepted',
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ )
+
+ host = route.get("spec", {}).get("host")
+ if not host:
+ raise OpenShiftCheckException(
+ 'route_missing_host',
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ )
+
+ return 'https://{}/'.format(host)
+
+ def check_kibana_route(self):
+ """
+ Check to see if kibana route is up and working.
+ Raises exception if not.
+ """
+
+ kibana_url = self._get_kibana_url()
+
+ # first, check that kibana is reachable from the master.
+ error = self._verify_url_internal(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ raise OpenShiftCheckException(
+ 'FailedToConnectInternal',
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+ )
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ raise OpenShiftCheckException(
+ 'FailedToResolveInternal',
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'because the hostname does not resolve.\n'
+ 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+ )
+ elif 'Status code was not' in error:
+ raise OpenShiftCheckException(
+ 'WrongReturnCodeInternal',
+ 'A request from this master to the Kibana URL {url}\n'
+ 'did not return the correct status code (302).\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues. The output was:\n'
+ ' {error}'.format(url=kibana_url, error=error)
+ )
+ raise OpenShiftCheckException(
+ 'MiscRouteErrorInternal',
+ 'Error validating the logging Kibana route internally:\n' + error
+ )
+
+ # in production we would like the kibana route to work from outside the
+ # cluster too; but that may not be the case, so allow disabling just this part.
+ if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+ return
+ error = self._verify_url_external(kibana_url)
+
+ if not error:
+ return
+
+ error_fmt = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set the variable:\n'
+ ' openshift_check_efk_kibana_external=False'
+ )
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ msg = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ msg = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+ elif 'Expected success (200)' in error:
+ msg = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+ raise OpenShiftCheckException(
+ 'MiscRouteError',
+ 'Error validating the logging Kibana route externally:\n' + error
+ )
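_get_kibana_url validates three levels of the route document before building a URL. A minimal sketch of the same validation against a hypothetical `oc get route logging-kibana -o json` payload:

    import json

    # Hypothetical route payload; a real one comes from `oc get route -o json`.
    get_route = json.dumps({
        "spec": {"host": "kibana.example.com"},
        "status": {"ingress": [{"routerName": "router"}]},
    })

    route = json.loads(get_route)
    ingress = route["status"]["ingress"]   # KeyError here means a malformed response
    assert ingress and ingress[0]          # null/empty means no router admitted the route
    host = route.get("spec", {}).get("host")
    assert host                            # a route without a hostname is unusable
    assert "https://{}/".format(host) == "https://kibana.example.com/"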
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05ba73ca1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,101 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class MissingComponentPods(OpenShiftCheckException):
+ """Raised when a component has no pods in the namespace."""
+ pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+ """Raised when ocutil has a failure running oc."""
+ pass
+
+
+class LoggingCheck(OpenShiftCheck):
+ """Base class for OpenShift aggregated logging component checks"""
+
+ # FIXME: this should not be listed as a check, since it is not meant to be
+ # run by itself.
+
+ name = "logging"
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
+ return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
+
+ def run(self):
+ return {}
+
+ def get_pods_for_component(self, logging_component):
+ """Get all pods for a given component. Returns: list of pods."""
+ pod_output = self.exec_oc(
+ "get pods -l component={} -o json".format(logging_component),
+ [],
+ )
+ try:
+ pods = json.loads(pod_output) # raises ValueError if deserialize fails
+ if not pods or not pods.get('items'): # also a broken response, treat the same
+ raise ValueError()
+ except ValueError:
+ # a successful run with unparseable output generally means there were no pods to be found
+ raise MissingComponentPods(
+ 'There are no "{}" component pods in the "{}" namespace.\n'
+ 'Is logging deployed?'.format(logging_component, self.logging_namespace())
+ )
+
+ return pods['items']
+
+ @staticmethod
+ def not_running_pods(pods):
+ """Returns: list of pods not in a ready and running state"""
+ return [
+ pod for pod in pods
+ if not pod.get("status", {}).get("containerStatuses") or any(
+ container['ready'] is False
+ for container in pod['status']['containerStatuses']
+ ) or not any(
+ condition['type'] == 'Ready' and condition['status'] == 'True'
+ for condition in pod['status'].get('conditions', [])
+ )
+ ]
+
+ def logging_namespace(self):
+ """Returns the namespace in which logging is configured to deploy."""
+ return self.get_var("openshift_logging_namespace", default="logging")
+
+ def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
+ """
+ Execute an 'oc' command on the remote host.
+ Returns: output of the command,
+ or raises CouldNotUseOc on error
+ """
+ config_base = self.get_var("openshift", "common", "config_base")
+ args = {
+ "namespace": self.logging_namespace(),
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": cmd_str,
+ "extra_args": list(extra_args) if extra_args else [],
+ }
+
+ result = self.execute_module("ocutil", args, save_as_name=save_as_name)
+ if result.get("failed"):
+ if result['result'] == '[Errno 2] No such file or directory':
+ raise CouldNotUseOc(
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+
+ raise CouldNotUseOc(
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'.format(cmd=args['cmd'], error=result['result'])
+ )
+
+ return result.get("result", "")
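not_running_pods counts a pod as running only if every container reports ready and the pod has a Ready condition with status "True". A sketch of that predicate applied to two hypothetical pod documents:

    # Hypothetical pod documents shaped like the Kubernetes API output.
    ready_pod = {
        "status": {
            "containerStatuses": [{"ready": True}],
            "conditions": [{"type": "Ready", "status": "True"}],
        }
    }
    unready_pod = {
        "status": {
            "containerStatuses": [{"ready": False}],
            "conditions": [{"type": "Ready", "status": "False"}],
        }
    }

    def not_running(pods):
        # Same predicate as LoggingCheck.not_running_pods, inlined for illustration.
        return [
            pod for pod in pods
            if not pod.get("status", {}).get("containerStatuses") or any(
                c["ready"] is False for c in pod["status"]["containerStatuses"]
            ) or not any(
                cond["type"] == "Ready" and cond["status"] == "True"
                for cond in pod["status"].get("conditions", [])
            )
        ]

    assert not_running([ready_pod, unready_pod]) == [unready_pod]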
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..cacdf4213
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,129 @@
+"""
+ Check that logs from pods can be queried in Elasticsearch within a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+ """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
+ try:
+ log_index_timeout = int(
+ self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ raise OpenShiftCheckException(
+ 'InvalidTimeout',
+ 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer number of seconds.'
+ )
+
+ running_component_pods = dict()
+
+ # get all component pods
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods = self.get_pods_for_component(component)
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+ raise OpenShiftCheckException(
+ component + 'NoRunningPods',
+ 'No {} pods in the "Running" state were found. '
+ 'At least one pod is required in order to perform this check.'.format(name)
+ )
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+ """Retry an Elasticsearch query every second until query success, or a defined
+ length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid):
+ if time.time() + interval > deadline:
+ raise OpenShiftCheckException(
+ "NoMatchFound",
+ "expecting match in Elasticsearch for message with uuid {}, "
+ "but no matches were found after {}s.".format(uuid, timeout_secs)
+ )
+ time.sleep(interval)
+
+ def curl_kibana_with_uuid(self, kibana_pod):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(exec_cmd, [])
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'kibanaInvalidResponse',
+ 'invalid response returned from Kibana request:\n'
+ 'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+ )
+
+ if error_code != 404:
+ raise OpenShiftCheckException(
+ 'kibanaInvalidReturnCode',
+ 'invalid error code returned from Kibana request.\n'
+ 'Expecting error code "404", but got "{}" instead.'.format(error_code)
+ )
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+ result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
+
+ try:
+ count = json.loads(result)["count"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'esInvalidResponse',
+ 'Invalid response from Elasticsearch query:\n'
+ ' {}\n'
+ 'Response was:\n{}'.format(exec_cmd, result)
+ )
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
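wait_until_cmd_or_err polls once per second and gives up as soon as the next poll would cross the deadline, rather than after it. A standalone sketch of that pattern with a stubbed query (RuntimeError stands in for OpenShiftCheckException):

    import time

    def wait_until(predicate, timeout_secs, interval=1):
        """Poll `predicate` every `interval` seconds until it is true, or raise
        once the next poll would pass the deadline."""
        deadline = time.time() + timeout_secs
        while not predicate():
            if time.time() + interval > deadline:
                # RuntimeError stands in for OpenShiftCheckException here.
                raise RuntimeError("no match found after {}s".format(timeout_secs))
            time.sleep(interval)

    calls = []
    wait_until(lambda: calls.append(1) or len(calls) >= 2, timeout_secs=5)
    assert len(calls) == 2  # succeeded on the second poll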