From 04154a21237f9da6ee64bf57097d125b85b99a5f Mon Sep 17 00:00:00 2001
From: juanvallejo
Date: Mon, 26 Jun 2017 17:41:40 -0400
Subject: add fluentd logging driver config check

---
 .../openshift_checks/logging/curator.py        |   9 +-
 .../openshift_checks/logging/elasticsearch.py  |  22 +---
 .../openshift_checks/logging/fluentd.py        |  16 +--
 .../openshift_checks/logging/fluentd_config.py | 138 +++++++++++++++++++++
 .../openshift_checks/logging/kibana.py         |  18 +--
 5 files changed, 159 insertions(+), 44 deletions(-)
 create mode 100644 roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py

diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
index f82ae64d7..32d853d57 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/curator.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -9,11 +9,11 @@ class Curator(LoggingCheck):
     name = "curator"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
+        """Check various things and gather errors. Returns: result as hash"""
+
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        curator_pods, error = super(Curator, self).get_pods_for_component(
+        curator_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "curator",
         )
@@ -23,7 +23,6 @@ class Curator(LoggingCheck):
 
         if check_error:
             msg = ("The following Curator deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -39,7 +38,7 @@ class Curator(LoggingCheck):
                 "Is Curator correctly deployed?"
             )
 
-        not_running = super(Curator, self).not_running_pods(pods)
+        not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
             return (
                 "The Curator pod is not currently in a running state,\n"
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 1e478c04d..8bdda1f32 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -12,13 +12,11 @@ class Elasticsearch(LoggingCheck):
     name = "elasticsearch"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors.
         Returns: result as hash"""
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+        es_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "es",
         )
@@ -28,7 +26,6 @@ class Elasticsearch(LoggingCheck):
 
         if check_error:
             msg = ("The following Elasticsearch deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -37,7 +34,7 @@ class Elasticsearch(LoggingCheck):
 
     def _not_running_elasticsearch_pods(self, es_pods):
         """Returns: list of pods that are not running, list of errors about non-running pods"""
-        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+        not_running = self.not_running_pods(es_pods)
         if not_running:
             return not_running, [(
                 'The following Elasticsearch pods are not running:\n'
@@ -78,7 +75,7 @@ class Elasticsearch(LoggingCheck):
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self._exec_oc(get_master_cmd, [])
+            master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
@@ -111,7 +108,7 @@ class Elasticsearch(LoggingCheck):
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self._exec_oc(node_cmd, [])
+        cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
@@ -138,7 +135,7 @@ class Elasticsearch(LoggingCheck):
         error_msgs = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self._exec_oc(cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
@@ -165,7 +162,7 @@ class Elasticsearch(LoggingCheck):
         error_msgs = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self._exec_oc(df_cmd, [])
+            disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
@@ -201,10 +198,3 @@ class Elasticsearch(LoggingCheck):
                 ))
 
         return error_msgs
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Elasticsearch, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 063e707a9..b3485bf44 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -11,8 +11,6 @@ class Fluentd(LoggingCheck):
     name = "fluentd"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors.
         Returns: result as hash"""
@@ -27,7 +25,6 @@ class Fluentd(LoggingCheck):
 
         if check_error:
             msg = ("The following Fluentd deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -147,7 +144,11 @@ class Fluentd(LoggingCheck):
 
     def get_nodes_by_name(self):
         """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self._exec_oc("get nodes -o json", [])
+        nodes_json = self.exec_oc(
+            self.logging_namespace,
+            "get nodes -o json",
+            []
+        )
         try:
             nodes = json.loads(nodes_json)
         except ValueError:  # no valid json - should not happen
@@ -158,10 +159,3 @@ class Fluentd(LoggingCheck):
             node['metadata']['name']: node
             for node in nodes['items']
         }, None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Fluentd, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..0970f0a63
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,138 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+    """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+    name = "fluentd_config"
+    tags = ["health"]
+
+    def is_active(self):
+        logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+        try:
+            version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+        except ValueError:
+            # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+            return logging_deployed
+
+        return logging_deployed and version < (3, 6)
+
+    def run(self):
+        """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
+        config_error = self.check_logging_config()
+        if config_error:
+            msg = ("The following Fluentd logging configuration problem was found:"
+                   "\n{}".format(config_error))
+            return {"failed": True, "msg": msg}
+
+        return {}
+
+    def check_logging_config(self):
+        """Ensure that the configured Docker logging driver matches fluentd settings.
+        This means that, at least for now, if the following condition is met:
+
+            openshift_logging_fluentd_use_journal == True
+
+        then the value of the configured Docker logging driver should be "journald".
+        Otherwise, the value of the Docker logging driver should be "json-file".
+        Returns an error string if the above condition is not met, or None otherwise."""
+        use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+        # if check is running on a master, retrieve all running pods
+        # and check any pod's container for the env var "USE_JOURNAL"
+        group_names = self.get_var("group_names")
+        if "masters" in group_names:
+            use_journald = self.check_fluentd_env_var()
+
+        docker_info = self.execute_module("docker_info", {})
+        try:
+            logging_driver = docker_info["info"]["LoggingDriver"]
+        except KeyError:
+            return "Unable to determine Docker logging driver."
+
+        logging_driver = docker_info["info"]["LoggingDriver"]
+        recommended_logging_driver = "journald"
+        error = None
+
+        # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+        # option as an inventory file variable, or adding the log driver value as part of the
+        # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+        # can be passed to the Docker binary; the only other recommendation that can be made, would be
+        # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+        # individual containers.
+        if use_journald and logging_driver != "journald":
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+                     'This differs from your Docker configuration, which has been set to use "{driver}" '
+                     'as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+                     'from your Docker containers.').format(driver=logging_driver)
+        elif not use_journald and logging_driver != "json-file":
+            recommended_logging_driver = "json-file"
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+                     'individual json log files per container.\n'
+                     'This differs from your Docker configuration, which has been set to use '
+                     '"{driver}" as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+                     'from your Docker containers.').format(driver=logging_driver)
+
+        if error:
+            error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+                      '  openshift_docker_options="--log-driver={driver}"\n\n'
+                      'Alternatively, you can add the following option to your Docker configuration, located in '
+                      '"/etc/docker/daemon.json":\n\n'
+                      '{{ "log-driver": "{driver}" }}\n\n'
+                      'See https://docs.docker.com/engine/admin/logging/json-file '
+                      'for more information.').format(driver=recommended_logging_driver)
+
+        return error
+
+    def check_fluentd_env_var(self):
+        """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+        running_pods = self.running_fluentd_pods()
+
+        try:
+            pod_containers = running_pods[0]["spec"]["containers"]
+        except KeyError:
+            return "Unable to detect running containers on selected Fluentd pod."
+
+        if not pod_containers:
+            msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+                   'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+            raise OpenShiftCheckException(msg)
+
+        pod_env = pod_containers[0].get("env")
+        if not pod_env:
+            msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+                   'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+            raise OpenShiftCheckException(msg)
+
+        for env in pod_env:
+            if env["name"] == "USE_JOURNAL":
+                return env.get("value", "false") != "false"
+
+        return False
+
+    def running_fluentd_pods(self):
+        """Return a list of running fluentd pods."""
+        fluentd_pods, error = self.get_pods_for_component(
+            self.logging_namespace,
+            "fluentd",
+        )
+        if error:
+            msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error)
+            raise OpenShiftCheckException(msg)
+
+        running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+        if not running_fluentd_pods:
+            msg = ('No Fluentd pods were found to be in the "Running" state. '
+                   'At least one Fluentd pod is required in order to perform this check.')
+
+            raise OpenShiftCheckException(msg)
+
+        return running_fluentd_pods
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 60f94e106..efb14ab42 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -21,13 +21,11 @@ class Kibana(LoggingCheck):
     name = "kibana"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors.
         Returns: result as hash"""
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        kibana_pods, error = super(Kibana, self).get_pods_for_component(
+        kibana_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "kibana",
         )
@@ -40,7 +38,6 @@ class Kibana(LoggingCheck):
 
         if check_error:
             msg = ("The following Kibana deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -118,7 +115,11 @@ class Kibana(LoggingCheck):
         """
 
         # Get logging url
-        get_route = self._exec_oc("get route logging-kibana -o json", [])
+        get_route = self.exec_oc(
+            self.logging_namespace,
+            "get route logging-kibana -o json",
+            [],
+        )
         if not get_route:
             return None, 'no_route_exists'
 
@@ -217,10 +218,3 @@ class Kibana(LoggingCheck):
             ).format(error=error)
             return error
         return None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Kibana, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
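
Not part of the patch above: a minimal pytest-style sketch of how the journald/json-file pairing enforced by check_logging_config could be exercised. It assumes the OpenShiftCheck base class accepts an execute_module callable and task_vars in its constructor and forwards extra arguments (tmp, task_vars) to that callable; the fixture values below are invented for illustration only.

import pytest

from openshift_checks.logging.fluentd_config import FluentdConfig


@pytest.mark.parametrize("use_journald, driver, expect_error", [
    (True, "journald", False),    # fluentd reads the journal, Docker writes to it: no error
    (False, "json-file", False),  # fluentd reads per-container json files, Docker writes them: no error
    (True, "json-file", True),    # mismatch: fluentd expects journald
    (False, "journald", True),    # mismatch: fluentd expects json-file
])
def test_check_logging_config(use_journald, driver, expect_error):
    # Stand-in for the Ansible facts the check reads via get_var();
    # "nodes" (not "masters") keeps the check from calling check_fluentd_env_var().
    task_vars = dict(
        openshift_logging_fluentd_use_journal=use_journald,
        group_names=["nodes"],
    )

    def execute_module(module_name, *_):
        # The check only invokes the docker_info module; fake its reply.
        assert module_name == "docker_info"
        return {"info": {"LoggingDriver": driver}}

    check = FluentdConfig(execute_module, task_vars)
    error = check.check_logging_config()
    assert bool(error) == expect_error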