13 files changed, 1575 insertions, 6 deletions
diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py
new file mode 100644
index 000000000..2e60735d6
--- /dev/null
+++ b/roles/openshift_health_checker/library/ocutil.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python
+"""Interface to OpenShift oc command"""
+
+import os
+import shlex
+import shutil
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+ADDITIONAL_PATH_LOOKUPS = ['/usr/local/bin', os.path.expanduser('~/bin')]
+
+
+def locate_oc_binary():
+    """Find and return oc binary file"""
+    # https://github.com/openshift/openshift-ansible/issues/3410
+    # oc can be in /usr/local/bin in some cases, but that may not
+    # be in $PATH due to ansible/sudo
+    paths = os.environ.get("PATH", os.defpath).split(os.pathsep) + ADDITIONAL_PATH_LOOKUPS
+
+    oc_binary = 'oc'
+
+    # Use shutil.which if it is available, otherwise fall back to a naive path search
+    try:
+        which_result = shutil.which(oc_binary, path=os.pathsep.join(paths))
+        if which_result is not None:
+            oc_binary = which_result
+    except AttributeError:
+        for path in paths:
+            if os.path.exists(os.path.join(path, oc_binary)):
+                oc_binary = os.path.join(path, oc_binary)
+                break
+
+    return oc_binary
+
+
+def main():
+    """Module that executes commands on a remote OpenShift cluster"""
+
+    module = AnsibleModule(
+        argument_spec=dict(
+            namespace=dict(type="str", required=True),
+            config_file=dict(type="str", required=True),
+            cmd=dict(type="str", required=True),
+            extra_args=dict(type="list", default=[]),
+        ),
+    )
+
+    cmd = [
+        locate_oc_binary(),
+        '--config', module.params["config_file"],
+        '-n', module.params["namespace"],
+    ] + shlex.split(module.params["cmd"])
+
+    failed = True
+    try:
+        cmd_result = subprocess.check_output(list(cmd), stderr=subprocess.STDOUT)
+        failed = False
+    except subprocess.CalledProcessError as exc:
+        cmd_result = '[rc {}] {}\n{}'.format(exc.returncode, ' '.join(exc.cmd), exc.output)
+    except OSError as exc:
+        # we get this when 'oc' is not there
+        cmd_result = str(exc)
+
+    module.exit_json(
+        changed=False,
+        failed=failed,
+        result=cmd_result,
+    )
+
+
+if __name__ == '__main__':
+    main()
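For context, the checks below drive this module through the `execute_module` callable they already receive. A minimal sketch of the contract (illustrative only, not part of the diff; the kubeconfig path mirrors the one LoggingCheck.exec_oc constructs further down):

    # Hedged illustration of invoking ocutil the way the checks do.
    args = {
        "namespace": "logging",
        "config_file": "/etc/origin/master/admin.kubeconfig",
        "cmd": "get pods -o json",
        "extra_args": [],
    }
    result = execute_module("ocutil", args, task_vars)
    if result.get("failed"):
        # "result" carries oc's captured stdout/stderr, or the OSError text
        # when the oc binary cannot be found at all.
        raise RuntimeError(result["result"])
    pods_json = result["result"]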
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..5c9949ced 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -66,16 +66,26 @@ class OpenShiftCheck(object):
 LOADER_EXCLUDES = (
     "__init__.py",
     "mixins.py",
+    "logging.py",
 )
 
 
-def load_checks():
+def load_checks(path=None, subpkg=""):
     """Dynamically import all check modules for the side effect of registering checks."""
-    return [
-        import_module(__package__ + "." + name[:-3])
-        for name in os.listdir(os.path.dirname(__file__))
-        if name.endswith(".py") and name not in LOADER_EXCLUDES
-    ]
+    if path is None:
+        path = os.path.dirname(__file__)
+
+    modules = []
+
+    for name in os.listdir(path):
+        if os.path.isdir(os.path.join(path, name)):
+            modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+            continue
+
+        if name.endswith(".py") and name not in LOADER_EXCLUDES:
+            modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+    return modules
 
 
 def get_var(task_vars, *keys, **kwargs):
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..c9fc59896
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,61 @@
+"""
+Module for performing checks on a Curator logging deployment
+"""
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Curator(LoggingCheck):
+    """Module that checks an integrated logging Curator deployment"""
+
+    name = "curator"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        curator_pods, error = super(Curator, self).get_pods_for_component(
+            self.module_executor,
+            self.logging_namespace,
+            "curator",
+            task_vars
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_curator(curator_pods)
+
+        if check_error:
+            msg = ("The following Curator deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+    def check_curator(self, pods):
+        """Check to see if curator is up and working. Returns: error string"""
+        if not pods:
+            return (
+                "There are no Curator pods for the logging stack,\n"
+                "so nothing will prune Elasticsearch indexes.\n"
+                "Is Curator correctly deployed?"
+            )
+
+        not_running = super(Curator, self).not_running_pods(pods)
+        if len(not_running) == len(pods):
+            return (
+                "The Curator pod is not currently in a running state,\n"
+                "so Elasticsearch indexes may increase without bound."
+            )
+        if len(pods) - len(not_running) > 1:
+            return (
+                "There is more than one Curator pod running. This should not normally happen.\n"
+                "Although this doesn't cause any problems, you may want to investigate."
+            )
+
+        return None
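The loader change above makes check discovery recursive, so the new logging/ subpackage is picked up automatically; logging.py joins LOADER_EXCLUDES because it holds the abstract base class rather than a concrete check. A rough sketch of what the recursion yields with this layout (assuming the package is importable, e.g. from the role's library path):

    from openshift_checks import load_checks

    modules = load_checks()
    # Top level:    openshift_checks.<name> for each non-excluded .py file
    # One level in: openshift_checks.logging.<name>, i.e. the curator,
    #               elasticsearch, fluentd, and kibana checks added below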
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..01cb35b81
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,217 @@
+"""
+Module for performing checks on an Elasticsearch logging deployment
+"""
+
+import json
+import re
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+    """Module that checks an integrated logging Elasticsearch deployment"""
+
+    name = "elasticsearch"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "es",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_elasticsearch(es_pods, task_vars)
+
+        if check_error:
+            msg = ("The following Elasticsearch deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
+
+    def _not_running_elasticsearch_pods(self, es_pods):
+        """Returns: list of running pods, list of errors about non-running pods"""
+        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+        if not_running:
+            return not_running, [(
+                'The following Elasticsearch pods are not running:\n'
+                '{pods}'
+                'These pods will not aggregate logs from their nodes.'
+            ).format(pods=''.join(
+                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                for pod in not_running
+            ))]
+        return not_running, []
+
+    def check_elasticsearch(self, es_pods, task_vars):
+        """Various checks for elasticsearch. Returns: error string"""
+        not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
+        running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+        pods_by_name = {
+            pod['metadata']['name']: pod for pod in running_pods
+            # Filter out pods that are not members of a DC
+            if pod['metadata'].get('labels', {}).get('deploymentconfig')
+        }
+        if not pods_by_name:
+            return 'No logging Elasticsearch pods were found. Is logging deployed?'
+        error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
+        error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
+        error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
+        error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
+        return '\n'.join(error_msgs)
+
+    @staticmethod
+    def _build_es_curl_cmd(pod_name, url):
+        base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+        return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+    def _check_elasticsearch_masters(self, pods_by_name, task_vars):
+        """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+        es_master_names = set()
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            # Compare what each ES node reports as master and compare for split brain
+            get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+            master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+            master_names = (master_name_str or '').split(' ')
+            if len(master_names) > 1:
+                es_master_names.add(master_names[1])
+            else:
+                error_msgs.append(
+                    'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+                    '  {response}'.format(pod=pod_name, response=master_name_str)
+                )
+
+        if not es_master_names:
+            error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
+            return '\n'.join(error_msgs)
+
+        if len(es_master_names) > 1:
+            error_msgs.append(
+                'Found multiple Elasticsearch masters according to the pods:\n'
+                '{master_list}\n'
+                'This implies that the masters have "split brain" and are not correctly\n'
+                'replicating data for the logging cluster. Log loss is likely to occur.'
+                .format(master_list='\n'.join('  ' + master for master in es_master_names))
+            )
+
+        return error_msgs
+
+    def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
+        """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+
+        if not pods_by_name:
+            return ['No logging Elasticsearch masters were found. Is logging deployed?']
+
+        # get ES cluster nodes
+        node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+        cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+        try:
+            cluster_nodes = json.loads(cluster_node_data)['nodes']
+        except (ValueError, KeyError):
+            return [
+                'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+                cluster_node_data
+            ]
+
+        # Try to match all ES-reported node hosts to known pods.
+        error_msgs = []
+        for node in cluster_nodes.values():
+            # Note that with 1.4/3.4 the pod IP may be used as the master name
+            if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+                       for pod_name, pod in pods_by_name.items()):
+                error_msgs.append(
+                    'The Elasticsearch cluster reports a member node "{node}"\n'
+                    'that does not correspond to any known ES pod.'.format(node=node['host'])
+                )
+
+        return error_msgs
+
+    def _check_es_cluster_health(self, pods_by_name, task_vars):
+        """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+            cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+            try:
+                health_res = json.loads(cluster_health_data)
+                if not health_res or not health_res.get('status'):
+                    raise ValueError()
+            except ValueError:
+                error_msgs.append(
+                    'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+                    'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+                )
+                continue
+
+            if health_res['status'] not in ['green', 'yellow']:
+                error_msgs.append(
+                    'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+                )
+
+        return error_msgs
+
+    def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+        """
+        Exec into an ES pod and query the diskspace on the persistent volume.
+        Returns: list of errors
+        """
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+            disk_output = self._exec_oc(df_cmd, [], task_vars)
+            lines = disk_output.splitlines()
+            # expecting one header looking like 'IUse% Use%' and one body line
+            body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+            if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+                error_msgs.append(
+                    'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+                    'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+                )
+                continue
+            inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+            inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+            if int(inode_pct) >= int(inode_pct_thresh):
+                error_msgs.append(
+                    'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(inode_pct),
+                        limit=str(inode_pct_thresh),
+                        param='openshift_check_efk_es_inode_pct',
+                    ))
+            disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+            if int(disk_pct) >= int(disk_pct_thresh):
+                error_msgs.append(
+                    'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(disk_pct),
+                        limit=str(disk_pct_thresh),
+                        param='openshift_check_efk_es_storage_pct',
+                    ))
+
+        return error_msgs
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Elasticsearch, self).exec_oc(
+            self.execute_module,
+            self.logging_namespace,
+            cmd_str,
+            extra_args,
+            task_vars,
+        )
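The `df` parsing above is compact, so here is a standalone check of the regex against the exact output shape the check expects (the sample is the same one the tests below use):

    import re

    body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
    disk_output = 'IUse% Use%\n 3%  4%\n'   # header line, then one body line
    lines = disk_output.splitlines()
    inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
    assert (inode_pct, disk_pct) == ('3', '4')  # values compared to the thresholds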
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..627567293
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,170 @@
+"""
+Module for performing checks on a Fluentd logging deployment
+"""
+
+import json
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+    """Module that checks an integrated logging Fluentd deployment"""
+    name = "fluentd"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "fluentd",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_fluentd(fluentd_pods, task_vars)
+
+        if check_error:
+            msg = ("The following Fluentd deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+
+    @staticmethod
+    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+        label, value = node_selector.split('=', 1)
+        fluentd_nodes = {
+            name: node for name, node in nodes_by_name.items()
+            if node['metadata']['labels'].get(label) == value
+        }
+        if not fluentd_nodes:
+            return None, (
+                'There are no nodes with the fluentd label {label}.\n'
+                'This means no logs will be aggregated from the nodes.'
+            ).format(label=node_selector)
+        return fluentd_nodes, None
+
+    @staticmethod
+    def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
+        """Note if nodes are not labeled as expected. Returns: error string"""
+        intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+        if not intended_nodes or '--all' in intended_nodes:
+            intended_nodes = nodes_by_name.keys()
+        nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+        if nodes_missing_labels:
+            return (
+                'The following nodes are supposed to be labeled with {label} but are not:\n'
+                '  {nodes}\n'
+                'Fluentd will not aggregate logs from these nodes.'
+            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
+        return None
+
+    @staticmethod
+    def _check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+        unmatched_nodes = fluentd_nodes.copy()
+        node_names_by_label = {
+            node['metadata']['labels']['kubernetes.io/hostname']: name
+            for name, node in fluentd_nodes.items()
+        }
+        node_names_by_internal_ip = {
+            address['address']: name
+            for name, node in fluentd_nodes.items()
+            for address in node['status']['addresses']
+            if address['type'] == "InternalIP"
+        }
+        for pod in pods:
+            for name in [
+                    pod['spec']['nodeName'],
+                    node_names_by_internal_ip.get(pod['spec']['nodeName']),
+                    node_names_by_label.get(pod.get('spec', {}).get('host')),
+            ]:
+                unmatched_nodes.pop(name, None)
+        if unmatched_nodes:
+            return (
+                'The following nodes are supposed to have a Fluentd pod but do not:\n'
+                '{nodes}'
+                'These nodes will not have their logs aggregated.'
+            ).format(nodes=''.join(
+                "  {}\n".format(name)
+                for name in unmatched_nodes.keys()
+            ))
+        return None
+
+    def _check_fluentd_pods_running(self, pods):
+        """Make sure all fluentd pods are running. Returns: error string"""
+        not_running = super(Fluentd, self).not_running_pods(pods)
+        if not_running:
+            return (
+                'The following Fluentd pods are supposed to be running but are not:\n'
+                '{pods}'
+                'These pods will not aggregate logs from their nodes.'
+            ).format(pods=''.join(
+                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                for pod in not_running
+            ))
+        return None
+
+    def check_fluentd(self, pods, task_vars):
+        """Verify fluentd is running everywhere. Returns: error string"""
+
+        node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
+                                default='logging-infra-fluentd=true')
+
+        nodes_by_name, error = self.get_nodes_by_name(task_vars)
+
+        if error:
+            return error
+        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+        if error:
+            return error
+
+        error_msgs = []
+        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
+        if error:
+            error_msgs.append(error)
+        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
+        if error:
+            error_msgs.append(error)
+        error = self._check_fluentd_pods_running(pods)
+        if error:
+            error_msgs.append(error)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            error_msgs.append(
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            )
+
+        return '\n'.join(error_msgs)
+
+    def get_nodes_by_name(self, task_vars):
+        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
+        nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
+        if not nodes or not nodes.get('items'):  # also should not happen
+            return None, "No nodes appear to be defined according to the API."
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }, None
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Fluentd, self).exec_oc(self.execute_module,
+                                            self.logging_namespace,
+                                            cmd_str,
+                                            extra_args,
+                                            task_vars)
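The node selector handling in `_filter_fluentd_labeled_nodes` splits the selector once on '='; a standalone sketch with fabricated node objects (not from the diff) shows the filtering it performs:

    nodes_by_name = {
        "node1": {"metadata": {"labels": {"logging-infra-fluentd": "true"}}},
        "node2": {"metadata": {"labels": {}}},  # unlabeled, so filtered out
    }
    label, value = "logging-infra-fluentd=true".split("=", 1)
    fluentd_nodes = {
        name: node for name, node in nodes_by_name.items()
        if node["metadata"]["labels"].get(label) == value
    }
    assert sorted(fluentd_nodes) == ["node1"]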
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..442f407b1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,229 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+    from urllib2 import HTTPError, URLError
+    import urllib2
+except ImportError:
+    from urllib.error import HTTPError, URLError
+    import urllib.request as urllib2
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Kibana(LoggingCheck):
+    """Module that checks an integrated logging Kibana deployment"""
+
+    name = "kibana"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        kibana_pods, error = super(Kibana, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "kibana",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_kibana(kibana_pods)
+
+        if not check_error:
+            check_error = self._check_kibana_route(task_vars)
+
+        if check_error:
+            msg = ("The following Kibana deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+    def _verify_url_internal(self, url, task_vars):
+        """
+        Try to reach a URL from the host.
+        Returns: success (bool), reason (for failure)
+        """
+        args = dict(
+            url=url,
+            follow_redirects='none',
+            validate_certs='no',  # likely to be signed with internal CA
+            # TODO(lmeyer): give users option to validate certs
+            status_code=302,
+        )
+        result = self.execute_module('uri', args, task_vars)
+        if result.get('failed'):
+            return result['msg']
+        return None
+
+    @staticmethod
+    def _verify_url_external(url):
+        """
+        Try to reach a URL from the ansible control host.
+        Returns: success (bool), reason (for failure)
+        """
+        # This actually checks from the ansible control host, which may or may not
+        # really be "external" to the cluster.
+
+        # Disable SSL cert validation to work around internally signed certs
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False  # or setting CERT_NONE is refused
+        ctx.verify_mode = ssl.CERT_NONE
+
+        # Verify that the url is returning a valid response
+        try:
+            # We only care if the url connects and responds
+            return_code = urllib2.urlopen(url, context=ctx).getcode()
+        except HTTPError as httperr:
+            return httperr.reason
+        except URLError as urlerr:
+            return str(urlerr)
+
+        # there appears to be no way to prevent urlopen from following redirects
+        if return_code != 200:
+            return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+        return None
+
+    def check_kibana(self, pods):
+        """Check to see if Kibana is up and working. Returns: error string."""
+
+        if not pods:
+            return "There are no Kibana pods deployed, so no access to the logging UI."
+
+        not_running = self.not_running_pods(pods)
+        if len(not_running) == len(pods):
+            return "No Kibana pod is in a running state, so there is no access to the logging UI."
+        elif not_running:
+            return (
+                "The following Kibana pods are not currently in a running state:\n"
+                "{pods}"
+                "However at least one is, so service may not be impacted."
+            ).format(pods="".join("  " + pod['metadata']['name'] + "\n" for pod in not_running))
+
+        return None
+
+    def _get_kibana_url(self, task_vars):
+        """
+        Get kibana route or report error.
+        Returns: url (or empty), reason for failure
+        """
+
+        # Get logging url
+        get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+        if not get_route:
+            return None, 'no_route_exists'
+
+        route = json.loads(get_route)
+
+        # check that the route has been accepted by a router
+        ingress = route["status"]["ingress"]
+        # ingress can be null if there is no router, or empty if not routed
+        if not ingress or not ingress[0]:
+            return None, 'route_not_accepted'
+
+        host = route.get("spec", {}).get("host")
+        if not host:
+            return None, 'route_missing_host'
+
+        return 'https://{}/'.format(host), None
+
+    def _check_kibana_route(self, task_vars):
+        """
+        Check to see if kibana route is up and working.
+        Returns: error string
+        """
+        known_errors = dict(
+            no_route_exists=(
+                'No route is defined for Kibana in the logging namespace,\n'
+                'so the logging stack is not accessible. Is logging deployed?\n'
+                'Did something remove the logging-kibana route?'
+            ),
+            route_not_accepted=(
+                'The logging-kibana route is not being routed by any router.\n'
+                'Is the router deployed and working?'
+            ),
+            route_missing_host=(
+                'The logging-kibana route has no hostname defined,\n'
+                'which should never happen. Did something alter its definition?'
+            ),
+        )
+
+        kibana_url, error = self._get_kibana_url(task_vars)
+        if not kibana_url:
+            return known_errors.get(error, error)
+
+        # first, check that kibana is reachable from the master.
+        error = self._verify_url_internal(kibana_url, task_vars)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                error = (
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'Is kibana running, and is at least one router routing to it?'
+                ).format(url=kibana_url)
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                error = (
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'because the hostname does not resolve.\n'
+                    'Is DNS configured for the Kibana hostname?'
+                ).format(url=kibana_url)
+            elif 'Status code was not' in error:
+                error = (
+                    'A request from this master to the Kibana URL {url}\n'
+                    'did not return the correct status code (302).\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues. The output was:\n'
+                    '  {error}'
+                ).format(url=kibana_url, error=error)
+            return 'Error validating the logging Kibana route:\n' + error
+
+        # in production we would like the kibana route to work from outside the
+        # cluster too; but that may not be the case, so allow disabling just this part.
+        if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
+            return None
+        error = self._verify_url_external(kibana_url)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                error = (
+                    'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+                    'Is the router for the Kibana hostname exposed externally?'
+                ).format(url=kibana_url)
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                error = (
+                    'Failed to resolve the Kibana hostname in {url}\n'
+                    'from the Ansible control host.\n'
+                    'Is DNS configured to resolve this Kibana hostname externally?'
+                ).format(url=kibana_url)
+            elif 'Expected success (200)' in error:
+                error = (
+                    'A request to Kibana at {url}\n'
+                    'returned the wrong error code:\n'
+                    '  {error}\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues.'
+                ).format(url=kibana_url, error=error)
+            error = (
+                'Error validating the logging Kibana route:\n{error}\n'
+                'To disable external Kibana route validation, set in your inventory:\n'
+                '  openshift_check_efk_kibana_external=False'
+            ).format(error=error)
+            return error
+        return None
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Kibana, self).exec_oc(self.execute_module,
+                                           self.logging_namespace,
+                                           cmd_str,
+                                           extra_args,
+                                           task_vars)
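`_get_kibana_url` only needs the route's ingress status and host. A sketch of the minimal route object it accepts (the hostname is fabricated; compare the route fixtures in kibana_test.py below):

    route = {
        "status": {"ingress": [{"status": True}]},  # accepted by a router
        "spec": {"host": "kibana.example.com"},     # hypothetical hostname
    }
    host = route.get("spec", {}).get("host")
    kibana_url = "https://{}/".format(host)
    assert kibana_url == "https://kibana.example.com/"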
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05b4d300c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,96 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class LoggingCheck(OpenShiftCheck):
+    """Base class for logging component checks"""
+
+    name = "logging"
+
+    @classmethod
+    def is_active(cls, task_vars):
+        return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+
+    @staticmethod
+    def is_first_master(task_vars):
+        """Run only on first master and only when logging is configured. Returns: bool"""
+        logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+        # Note: It would be nice to use membership in oo_first_master group, however for now it
+        # seems best to avoid requiring that setup and just check this is the first master.
+        hostname = get_var(task_vars, "ansible_ssh_host") or [None]
+        masters = get_var(task_vars, "groups", "masters", default=None) or [None]
+        return logging_deployed and masters[0] == hostname
+
+    def run(self, tmp, task_vars):
+        pass
+
+    def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
+        """Get all pods for a given component. Returns: list of pods for component, error string"""
+        pod_output = self.exec_oc(
+            execute_module,
+            namespace,
+            "get pods -l component={} -o json".format(logging_component),
+            [],
+            task_vars
+        )
+        try:
+            pods = json.loads(pod_output)
+            if not pods or not pods.get('items'):
+                raise ValueError()
+        except ValueError:
+            # successful run but non-parsing data generally means there were no pods in the namespace
+            return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+
+        return pods['items'], None
+
+    @staticmethod
+    def not_running_pods(pods):
+        """Returns: list of pods not in a ready and running state"""
+        return [
+            pod for pod in pods
+            if any(
+                container['ready'] is False
+                for container in pod['status']['containerStatuses']
+            ) or not any(
+                condition['type'] == 'Ready' and condition['status'] == 'True'
+                for condition in pod['status']['conditions']
+            )
+        ]
+
+    @staticmethod
+    def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+        """
+        Execute an 'oc' command in the remote host.
+        Returns: output of command and namespace,
+        or raises OpenShiftCheckException on error
+        """
+        config_base = get_var(task_vars, "openshift", "common", "config_base")
+        args = {
+            "namespace": namespace,
+            "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+            "cmd": cmd_str,
+            "extra_args": list(extra_args) if extra_args else [],
+        }
+
+        result = execute_module("ocutil", args, task_vars)
+        if result.get("failed"):
+            msg = (
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'
+            ).format(cmd=args['cmd'], error=result['result'])
+
+            if result['result'] == '[Errno 2] No such file or directory':
+                msg = (
+                    "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+                    "Has an installation been run on this host yet?"
+                )
+            raise OpenShiftCheckException(msg)
+
+        return result.get("result", "")
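LoggingCheck gives every component check the same skeleton: gate on the first master, fetch pods by component label, then apply component-specific validation. A minimal sketch of a hypothetical new component check (the "widget" component name is invented for illustration):

    from openshift_checks import get_var
    from openshift_checks.logging.logging import LoggingCheck


    class Widget(LoggingCheck):
        """Hypothetical check for a 'widget' logging component."""
        name = "widget"
        tags = ["health", "logging"]

        def run(self, tmp, task_vars):
            namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
            pods, error = self.get_pods_for_component(
                self.execute_module, namespace, "widget", task_vars)
            if error:
                return {"failed": True, "changed": False, "msg": error}
            # component-specific validation of `pods` would go here
            return {"failed": False, "changed": False, "msg": "No problems found."}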
diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py
new file mode 100644
index 000000000..ae108c96e
--- /dev/null
+++ b/roles/openshift_health_checker/test/curator_test.py
@@ -0,0 +1,68 @@
+import pytest
+
+from openshift_checks.logging.curator import Curator
+
+
+def canned_curator(exec_oc=None):
+    """Create a Curator check object with canned exec_oc method"""
+    check = Curator("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+plain_curator_pod = {
+    "metadata": {
+        "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+        "name": "logging-curator-1",
+    },
+    "status": {
+        "containerStatuses": [{"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "podIP": "10.10.10.10",
+    }
+}
+
+not_running_curator_pod = {
+    "metadata": {
+        "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+        "name": "logging-curator-2",
+    },
+    "status": {
+        "containerStatuses": [{"ready": False}],
+        "conditions": [{"status": "False", "type": "Ready"}],
+        "podIP": "10.10.10.10",
+    }
+}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+    (
+        [],
+        "no Curator pods",
+    ),
+    (
+        [plain_curator_pod],
+        None,
+    ),
+    (
+        [not_running_curator_pod],
+        "not currently in a running state",
+    ),
+    (
+        [plain_curator_pod, plain_curator_pod],
+        "more than one Curator pod",
+    ),
+])
+def test_get_curator_pods(pods, expect_error):
+    check = canned_curator()
+    error = check.check_curator(pods)
+    assert_error(error, expect_error)
diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py
new file mode 100644
index 000000000..b9d375d8c
--- /dev/null
+++ b/roles/openshift_health_checker/test/elasticsearch_test.py
@@ -0,0 +1,180 @@
+import pytest
+import json
+
+from openshift_checks.logging.elasticsearch import Elasticsearch
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+def canned_elasticsearch(exec_oc=None):
+    """Create an Elasticsearch check object with canned exec_oc method"""
+    check = Elasticsearch("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+plain_es_pod = {
+    "metadata": {
+        "labels": {"component": "es", "deploymentconfig": "logging-es"},
+        "name": "logging-es",
+    },
+    "status": {
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "containerStatuses": [{"ready": True}],
+        "podIP": "10.10.10.10",
+    },
+    "_test_master_name_str": "name logging-es",
+}
+
+split_es_pod = {
+    "metadata": {
+        "labels": {"component": "es", "deploymentconfig": "logging-es-2"},
+        "name": "logging-es-2",
+    },
+    "status": {
+        "conditions": [{"status": "True", "type": "Ready"}],
+        "containerStatuses": [{"ready": True}],
+        "podIP": "10.10.10.10",
+    },
+    "_test_master_name_str": "name logging-es-2",
+}
+
+
+def test_check_elasticsearch():
+    assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([], {})
+
+    # canned oc responses to match so all the checks pass
+    def _exec_oc(cmd, args, task_vars):
+        if '_cat/master' in cmd:
+            return 'name logging-es'
+        elif '/_nodes' in cmd:
+            return json.dumps(es_node_list)
+        elif '_cluster/health' in cmd:
+            return '{"status": "green"}'
+        elif ' df ' in cmd:
+            return 'IUse% Use%\n 3%  4%\n'
+        else:
+            raise Exception(cmd)
+
+    assert not canned_elasticsearch(_exec_oc).check_elasticsearch([plain_es_pod], {})
+
+
+def pods_by_name(pods):
+    return {pod['metadata']['name']: pod for pod in pods}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+    (
+        [],
+        'No logging Elasticsearch masters',
+    ),
+    (
+        [plain_es_pod],
+        None,
+    ),
+    (
+        [plain_es_pod, split_es_pod],
+        'Found multiple Elasticsearch masters',
+    ),
+])
+def test_check_elasticsearch_masters(pods, expect_error):
+    test_pods = list(pods)
+    check = canned_elasticsearch(lambda cmd, args, task_vars: test_pods.pop(0)['_test_master_name_str'])
+
+    errors = check._check_elasticsearch_masters(pods_by_name(pods), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
+
+
+es_node_list = {
+    'nodes': {
+        'random-es-name': {
+            'host': 'logging-es',
+        }}}
+
+
+@pytest.mark.parametrize('pods, node_list, expect_error', [
+    (
+        [],
+        {},
+        'No logging Elasticsearch masters',
+    ),
+    (
+        [plain_es_pod],
+        es_node_list,
+        None,
+    ),
+    (
+        [plain_es_pod],
+        {},  # empty list of nodes triggers KeyError
+        "Failed to query",
+    ),
+    (
+        [split_es_pod],
+        es_node_list,
+        'does not correspond to any known ES pod',
+    ),
+])
+def test_check_elasticsearch_node_list(pods, node_list, expect_error):
+    check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(node_list))
+
+    errors = check._check_elasticsearch_node_list(pods_by_name(pods), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
+
+
+@pytest.mark.parametrize('pods, health_data, expect_error', [
+    (
+        [plain_es_pod],
+        [{"status": "green"}],
+        None,
+    ),
+    (
+        [plain_es_pod],
+        [{"no-status": "should bomb"}],
+        'Could not retrieve cluster health status',
+    ),
+    (
+        [plain_es_pod, split_es_pod],
+        [{"status": "green"}, {"status": "red"}],
+        'Elasticsearch cluster health status is RED',
+    ),
+])
+def test_check_elasticsearch_cluster_health(pods, health_data, expect_error):
+    test_health_data = list(health_data)
+    check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(test_health_data.pop(0)))
+
+    errors = check._check_es_cluster_health(pods_by_name(pods), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
+
+
+@pytest.mark.parametrize('disk_data, expect_error', [
+    (
+        'df: /elasticsearch/persistent: No such file or directory\n',
+        'Could not retrieve storage usage',
+    ),
+    (
+        'IUse% Use%\n 3%  4%\n',
+        None,
+    ),
+    (
+        'IUse% Use%\n 95%  40%\n',
+        'Inode percent usage on the storage volume',
+    ),
+    (
+        'IUse% Use%\n 3%  94%\n',
+        'Disk percent usage on the storage volume',
+    ),
+])
+def test_check_elasticsearch_diskspace(disk_data, expect_error):
+    check = canned_elasticsearch(lambda cmd, args, task_vars: disk_data)
+
+    errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod]), task_vars_config_base)
+    assert_error(''.join(errors), expect_error)
diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py
new file mode 100644
index 000000000..d151c0b19
--- /dev/null
+++ b/roles/openshift_health_checker/test/fluentd_test.py
@@ -0,0 +1,109 @@
+import pytest
+import json
+
+from openshift_checks.logging.fluentd import Fluentd
+
+
+def canned_fluentd(exec_oc=None):
+    """Create a Fluentd check object with canned exec_oc method"""
+    check = Fluentd("dummy")  # fails if a module is actually invoked
+    if exec_oc:
+        check._exec_oc = exec_oc
+    return check
+
+
+def assert_error(error, expect_error):
+    if expect_error:
+        assert error
+        assert expect_error in error
+    else:
+        assert not error
+
+
+fluentd_pod_node1 = {
+    "metadata": {
+        "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+        "name": "logging-fluentd-1",
+    },
+    "spec": {"host": "node1", "nodeName": "node1"},
+    "status": {
+        "containerStatuses": [{"ready": True}],
+        "conditions": [{"status": "True", "type": "Ready"}],
+    }
+}
+fluentd_pod_node2_down = {
+    "metadata": {
+        "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+        "name": "logging-fluentd-2",
+    },
+    "spec": {"host": "node2", "nodeName": "node2"},
+    "status": {
+        "containerStatuses": [{"ready": False}],
+        "conditions": [{"status": "False", "type": "Ready"}],
+    }
+}
+fluentd_node1 = {
+    "metadata": {
+        "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "node1"},
+        "name": "node1",
+    },
+    "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.1"}]},
+}
+fluentd_node2 = {
+    "metadata": {
+        "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "hostname"},
+        "name": "node2",
+    },
+    "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.2"}]},
+}
+fluentd_node3_unlabeled = {
+    "metadata": {
+        "labels": {"kubernetes.io/hostname": "hostname"},
+        "name": "node3",
+    },
+    "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.3"}]},
+}
+
+
+@pytest.mark.parametrize('pods, nodes, expect_error', [
+    (
+        [],
+        [],
+        'No nodes appear to be defined',
+    ),
+    (
+        [],
+        [fluentd_node3_unlabeled],
+        'There are no nodes with the fluentd label',
+    ),
+    (
+        [],
+        [fluentd_node1, fluentd_node3_unlabeled],
+        'Fluentd will not aggregate logs from these nodes.',
+    ),
+    (
+        [],
+        [fluentd_node2],
+        "nodes are supposed to have a Fluentd pod but do not",
+    ),
+    (
+        [fluentd_pod_node1, fluentd_pod_node1],
+        [fluentd_node1],
+        'more Fluentd pods running than nodes labeled',
+    ),
+    (
+        [fluentd_pod_node2_down],
+        [fluentd_node2],
+        "Fluentd pods are supposed to be running",
+    ),
+    (
+        [fluentd_pod_node1],
+        [fluentd_node1],
+        None,
+    ),
+])
+def test_get_fluentd_pods(pods, nodes, expect_error):
+    check = canned_fluentd(lambda cmd, args, task_vars: json.dumps(dict(items=nodes)))
+
+    error = check.check_fluentd(pods, {})
+    assert_error(error, expect_error)
"deploymentconfig": "logging-kibana"}, +                "name": "logging-kibana", +            }, +            "status": { +                "ingress": [{ +                    "status": True, +                }], +            }, +            "spec": { +                "host": "hostname", +            }, +        }, +        "https://hostname/", +        None, +    ), +]) +def test_get_kibana_url(route, expect_url, expect_error): +    check = canned_kibana(lambda cmd, args, task_vars: json.dumps(route) if route else "") + +    url, error = check._get_kibana_url({}) +    if expect_url: +        assert url == expect_url +    else: +        assert not url +    if expect_error: +        assert error == expect_error +    else: +        assert not error + + +@pytest.mark.parametrize('exec_result, expect', [ +    ( +        'urlopen error [Errno 111] Connection refused', +        'at least one router routing to it?', +    ), +    ( +        'urlopen error [Errno -2] Name or service not known', +        'DNS configured for the Kibana hostname?', +    ), +    ( +        'Status code was not [302]: HTTP Error 500: Server error', +        'did not return the correct status code', +    ), +    ( +        'bork bork bork', +        'bork bork bork',  # should pass through +    ), +]) +def test_verify_url_internal_failure(exec_result, expect): +    check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result)) +    check._get_kibana_url = lambda task_vars: ('url', None) + +    error = check._check_kibana_route({}) +    assert_error(error, expect) + + +@pytest.mark.parametrize('lib_result, expect', [ +    ( +        HTTPError('url', 500, "it broke", hdrs=None, fp=None), +        'it broke', +    ), +    ( +        URLError('it broke'), +        'it broke', +    ), +    ( +        302, +        'returned the wrong error code', +    ), +    ( +        200, +        None, +    ), +]) +def test_verify_url_external_failure(lib_result, expect, monkeypatch): + +    class _http_return: + +        def __init__(self, code): +            self.code = code + +        def getcode(self): +            return self.code + +    def urlopen(url, context): +        if type(lib_result) is int: +            return _http_return(lib_result) +        raise lib_result +    monkeypatch.setattr(urllib2, 'urlopen', urlopen) + +    check = canned_kibana() +    check._get_kibana_url = lambda task_vars: ('url', None) +    check._verify_url_internal = lambda url, task_vars: None + +    error = check._check_kibana_route({}) +    assert_error(error, expect) diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py new file mode 100644 index 000000000..b6db34fe3 --- /dev/null +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -0,0 +1,137 @@ +import pytest +import json + +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException + +task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) + + +logging_namespace = "logging" + + +def canned_loggingcheck(exec_oc=None): +    """Create a LoggingCheck object with canned exec_oc method""" +    check = LoggingCheck("dummy")  # fails if a module is actually invoked +    check.logging_namespace = 'logging' +    if exec_oc: +        check.exec_oc = exec_oc +    return check + + +def assert_error(error, expect_error): +    if expect_error: +        assert error +        assert expect_error in error +    else: +        assert 
not error + + +plain_es_pod = { +    "metadata": { +        "labels": {"component": "es", "deploymentconfig": "logging-es"}, +        "name": "logging-es", +    }, +    "status": { +        "conditions": [{"status": "True", "type": "Ready"}], +        "containerStatuses": [{"ready": True}], +        "podIP": "10.10.10.10", +    }, +    "_test_master_name_str": "name logging-es", +} + +plain_kibana_pod = { +    "metadata": { +        "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, +        "name": "logging-kibana-1", +    }, +    "status": { +        "containerStatuses": [{"ready": True}, {"ready": True}], +        "conditions": [{"status": "True", "type": "Ready"}], +    } +} + +fluentd_pod_node1 = { +    "metadata": { +        "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"}, +        "name": "logging-fluentd-1", +    }, +    "spec": {"host": "node1", "nodeName": "node1"}, +    "status": { +        "containerStatuses": [{"ready": True}], +        "conditions": [{"status": "True", "type": "Ready"}], +    } +} + +plain_curator_pod = { +    "metadata": { +        "labels": {"component": "curator", "deploymentconfig": "logging-curator"}, +        "name": "logging-curator-1", +    }, +    "status": { +        "containerStatuses": [{"ready": True}], +        "conditions": [{"status": "True", "type": "Ready"}], +        "podIP": "10.10.10.10", +    } +} + + +@pytest.mark.parametrize('problem, expect', [ +    ("[Errno 2] No such file or directory", "supposed to be a master"), +    ("Permission denied", "Unexpected error using `oc`"), +]) +def test_oc_failure(problem, expect): +    def execute_module(module_name, args, task_vars): +        if module_name == "ocutil": +            return dict(failed=True, result=problem) +        return dict(changed=False) + +    check = LoggingCheck({}) + +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check.exec_oc(execute_module, logging_namespace, 'get foo', [], task_vars=task_vars_config_base) +    assert expect in str(excinfo) + + +groups_with_first_master = dict(masters=['this-host', 'other-host']) +groups_with_second_master = dict(masters=['other-host', 'this-host']) +groups_not_a_master = dict(masters=['other-host']) + + +@pytest.mark.parametrize('groups, logging_deployed, is_active', [ +    (groups_with_first_master, True, True), +    (groups_with_first_master, False, False), +    (groups_not_a_master, True, False), +    (groups_with_second_master, True, False), +    (groups_not_a_master, True, False), +]) +def test_is_active(groups, logging_deployed, is_active): +    task_vars = dict( +        ansible_ssh_host='this-host', +        groups=groups, +        openshift_hosted_logging_deploy=logging_deployed, +    ) + +    assert LoggingCheck.is_active(task_vars=task_vars) == is_active + + +@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [ +    ( +        'No resources found.', +        None, +        'There are no pods in the logging namespace', +    ), +    ( +        json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}), +        [plain_es_pod], +        None, +    ), +]) +def test_get_pods_for_component(pod_output, expect_pods, expect_error): +    check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: pod_output) +    pods, error = check.get_pods_for_component( +        lambda name, args, task_vars: {}, +        logging_namespace, +        "es", +        {} +    ) +    assert_error(error, expect_error) 
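Assuming the repository's existing pytest setup for this role, the new tests run like the other unit tests here, e.g. from the repository root:

    pytest roles/openshift_health_checker/test/logging_check_test.py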
