diff options
| -rw-r--r-- | roles/openshift_health_checker/openshift_checks/etcd_volume.py | 58 | ||||
| -rw-r--r-- | roles/openshift_health_checker/test/etcd_volume_test.py | 149 | 
2 files changed, 207 insertions, 0 deletions
# ---------------------------------------------------------------------------
# File: roles/openshift_health_checker/openshift_checks/etcd_volume.py
# ---------------------------------------------------------------------------
"""A health check for OpenShift clusters."""

from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var


class EtcdVolume(OpenShiftCheck):
    """Ensures etcd storage usage does not exceed a given threshold."""

    name = "etcd_volume"
    tags = ["etcd", "health"]

    # Default device usage threshold. Value should be in the range [0, 100].
    default_threshold_percent = 90
    # Where to find etcd data, higher priority first.
    supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]

    @classmethod
    def is_active(cls, task_vars):
        """Return whether this check applies to the current host.

        The host must be listed in the "etcd" group, or in the "masters"
        group when the "etcd" group is empty or undefined, and the parent
        class must also consider the check active.
        """
        etcd_hosts = (
            get_var(task_vars, "groups", "etcd", default=[])
            or get_var(task_vars, "groups", "masters", default=[])
            or []
        )
        is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts
        return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host

    def run(self, tmp, task_vars):
        """Compare etcd storage usage against the configured threshold.

        Returns:
            ``{"failed": True, "msg": ...}`` when the used percentage of the
            selected mount is above the threshold, ``{"changed": False}``
            otherwise.

        Raises:
            OpenShiftCheckException: if no supported mount point is found,
                if the selected mount carries no usable size information
                (missing values or a total size of zero), or if the
                configured threshold is not a number.
        """
        mount_info = self._etcd_mount_info(task_vars)
        device = mount_info.get("device", "unknown")
        mount = mount_info.get("mount", "unknown")

        available = mount_info.get("size_available")
        total = mount_info.get("size_total")
        # A missing or zero total would otherwise surface as a bare KeyError
        # or ZeroDivisionError; report it the same way as other
        # "cannot determine" conditions of this check.
        if available is None or not total:
            msg = (
                "Unable to determine etcd storage usage: size information is "
                "missing or zero. Device: {}, mount: {}."
            ).format(device, mount)
            raise OpenShiftCheckException(msg)

        threshold = self._usage_threshold_percent(task_vars)
        used_percent = 100.0 * (total - available) / total

        if used_percent > threshold:
            msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
                used_percent, threshold, device, mount
            )
            return {"failed": True, "msg": msg}

        return {"changed": False}

    def _usage_threshold_percent(self, task_vars):
        """Return the usage threshold as a float, falling back to the default.

        The variable may arrive as a string (for example from an inventory
        file), so it is converted explicitly before being compared and
        formatted as a number.
        """
        raw_threshold = get_var(
            task_vars,
            "etcd_device_usage_threshold_percent",
            default=self.default_threshold_percent
        )
        try:
            return float(raw_threshold)
        except (TypeError, ValueError):
            msg = "Invalid value for etcd_device_usage_threshold_percent: {!r}. Expected a number.".format(
                raw_threshold
            )
            raise OpenShiftCheckException(msg)

    def _etcd_mount_info(self, task_vars):
        """Return the ansible_mounts entry that holds the etcd data.

        Candidates are tried in the order given by ``supported_mount_paths``;
        the first one present among the mounted paths wins.

        Raises:
            OpenShiftCheckException: if none of the supported paths is mounted.
        """
        ansible_mounts = get_var(task_vars, "ansible_mounts")
        mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}

        for path in self.supported_mount_paths:
            if path in mounts:
                return mounts[path]

        paths = ', '.join(sorted(mounts)) or 'none'
        msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
        raise OpenShiftCheckException(msg)


# ---------------------------------------------------------------------------
# File: roles/openshift_health_checker/test/etcd_volume_test.py
# ---------------------------------------------------------------------------
import pytest

from openshift_checks.etcd_volume import EtcdVolume, OpenShiftCheckException


@pytest.mark.parametrize('ansible_mounts,extra_words', [
    ([], ['none']),  # empty ansible_mounts
    ([{'mount': '/mnt'}], ['/mnt']),  # missing relevant mount paths
])
def test_cannot_determine_available_disk(ansible_mounts, extra_words):
    task_vars = dict(
        ansible_mounts=ansible_mounts,
    )
    check = EtcdVolume(execute_module=fake_execute_module)

    with pytest.raises(OpenShiftCheckException) as excinfo:
        check.run(tmp=None, task_vars=task_vars)

    for word in 'Unable to find etcd storage mount point'.split() + extra_words:
        assert word in str(excinfo.value)


@pytest.mark.parametrize('ansible_mounts', [
    # selected mount carries no size information at all
    [{'mount': '/'}],
    # selected mount reports a total size of zero
    [{'mount': '/', 'size_available': 0, 'size_total': 0}],
])
def test_cannot_determine_usage_without_size_info(ansible_mounts):
    task_vars = dict(
        ansible_mounts=ansible_mounts,
    )
    check = EtcdVolume(execute_module=fake_execute_module)

    with pytest.raises(OpenShiftCheckException) as excinfo:
        check.run(tmp=None, task_vars=task_vars)

    assert 'Unable to determine etcd storage usage' in str(excinfo.value)


@pytest.mark.parametrize('size_limit,ansible_mounts', [
    (
        # if no threshold is specified, expect the usage
        # limit to default to 90% of size_total
        None,
        [{
            'mount': '/',
            'size_available': 40 * 10**9,
            'size_total': 80 * 10**9
        }],
    ),
    (
        1,
        [{
            'mount': '/',
            'size_available': 30 * 10**9,
            'size_total': 30 * 10**9,
        }],
    ),
    (
        20000000000,
        [{
            'mount': '/',
            'size_available': 20 * 10**9,
            'size_total': 40 * 10**9,
        }],
    ),
    (
        5000000000,
        [{
            # '/' is ignored: a higher-priority path is mounted
            'mount': '/',
            'size_available': 0,
            'size_total': 0,
        }, {
            # '/var/lib' is ignored for the same reason ...
            'mount': '/var/lib',
            'size_available': 2 * 10**9,
            'size_total': 21 * 10**9,
        }, {
            # ... only '/var/lib/etcd' is evaluated
            'mount': '/var/lib/etcd',
            'size_available': 36 * 10**9,
            'size_total': 40 * 10**9
        }],
    )
])
def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts):
    task_vars = dict(
        etcd_device_usage_threshold_percent=size_limit,
        ansible_mounts=ansible_mounts,
    )

    if task_vars["etcd_device_usage_threshold_percent"] is None:
        task_vars.pop("etcd_device_usage_threshold_percent")

    check = EtcdVolume(execute_module=fake_execute_module)
    result = check.run(tmp=None, task_vars=task_vars)

    assert not result.get('failed', False)


@pytest.mark.parametrize('size_limit_percent,ansible_mounts,extra_words', [
    (
        # if no threshold is specified, expect the usage
        # limit to default to 90% of size_total
        None,
        [{
            'mount': '/',
            'size_available': 1 * 10**9,
            'size_total': 100 * 10**9,
        }],
        ['99.0%'],
    ),
    (
        70.0,
        [{
            'mount': '/',
            'size_available': 1 * 10**6,
            'size_total': 5 * 10**9,
        }],
        ['100.0%'],
    ),
    (
        40.0,
        [{
            'mount': '/',
            'size_available': 2 * 10**9,
            'size_total': 6 * 10**9,
        }],
        ['66.7%'],
    ),
    (
        None,
        [{
            # '/var' is below the threshold ...
            'mount': '/var',
            'size_available': 20 * 10**9,
            'size_total': 20 * 10**9,
        }, {
            # ... but '/var/lib' has priority and is above it
            'mount': '/var/lib',
            'size_available': 1 * 10**9,
            'size_total': 20 * 10**9,
        }],
        ['95.0%'],
    ),
    (
        # a threshold given as a string is treated as a number
        '50',
        [{
            'mount': '/',
            'size_available': 40 * 10**9,
            'size_total': 100 * 10**9,
        }],
        ['60.0%', '50.0%'],
    ),
])
def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words):
    task_vars = dict(
        etcd_device_usage_threshold_percent=size_limit_percent,
        ansible_mounts=ansible_mounts,
    )

    if task_vars["etcd_device_usage_threshold_percent"] is None:
        task_vars.pop("etcd_device_usage_threshold_percent")

    check = EtcdVolume(execute_module=fake_execute_module)
    result = check.run(tmp=None, task_vars=task_vars)

    assert result['failed']
    for word in extra_words:
        assert word in result['msg']


def test_rejects_non_numeric_threshold():
    task_vars = dict(
        etcd_device_usage_threshold_percent='not-a-number',
        ansible_mounts=[{
            'mount': '/',
            'size_available': 40 * 10**9,
            'size_total': 100 * 10**9,
        }],
    )
    check = EtcdVolume(execute_module=fake_execute_module)

    with pytest.raises(OpenShiftCheckException) as excinfo:
        check.run(tmp=None, task_vars=task_vars)

    assert 'etcd_device_usage_threshold_percent' in str(excinfo.value)


def fake_execute_module(*args):
    raise AssertionError('this function should not be called')
