6 files changed, 676 insertions, 42 deletions
diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py
new file mode 100644
index 000000000..3631f71c8
--- /dev/null
+++ b/roles/openshift_health_checker/library/search_journalctl.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+"""Interface to journalctl."""
+
+from time import time
+import json
+import re
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+class InvalidMatcherRegexp(Exception):
+    """Exception class for invalid matcher regexp."""
+    pass
+
+
+class InvalidLogEntry(Exception):
+    """Exception class for invalid / non-json log entries."""
+    pass
+
+
+class LogInputSubprocessError(Exception):
+    """Exception class for errors that occur while executing a subprocess."""
+    pass
+
+
+def main():
+    """Scan a given list of "log_matchers" for journalctl messages containing given patterns.
+    "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching:
+    'start_regexp', 'regexp', and 'unit'.
+
+    Sample "log_matchers" list:
+
+    [
+      {
+        'start_regexp': r'Beginning of systemd unit',
+        'regexp': r'the specific log message to find',
+        'unit': 'etcd',
+      }
+    ]
+    """
+    module = AnsibleModule(
+        argument_spec=dict(
+            log_count_limit=dict(type="int", default=500),
+            log_matchers=dict(type="list", required=True),
+        ),
+    )
+
+    timestamp_limit_seconds = time() - 60 * 60  # 1 hour
+
+    log_count_limit = module.params["log_count_limit"]
+    log_matchers = module.params["log_matchers"]
+
+    matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds)
+
+    module.exit_json(
+        changed=False,
+        failed=bool(errors),
+        errors=errors,
+        matched=matched_regexp,
+    )
+
+
+def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds):
+    """Return a list of up to log_count_limit matches for each matcher.
+
+    Log entries are only considered if newer than timestamp_limit_seconds.
+    """
+    matched_regexp = []
+    errors = []
+
+    for matcher in matchers:
+        try:
+            log_output = get_log_output(matcher)
+        except LogInputSubprocessError as err:
+            errors.append(str(err))
+            continue
+
+        try:
+            matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds)
+            if matched:
+                matched_regexp.append(matcher.get("regexp", ""))
+        except InvalidMatcherRegexp as err:
+            errors.append(str(err))
+        except InvalidLogEntry as err:
+            errors.append(str(err))
+
+    return matched_regexp, errors
+
+
+def get_log_output(matcher):
+    """Return an iterator on the logs of a given matcher."""
+    try:
+        cmd_output = subprocess.Popen(list([
+            '/bin/journalctl',
+            '-ru', matcher.get("unit", ""),
+            '--output', 'json',
+        ]), stdout=subprocess.PIPE)
+
+        return iter(cmd_output.stdout.readline, '')
+
+    except subprocess.CalledProcessError as exc:
+        msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}"
+        raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc)))
+    except OSError as exc:
+        raise LogInputSubprocessError(str(exc))
+
+
+def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds):
+    """Return log messages matched in iterable log_output by a given matcher.
+
+    Ignore any log_output items older than timestamp_limit_seconds.
+    """
+    try:
+        regexp = re.compile(matcher.get("regexp", ""))
+        start_regexp = re.compile(matcher.get("start_regexp", ""))
+    except re.error as err:
+        msg = "A log matcher object was provided with an invalid regular expression: {}"
+        raise InvalidMatcherRegexp(msg.format(str(err)))
+
+    matched = None
+
+    for log_count, line in enumerate(log_output):
+        if log_count >= log_count_limit:
+            break
+
+        try:
+            obj = json.loads(line)
+
+            # don't need to look past the most recent service restart
+            if start_regexp.match(obj["MESSAGE"]):
+                break
+
+            log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000
+            if log_timestamp_seconds < timestamp_limit_seconds:
+                break
+
+            if regexp.match(obj["MESSAGE"]):
+                matched = line
+                break
+
+        except ValueError:
+            msg = "Log entry for systemd unit {} contained invalid json syntax: {}"
+            raise InvalidLogEntry(msg.format(matcher.get("unit"), line))
+
+    return matched
+
+
+if __name__ == '__main__':
+    main()
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index e80691ef3..d2227d244 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,5 +1,6 @@
 """Check Docker storage driver and usage."""
 import json
+import os.path
 import re
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
 from openshift_checks.mixins import DockerHostMixin
@@ -20,10 +21,27 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
     storage_drivers = ["devicemapper", "overlay", "overlay2"]
     max_thinpool_data_usage_percent = 90.0
     max_thinpool_meta_usage_percent = 90.0
+    max_overlay_usage_percent = 90.0
+
+    # TODO(lmeyer): mention these in the output when check fails
+    configuration_variables = [
+        (
+            "max_thinpool_data_usage_percent",
+            "For 'devicemapper' storage driver, usage threshold percentage for data. "
+            "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+        ),
+        (
+            "max_thinpool_meta_usage_percent",
+            "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+            "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+        ),
+        (
+            "max_overlay_usage_percent",
+            "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+            "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+        ),
+    ]
 
-    # pylint: disable=too-many-return-statements
-    # Reason: permanent stylistic exception;
-    #         it is clearer to return on failures and there are just many ways to fail here.
     def run(self, tmp, task_vars):
         msg, failed, changed = self.ensure_dependencies(task_vars)
         if failed:
@@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             }
 
         # attempt to get the docker info hash from the API
-        info = self.execute_module("docker_info", {}, task_vars=task_vars)
-        if info.get("failed"):
+        docker_info = self.execute_module("docker_info", {}, task_vars=task_vars)
+        if docker_info.get("failed"):
             return {"failed": True, "changed": changed,
                     "msg": "Failed to query Docker API. Is docker running on this host?"}
-        if not info.get("info"):  # this would be very strange
+        if not docker_info.get("info"):  # this would be very strange
             return {"failed": True, "changed": changed,
-                    "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
-        info = info["info"]
+                    "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+        docker_info = docker_info["info"]
 
         # check if the storage driver we saw is valid
-        driver = info.get("Driver", "[NONE]")
+        driver = docker_info.get("Driver", "[NONE]")
         if driver not in self.storage_drivers:
             msg = (
                 "Detected unsupported Docker storage driver '{driver}'.\n"
@@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             return {"failed": True, "changed": changed, "msg": msg}
 
         # driver status info is a list of tuples; convert to dict and validate based on driver
-        driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+        driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+        result = {}
+
         if driver == "devicemapper":
-            if driver_status.get("Data loop file"):
-                msg = (
-                    "Use of loopback devices with the Docker devicemapper storage driver\n"
-                    "(the default storage configuration) is unsupported in production.\n"
-                    "Please use docker-storage-setup to configure a backing storage volume.\n"
-                    "See http://red.ht/2rNperO for further information."
-                )
-                return {"failed": True, "changed": changed, "msg": msg}
-            result = self._check_dm_usage(driver_status, task_vars)
-            result['changed'] = result.get('changed', False) or changed
-            return result
+            result = self.check_devicemapper_support(driver_status, task_vars)
 
-        # TODO(lmeyer): determine how to check usage for overlay2
+        if driver in ['overlay', 'overlay2']:
+            result = self.check_overlay_support(docker_info, driver_status, task_vars)
 
-        return {"changed": changed}
+        result['changed'] = result.get('changed', False) or changed
+        return result
 
-    def _check_dm_usage(self, driver_status, task_vars):
-        """
+    def check_devicemapper_support(self, driver_status, task_vars):
+        """Check if dm storage driver is supported as configured. Return: result dict."""
+        if driver_status.get("Data loop file"):
+            msg = (
+                "Use of loopback devices with the Docker devicemapper storage driver\n"
+                "(the default storage configuration) is unsupported in production.\n"
+                "Please use docker-storage-setup to configure a backing storage volume.\n"
+                "See http://red.ht/2rNperO for further information."
+            )
+            return {"failed": True, "msg": msg}
+        result = self.check_dm_usage(driver_status, task_vars)
+        return result
+
+    def check_dm_usage(self, driver_status, task_vars):
+        """Check usage thresholds for Docker dm storage driver. Return: result dict.
         Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
         implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
         devicemapper storage. The LV is "thin" because it does not use all available storage
@@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         could run out of space first; so we check both.
         """
         vals = dict(
-            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+            vg_free=self.get_vg_free(driver_status.get("Pool Name"), task_vars),
             data_used=driver_status.get("Data Space Used"),
             data_total=driver_status.get("Data Space Total"),
             metadata_used=driver_status.get("Metadata Space Used"),
@@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         # convert all human-readable strings to bytes
         for key, value in vals.copy().items():
             try:
-                vals[key + "_bytes"] = self._convert_to_bytes(value)
+                vals[key + "_bytes"] = self.convert_to_bytes(value)
             except ValueError as err:  # unlikely to hit this from API info, but just to be safe
                 return {
                     "failed": True,
@@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
         return vals
 
-    def _get_vg_free(self, pool, task_vars):
-        # Determine which VG to examine according to the pool name, the only indicator currently
-        # available from the Docker API driver info. We assume a name that looks like
-        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+    def get_vg_free(self, pool, task_vars):
+        """Determine which VG to examine according to the pool name. Return: size vgs reports.
+        Pool name is the only indicator currently available from the Docker API driver info.
+        We assume a name that looks like "vg--name-docker--pool";
+        vg and lv names with inner hyphens doubled, joined by a hyphen.
+        """
         match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen
         if not match:  # unlikely, but... be clear if we assumed wrong
             raise OpenShiftCheckException(
@@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         return size
 
     @staticmethod
-    def _convert_to_bytes(string):
+    def convert_to_bytes(string):
+        """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
         units = dict(
             b=1,
             k=1024,
@@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             raise ValueError("Cannot convert to a byte size: " + string)
 
         return float(number) * multiplier
+
+    def check_overlay_support(self, docker_info, driver_status, task_vars):
+        """Check if overlay storage driver is supported for this host. Return: result dict."""
+        # check for xfs as backing store
+        backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+        if backing_fs != "xfs":
+            msg = (
+                "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+                "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+            ).format(fs=backing_fs)
+            return {"failed": True, "msg": msg}
+
+        # check support for OS and kernel version
+        o_s = docker_info.get("OperatingSystem", "[NONE]")
+        if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+            # keep it simple, only check enterprise kernel versions; assume everyone else is good
+            kernel = docker_info.get("KernelVersion", "[NONE]")
+            kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+            if kernel_arr < [3, 10, 0, 514]:  # rhel < 7.3
+                msg = (
+                    "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+                    "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+                ).format(version=kernel)
+                return {"failed": True, "msg": msg}
+            # NOTE: we could check for --selinux-enabled here but docker won't even start with
+            # that option until it's supported in the kernel so we don't need to.
+
+        return self.check_overlay_usage(docker_info, task_vars)
+
+    def check_overlay_usage(self, docker_info, task_vars):
+        """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+        path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+        threshold = get_var(task_vars, "max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+        try:
+            threshold = float(threshold)
+        except ValueError:
+            return {
+                "failed": True,
+                "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+            }
+
+        mount = self.find_ansible_mount(path, get_var(task_vars, "ansible_mounts"))
+        try:
+            free_bytes = mount['size_available']
+            total_bytes = mount['size_total']
+            usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+        except (KeyError, ZeroDivisionError):
+            return {
+                "failed": True,
+                "msg": "The ansible_mount found for path {} is invalid.\n"
+                       "This is likely to be an Ansible bug. The record was:\n"
+                       "{}".format(path, json.dumps(mount, indent=2)),
+            }
+
+        if usage > threshold:
+            return {
+                "failed": True,
+                "msg": (
+                    "For Docker OverlayFS mount point {path},\n"
+                    "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+                ).format(path=mount["mount"], pct=usage, thresh=threshold)
+            }
+
+        return {}
+
+    # TODO(lmeyer): migrate to base class
+    @staticmethod
+    def find_ansible_mount(path, ansible_mounts):
+        """Return the mount point for path from ansible_mounts."""
+
+        mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
+        mount_point = path
+        while mount_point not in mount_for_path:
+            if mount_point in ["/", ""]:  # "/" not in ansible_mounts???
+                break
+            mount_point = os.path.dirname(mount_point)
+
+        try:
+            return mount_for_path[mount_point]
+        except KeyError:
+            known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
+            msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
+            raise OpenShiftCheckException(msg.format(path, known_mounts))
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..40c87873d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,47 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck, get_var
+
+
+class EtcdTraffic(OpenShiftCheck):
+    """Check if host is being affected by an increase in etcd traffic."""
+
+    name = "etcd_traffic"
+    tags = ["health", "etcd"]
+
+    @classmethod
+    def is_active(cls, task_vars):
+        """Skip hosts that do not have etcd in their group names."""
+        group_names = get_var(task_vars, "group_names", default=[])
+        valid_group_names = "etcd" in group_names
+
+        version = get_var(task_vars, "openshift", "common", "short_version")
+        valid_version = version in ("3.4", "3.5", "1.4", "1.5")
+
+        return super(EtcdTraffic, cls).is_active(task_vars) and valid_group_names and valid_version
+
+    def run(self, tmp, task_vars):
+        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+        unit = "etcd_container" if is_containerized else "etcd"
+
+        log_matchers = [{
+            "start_regexp": r"Starting Etcd Server",
+            "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+            "unit": unit
+        }]
+
+        match = self.execute_module("search_journalctl", {
+            "log_matchers": log_matchers,
+        }, task_vars)
+
+        if match.get("matched"):
+            msg = ("Higher than normal etcd traffic detected.\n"
+                   "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+                   "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+                   "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+            return {"failed": True, "msg": msg}
+
+        if match.get("failed"):
+            return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+        return {}
diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py
index bb25e3f66..99c529054 100644
--- a/roles/openshift_health_checker/test/docker_storage_test.py
+++ b/roles/openshift_health_checker/test/docker_storage_test.py
@@ -23,7 +23,8 @@ def test_is_active(is_containerized, group_names, is_active):
     assert DockerStorage.is_active(task_vars=task_vars) == is_active
 
 
-non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
+def non_atomic_task_vars():
+    return {"openshift": {"common": {"is_atomic": False}}}
 
 
 @pytest.mark.parametrize('docker_info, failed, expect_msg', [
@@ -56,7 +57,7 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
     (
         dict(info={
             "Driver": "overlay2",
-            "DriverStatus": []
+            "DriverStatus": [("Backing Filesystem", "xfs")],
         }),
         False,
         [],
@@ -64,6 +65,27 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
     (
         dict(info={
             "Driver": "overlay",
+            "DriverStatus": [("Backing Filesystem", "btrfs")],
+        }),
+        True,
+        ["storage is type 'btrfs'", "only supported with\n'xfs'"],
+    ),
+    (
+        dict(info={
+            "Driver": "overlay2",
+            "DriverStatus": [("Backing Filesystem", "xfs")],
+            "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)",
+            "KernelVersion": "3.10.0-327.22.2.el7.x86_64",
+        }),
+        True,
+        ["Docker reports kernel version 3.10.0-327"],
+    ),
+    (
+        dict(info={
+            "Driver": "overlay",
+            "DriverStatus": [("Backing Filesystem", "xfs")],
+            "OperatingSystem": "CentOS",
+            "KernelVersion": "3.10.0-514",
         }),
         False,
         [],
@@ -85,8 +107,9 @@ def test_check_storage_driver(docker_info, failed, expect_msg):
         return docker_info
 
     check = dummy_check(execute_module=execute_module)
-    check._check_dm_usage = lambda status, task_vars: dict()  # stub out for this test
-    result = check.run(tmp=None, task_vars=non_atomic_task_vars)
+    check.check_dm_usage = lambda status, task_vars: dict()  # stub out for this test
+    check.check_overlay_usage = lambda info, task_vars: dict()  # stub out for this test
+    result = check.run(tmp=None, task_vars=non_atomic_task_vars())
 
     if failed:
         assert result["failed"]
@@ -146,8 +169,8 @@ not_enough_space = {
 ])
 def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
     check = dummy_check()
-    check._get_vg_free = lambda pool, task_vars: vg_free
-    result = check._check_dm_usage(driver_status, task_vars)
+    check.get_vg_free = lambda pool, task_vars: vg_free
+    result = check.check_dm_usage(driver_status, task_vars)
     result_success = not result.get("failed")
 
     assert result_success is success
@@ -195,10 +218,10 @@ def test_vg_free(pool, command_returns, raises, returns):
     check = dummy_check(execute_module=execute_module)
     if raises:
         with pytest.raises(OpenShiftCheckException) as err:
-            check._get_vg_free(pool, {})
+            check.get_vg_free(pool, {})
         assert raises in str(err.value)
     else:
-        ret = check._get_vg_free(pool, {})
+        ret = check.get_vg_free(pool, {})
         assert ret == returns
 
 
@@ -209,7 +232,7 @@ def test_vg_free(pool, command_returns, raises, returns):
     ("12g", 12.0 * 1024**3),
 ])
 def test_convert_to_bytes(string, expect_bytes):
-    got = DockerStorage._convert_to_bytes(string)
+    got = DockerStorage.convert_to_bytes(string)
     assert got == expect_bytes
 
 
@@ -219,6 +242,70 @@ def test_convert_to_bytes(string, expect_bytes):
 ])
 def test_convert_to_bytes_error(string):
     with pytest.raises(ValueError) as err:
-        DockerStorage._convert_to_bytes(string)
+        DockerStorage.convert_to_bytes(string)
     assert "Cannot convert" in str(err.value)
     assert string in str(err.value)
+
+
+ansible_mounts_enough = [{
+    'mount': '/var/lib/docker',
+    'size_available': 50 * 10**9,
+    'size_total': 50 * 10**9,
+}]
+ansible_mounts_not_enough = [{
+    'mount': '/var/lib/docker',
+    'size_available': 0,
+    'size_total': 50 * 10**9,
+}]
+ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')]
+ansible_mounts_zero_size = [{
+    'mount': '/var/lib/docker',
+    'size_available': 0,
+    'size_total': 0,
+}]
+
+
+@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [
+    (
+        ansible_mounts_enough,
+        None,
+        False,
+        [],
+    ),
+    (
+        ansible_mounts_not_enough,
+        None,
+        True,
+        ["usage percentage", "higher than threshold"],
+    ),
+    (
+        ansible_mounts_not_enough,
+        "bogus percent",
+        True,
+        ["is not a percentage"],
+    ),
+    (
+        ansible_mounts_missing_fields,
+        None,
+        True,
+        ["Ansible bug"],
+    ),
+    (
+        ansible_mounts_zero_size,
+        None,
+        True,
+        ["Ansible bug"],
+    ),
+])
+def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg):
+    check = dummy_check()
+    task_vars = non_atomic_task_vars()
+    task_vars["ansible_mounts"] = ansible_mounts
+    if threshold is not None:
+        task_vars["max_overlay_usage_percent"] = threshold
+    docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay")
+    result = check.check_overlay_usage(docker_info, task_vars)
+
+    assert expect_fail == bool(result.get("failed"))
+    for msg in expect_msg:
+        assert msg in result["msg"]
diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py
new file mode 100644
index 000000000..287175e29
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_traffic_test.py
@@ -0,0 +1,80 @@
+import pytest
+
+from openshift_checks.etcd_traffic import EtcdTraffic
+
+
+@pytest.mark.parametrize('group_names,version,is_active', [
+    (['masters'], "3.5", False),
+    (['masters'], "3.6", False),
+    (['nodes'], "3.4", False),
+    (['etcd'], "3.4", True),
+    (['etcd'], "3.5", True),
+    (['etcd'], "3.1", False),
+    (['masters', 'nodes'], "3.5", False),
+    (['masters', 'etcd'], "3.5", True),
+    ([], "3.4", False),
+])
+def test_is_active(group_names, version, is_active):
+    task_vars = dict(
+        group_names=group_names,
+        openshift=dict(
+            common=dict(short_version=version),
+        ),
+    )
+    assert EtcdTraffic.is_active(task_vars=task_vars) == is_active
+
+
+@pytest.mark.parametrize('group_names,matched,failed,extra_words', [
+    (["masters"], True, True, ["Higher than normal", "traffic"]),
+    (["masters", "etcd"], False, False, []),
+    (["etcd"], False, False, []),
+])
+def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words):
+    def execute_module(module_name, args, task_vars):
+        return {
+            "matched": matched,
+            "failed": failed,
+        }
+
+    task_vars = dict(
+        group_names=group_names,
+        openshift=dict(
+            common=dict(service_type="origin", is_containerized=False),
+        )
+    )
+
+    check = EtcdTraffic(execute_module=execute_module)
+    result = check.run(tmp=None, task_vars=task_vars)
+
+    for word in extra_words:
+        assert word in result.get("msg", "")
+
+    assert result.get("failed", False) == failed
+
+
+@pytest.mark.parametrize('is_containerized,expected_unit_value', [
+    (False, "etcd"),
+    (True, "etcd_container"),
+])
+def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value):
+    task_vars = dict(
+        openshift=dict(
+            common=dict(is_containerized=is_containerized),
+        )
+    )
+
+    def execute_module(module_name, args, task_vars):
+        assert module_name == "search_journalctl"
+        matchers = args["log_matchers"]
+
+        for matcher in matchers:
+            assert matcher["unit"] == expected_unit_value
+
+        return {"failed": False}
+
+    check = EtcdTraffic(execute_module=execute_module)
+    check.run(tmp=None, task_vars=task_vars)
+
+
+def fake_execute_module(*args):
+    raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py
new file mode 100644
index 000000000..724928aa1
--- /dev/null
+++ b/roles/openshift_health_checker/test/search_journalctl_test.py
@@ -0,0 +1,157 @@
+import pytest
+import search_journalctl
+
+
+def canned_search_journalctl(get_log_output=None):
+    """Create a search_journalctl object with canned get_log_output method"""
+    module = search_journalctl
+    if get_log_output:
+        module.get_log_output = get_log_output
+    return module
+
+
+DEFAULT_TIMESTAMP = 1496341364
+
+
+def get_timestamp(modifier=0):
+    return DEFAULT_TIMESTAMP + modifier
+
+
+def get_timestamp_microseconds(modifier=0):
+    return get_timestamp(modifier) * 1000000
+
+
+def create_test_log_object(stamp, msg):
+    return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg)
+
+
+@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [
+    (
+        'test with valid params',
+        [
+            {
+                "start_regexp": r"Sample Logs Beginning",
+                "regexp": r"test log message",
+                "unit": "test",
+            },
+        ],
+        [
+            create_test_log_object(get_timestamp_microseconds(), "test log message"),
+            create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+        ],
+        ["test log message"],
+        [],
+    ),
+    (
+        'test with invalid json in log input',
+        [
+            {
+                "start_regexp": r"Sample Logs Beginning",
+                "regexp": r"test log message",
+                "unit": "test-unit",
+            },
+        ],
+        [
+            '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}',
+        ],
+        [],
+        [
+            ["invalid json", "test-unit", "test log message"],
+        ],
+    ),
+    (
+        'test with invalid regexp',
+        [
+            {
+                "start_regexp": r"Sample Logs Beginning",
+                "regexp": r"test [ log message",
+                "unit": "test",
+            },
+        ],
+        [
+            create_test_log_object(get_timestamp_microseconds(), "test log message"),
+            create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+            create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+            create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+            create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+        ],
+        [],
+        [
+            ["invalid regular expression"],
+        ],
+    ),
+], ids=lambda argval: argval[0])
+def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors):
+    def get_log_output(matcher):
+        return log_input
+
+    module = canned_search_journalctl(get_log_output)
+    matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60)
+
+    assert set(matched_regexp) == set(expected_matches)
+    assert len(expected_errors) == len(errors)
+
+    for idx, partial_err_set in enumerate(expected_errors):
+        for partial_err_msg in partial_err_set:
+            assert partial_err_msg in errors[idx]
+
+
+@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [
+    (
+        'test with matching log message, but out of bounds of log_count_lim',
+        {
+            "start_regexp": r"Sample Logs Beginning",
+            "regexp": r"dummy log message",
+            "unit": "test",
+        },
+        3,
+        get_timestamp(-100 * 60 * 60),
+        [
+            create_test_log_object(get_timestamp_microseconds(), "test log message"),
+            create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+            create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+            create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+            create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+        ],
+        None,
+    ),
+    (
+        'test with matching log message, but with timestamp too old',
+        {
+            "start_regexp": r"Sample Logs Beginning",
+            "regexp": r"dummy log message",
+            "unit": "test",
+        },
+        100,
+        get_timestamp(-10),
+        [
+            create_test_log_object(get_timestamp_microseconds(), "test log message"),
+            create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+            create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+            create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+            create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+        ],
+        None,
+    ),
+    (
+        'test with matching log message, and timestamp within time limit',
+        {
+            "start_regexp": r"Sample Logs Beginning",
+            "regexp": r"dummy log message",
+            "unit": "test",
+        },
+        100,
+        get_timestamp(-1010),
+        [
+            create_test_log_object(get_timestamp_microseconds(), "test log message"),
+            create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+            create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+            create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+            create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+        ],
+        create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+    ),
+], ids=lambda argval: argval[0])
+def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match):
+    match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds)
+    assert match == expected_match