diff options
Diffstat (limited to 'roles')
36 files changed, 843 insertions, 85 deletions
diff --git a/roles/ansible_service_broker/defaults/main.yml b/roles/ansible_service_broker/defaults/main.yml index aa1c5022b..12929b354 100644 --- a/roles/ansible_service_broker/defaults/main.yml +++ b/roles/ansible_service_broker/defaults/main.yml @@ -4,6 +4,7 @@ ansible_service_broker_remove: false ansible_service_broker_log_level: info ansible_service_broker_output_request: false ansible_service_broker_recovery: true +ansible_service_broker_bootstrap_on_startup: true # Recommended you do not enable this for now ansible_service_broker_dev_broker: false ansible_service_broker_launch_apb_on_bind: false diff --git a/roles/ansible_service_broker/tasks/install.yml b/roles/ansible_service_broker/tasks/install.yml index 65dffc89b..b3797ef96 100644 --- a/roles/ansible_service_broker/tasks/install.yml +++ b/roles/ansible_service_broker/tasks/install.yml @@ -42,6 +42,14 @@ namespace: openshift-ansible-service-broker state: present +- name: Set SA cluster-role + oc_adm_policy_user: + state: present + namespace: "openshift-ansible-service-broker" + resource_kind: cluster-role + resource_name: admin + user: "system:serviceaccount:openshift-ansible-service-broker:asb" + - name: create ansible-service-broker service oc_service: name: asb @@ -254,6 +262,7 @@ launch_apb_on_bind: {{ ansible_service_broker_launch_apb_on_bind | bool | lower }} recovery: {{ ansible_service_broker_recovery | bool | lower }} output_request: {{ ansible_service_broker_output_request | bool | lower }} + bootstrap_on_startup: {{ ansible_service_broker_bootstrap_on_startup | bool | lower }} - name: Create the Broker resource in the catalog oc_obj: diff --git a/roles/calico/handlers/main.yml b/roles/calico/handlers/main.yml index 53cecfcc3..67fc0065f 100644 --- a/roles/calico/handlers/main.yml +++ b/roles/calico/handlers/main.yml @@ -8,3 +8,7 @@ systemd: name: "{{ openshift.docker.service_name }}" state: restarted + register: l_docker_restart_docker_in_calico_result + until: not l_docker_restart_docker_in_calico_result | failed + retries: 3 + delay: 30 diff --git a/roles/contiv/tasks/netplugin.yml b/roles/contiv/tasks/netplugin.yml index 0847c92bc..e861a2591 100644 --- a/roles/contiv/tasks/netplugin.yml +++ b/roles/contiv/tasks/netplugin.yml @@ -108,6 +108,10 @@ name: "{{ openshift.docker.service_name }}" state: restarted when: docker_updated|changed + register: l_docker_restart_docker_in_contiv_result + until: not l_docker_restart_docker_in_contiv_result | failed + retries: 3 + delay: 30 - name: Netplugin | Enable Netplugin service: diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml index 3a4f4ba92..591367467 100644 --- a/roles/docker/handlers/main.yml +++ b/roles/docker/handlers/main.yml @@ -6,9 +6,8 @@ state: restarted register: r_docker_restart_docker_result until: not r_docker_restart_docker_result | failed - retries: 1 + retries: 3 delay: 30 - when: not docker_service_status_changed | default(false) | bool - name: restart udev diff --git a/roles/docker/tasks/package_docker.yml b/roles/docker/tasks/package_docker.yml index c82d8659a..bc52ab60c 100644 --- a/roles/docker/tasks/package_docker.yml +++ b/roles/docker/tasks/package_docker.yml @@ -93,7 +93,7 @@ dest: /etc/sysconfig/docker regexp: '^OPTIONS=.*$' line: "OPTIONS='\ - {% if ansible_selinux.status | default(None) == '''enabled''' and docker_selinux_enabled | default(true) %} --selinux-enabled {% endif %}\ + {% if ansible_selinux.status | default(None) == 'enabled' and docker_selinux_enabled | default(true) | bool %} --selinux-enabled {% endif %}\ {% if docker_log_driver is defined %} --log-driver {{ docker_log_driver }}{% endif %}\ {% if docker_log_options is defined %} {{ docker_log_options | oo_split() | oo_prepend_strings_in_list('--log-opt ') | join(' ')}}{% endif %}\ {% if docker_options is defined %} {{ docker_options }}{% endif %}\ @@ -123,9 +123,12 @@ enabled: yes state: started daemon_reload: yes - register: start_result + register: r_docker_package_docker_start_result + until: not r_docker_package_docker_start_result | failed + retries: 3 + delay: 30 - set_fact: - docker_service_status_changed: start_result | changed + docker_service_status_changed: "{{ r_docker_package_docker_start_result | changed }}" - meta: flush_handlers diff --git a/roles/docker/tasks/systemcontainer_docker.yml b/roles/docker/tasks/systemcontainer_docker.yml index d8c5ccfd3..57a84bc2c 100644 --- a/roles/docker/tasks/systemcontainer_docker.yml +++ b/roles/docker/tasks/systemcontainer_docker.yml @@ -46,6 +46,11 @@ state: stopped daemon_reload: yes ignore_errors: True + register: r_docker_systemcontainer_docker_stop_result + until: not r_docker_systemcontainer_docker_stop_result | failed + retries: 3 + delay: 30 + # Set http_proxy, https_proxy, and no_proxy in /etc/atomic.conf # regexp: the line starts with or without #, followed by the string @@ -160,9 +165,12 @@ enabled: yes state: started daemon_reload: yes - register: start_result + register: r_docker_systemcontainer_docker_start_result + until: not r_docker_systemcontainer_docker_start_result | failed + retries: 3 + delay: 30 - set_fact: - docker_service_status_changed: start_result | changed + docker_service_status_changed: "{{ r_docker_systemcontainer_docker_start_result | changed }}" - meta: flush_handlers diff --git a/roles/flannel/handlers/main.yml b/roles/flannel/handlers/main.yml index c60c2115a..02f5a5f64 100644 --- a/roles/flannel/handlers/main.yml +++ b/roles/flannel/handlers/main.yml @@ -8,3 +8,7 @@ systemd: name: "{{ openshift.docker.service_name }}" state: restarted + register: l_docker_restart_docker_in_flannel_result + until: not l_docker_restart_docker_in_flannel_result | failed + retries: 3 + delay: 30 diff --git a/roles/openshift_default_storage_class/defaults/main.yml b/roles/openshift_default_storage_class/defaults/main.yml index bda83c933..4f371fd89 100644 --- a/roles/openshift_default_storage_class/defaults/main.yml +++ b/roles/openshift_default_storage_class/defaults/main.yml @@ -12,6 +12,7 @@ openshift_storageclass_defaults: provisioner: kubernetes.io/gce-pd type: pd-standard +openshift_storageclass_default: "true" openshift_storageclass_name: "{{ openshift_storageclass_defaults[openshift_cloudprovider_kind]['name'] }}" openshift_storageclass_provisioner: "{{ openshift_storageclass_defaults[openshift_cloudprovider_kind]['provisioner'] }}" openshift_storageclass_parameters: "{{ openshift_storageclass_defaults[openshift_cloudprovider_kind]['parameters'] }}" diff --git a/roles/openshift_default_storage_class/tasks/main.yml b/roles/openshift_default_storage_class/tasks/main.yml index fd5e4fabe..82cab6746 100644 --- a/roles/openshift_default_storage_class/tasks/main.yml +++ b/roles/openshift_default_storage_class/tasks/main.yml @@ -2,9 +2,8 @@ # Install default storage classes in GCE & AWS - name: Ensure storageclass object oc_storageclass: - kind: storageclass name: "{{ openshift_storageclass_name }}" - default_storage_class: "true" + default_storage_class: "{{ openshift_storageclass_default | default('true') | string}}" parameters: type: "{{ openshift_storageclass_parameters.type | default('gp2') }}" encrypted: "{{ openshift_storageclass_parameters.encrypted | default('false') | string }}" diff --git a/roles/openshift_examples/tasks/main.yml b/roles/openshift_examples/tasks/main.yml index 551e21e72..1a4562776 100644 --- a/roles/openshift_examples/tasks/main.yml +++ b/roles/openshift_examples/tasks/main.yml @@ -53,7 +53,7 @@ # RHEL and Centos image streams are mutually exclusive - name: Import RHEL streams command: > - {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} -n openshift -f {{ item }} + {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift -f {{ item }} when: openshift_examples_load_rhel | bool with_items: - "{{ rhel_image_streams }}" @@ -63,7 +63,7 @@ - name: Import Centos Image streams command: > - {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} -n openshift -f {{ centos_image_streams }} + {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift -f {{ centos_image_streams }} when: openshift_examples_load_centos | bool register: oex_import_centos_streams failed_when: "'already exists' not in oex_import_centos_streams.stderr and oex_import_centos_streams.rc != 0" @@ -71,7 +71,7 @@ - name: Import db templates command: > - {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} -n openshift -f {{ db_templates_base }} + {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift -f {{ db_templates_base }} when: openshift_examples_load_db_templates | bool register: oex_import_db_templates failed_when: "'already exists' not in oex_import_db_templates.stderr and oex_import_db_templates.rc != 0" @@ -88,7 +88,7 @@ - "{{ quickstarts_base }}/django.json" - name: Remove defunct quickstart templates from openshift namespace - command: "{{ openshift.common.client_binary }} -n openshift delete templates/{{ item }}" + command: "{{ openshift.common.client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift delete templates/{{ item }}" with_items: - nodejs-example - cakephp-example @@ -100,7 +100,7 @@ - name: Import quickstart-templates command: > - {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} -n openshift -f {{ quickstarts_base }} + {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift -f {{ quickstarts_base }} when: openshift_examples_load_quickstarts | bool register: oex_import_quickstarts failed_when: "'already exists' not in oex_import_quickstarts.stderr and oex_import_quickstarts.rc != 0" @@ -114,7 +114,7 @@ - "{{ xpaas_templates_base }}/sso70-basic.json" - name: Remove old xPaas templates from openshift namespace - command: "{{ openshift.common.client_binary }} -n openshift delete templates/{{ item }}" + command: "{{ openshift.common.client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift delete templates/{{ item }}" with_items: - sso70-basic register: oex_delete_old_xpaas_templates @@ -123,7 +123,7 @@ - name: Import xPaas image streams command: > - {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} -n openshift -f {{ xpaas_image_streams }} + {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift -f {{ xpaas_image_streams }} when: openshift_examples_load_xpaas | bool register: oex_import_xpaas_streams failed_when: "'already exists' not in oex_import_xpaas_streams.stderr and oex_import_xpaas_streams.rc != 0" @@ -131,7 +131,7 @@ - name: Import xPaas templates command: > - {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} -n openshift -f {{ xpaas_templates_base }} + {{ openshift.common.client_binary }} {{ openshift_examples_import_command }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n openshift -f {{ xpaas_templates_base }} when: openshift_examples_load_xpaas | bool register: oex_import_xpaas_templates failed_when: "'already exists' not in oex_import_xpaas_templates.stderr and oex_import_xpaas_templates.rc != 0" diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py new file mode 100644 index 000000000..3631f71c8 --- /dev/null +++ b/roles/openshift_health_checker/library/search_journalctl.py @@ -0,0 +1,150 @@ +#!/usr/bin/python +"""Interface to journalctl.""" + +from time import time +import json +import re +import subprocess + +from ansible.module_utils.basic import AnsibleModule + + +class InvalidMatcherRegexp(Exception): + """Exception class for invalid matcher regexp.""" + pass + + +class InvalidLogEntry(Exception): + """Exception class for invalid / non-json log entries.""" + pass + + +class LogInputSubprocessError(Exception): + """Exception class for errors that occur while executing a subprocess.""" + pass + + +def main(): + """Scan a given list of "log_matchers" for journalctl messages containing given patterns. + "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching: + 'start_regexp', 'regexp', and 'unit'. + + Sample "log_matchers" list: + + [ + { + 'start_regexp': r'Beginning of systemd unit', + 'regexp': r'the specific log message to find', + 'unit': 'etcd', + } + ] + """ + module = AnsibleModule( + argument_spec=dict( + log_count_limit=dict(type="int", default=500), + log_matchers=dict(type="list", required=True), + ), + ) + + timestamp_limit_seconds = time() - 60 * 60 # 1 hour + + log_count_limit = module.params["log_count_limit"] + log_matchers = module.params["log_matchers"] + + matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds) + + module.exit_json( + changed=False, + failed=bool(errors), + errors=errors, + matched=matched_regexp, + ) + + +def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds): + """Return a list of up to log_count_limit matches for each matcher. + + Log entries are only considered if newer than timestamp_limit_seconds. + """ + matched_regexp = [] + errors = [] + + for matcher in matchers: + try: + log_output = get_log_output(matcher) + except LogInputSubprocessError as err: + errors.append(str(err)) + continue + + try: + matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds) + if matched: + matched_regexp.append(matcher.get("regexp", "")) + except InvalidMatcherRegexp as err: + errors.append(str(err)) + except InvalidLogEntry as err: + errors.append(str(err)) + + return matched_regexp, errors + + +def get_log_output(matcher): + """Return an iterator on the logs of a given matcher.""" + try: + cmd_output = subprocess.Popen(list([ + '/bin/journalctl', + '-ru', matcher.get("unit", ""), + '--output', 'json', + ]), stdout=subprocess.PIPE) + + return iter(cmd_output.stdout.readline, '') + + except subprocess.CalledProcessError as exc: + msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}" + raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc))) + except OSError as exc: + raise LogInputSubprocessError(str(exc)) + + +def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds): + """Return log messages matched in iterable log_output by a given matcher. + + Ignore any log_output items older than timestamp_limit_seconds. + """ + try: + regexp = re.compile(matcher.get("regexp", "")) + start_regexp = re.compile(matcher.get("start_regexp", "")) + except re.error as err: + msg = "A log matcher object was provided with an invalid regular expression: {}" + raise InvalidMatcherRegexp(msg.format(str(err))) + + matched = None + + for log_count, line in enumerate(log_output): + if log_count >= log_count_limit: + break + + try: + obj = json.loads(line) + + # don't need to look past the most recent service restart + if start_regexp.match(obj["MESSAGE"]): + break + + log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000 + if log_timestamp_seconds < timestamp_limit_seconds: + break + + if regexp.match(obj["MESSAGE"]): + matched = line + break + + except ValueError: + msg = "Log entry for systemd unit {} contained invalid json syntax: {}" + raise InvalidLogEntry(msg.format(matcher.get("unit"), line)) + + return matched + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py index e80691ef3..d2227d244 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_storage.py +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -1,5 +1,6 @@ """Check Docker storage driver and usage.""" import json +import os.path import re from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var from openshift_checks.mixins import DockerHostMixin @@ -20,10 +21,27 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): storage_drivers = ["devicemapper", "overlay", "overlay2"] max_thinpool_data_usage_percent = 90.0 max_thinpool_meta_usage_percent = 90.0 + max_overlay_usage_percent = 90.0 + + # TODO(lmeyer): mention these in the output when check fails + configuration_variables = [ + ( + "max_thinpool_data_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for data. " + "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent), + ), + ( + "max_thinpool_meta_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for metadata. " + "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent), + ), + ( + "max_overlay_usage_percent", + "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. " + "Format: float. Default: {:.1f}".format(max_overlay_usage_percent), + ), + ] - # pylint: disable=too-many-return-statements - # Reason: permanent stylistic exception; - # it is clearer to return on failures and there are just many ways to fail here. def run(self, tmp, task_vars): msg, failed, changed = self.ensure_dependencies(task_vars) if failed: @@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): } # attempt to get the docker info hash from the API - info = self.execute_module("docker_info", {}, task_vars=task_vars) - if info.get("failed"): + docker_info = self.execute_module("docker_info", {}, task_vars=task_vars) + if docker_info.get("failed"): return {"failed": True, "changed": changed, "msg": "Failed to query Docker API. Is docker running on this host?"} - if not info.get("info"): # this would be very strange + if not docker_info.get("info"): # this would be very strange return {"failed": True, "changed": changed, - "msg": "Docker API query missing info:\n{}".format(json.dumps(info))} - info = info["info"] + "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))} + docker_info = docker_info["info"] # check if the storage driver we saw is valid - driver = info.get("Driver", "[NONE]") + driver = docker_info.get("Driver", "[NONE]") if driver not in self.storage_drivers: msg = ( "Detected unsupported Docker storage driver '{driver}'.\n" @@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): return {"failed": True, "changed": changed, "msg": msg} # driver status info is a list of tuples; convert to dict and validate based on driver - driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])} + driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} + + result = {} + if driver == "devicemapper": - if driver_status.get("Data loop file"): - msg = ( - "Use of loopback devices with the Docker devicemapper storage driver\n" - "(the default storage configuration) is unsupported in production.\n" - "Please use docker-storage-setup to configure a backing storage volume.\n" - "See http://red.ht/2rNperO for further information." - ) - return {"failed": True, "changed": changed, "msg": msg} - result = self._check_dm_usage(driver_status, task_vars) - result['changed'] = result.get('changed', False) or changed - return result + result = self.check_devicemapper_support(driver_status, task_vars) - # TODO(lmeyer): determine how to check usage for overlay2 + if driver in ['overlay', 'overlay2']: + result = self.check_overlay_support(docker_info, driver_status, task_vars) - return {"changed": changed} + result['changed'] = result.get('changed', False) or changed + return result - def _check_dm_usage(self, driver_status, task_vars): - """ + def check_devicemapper_support(self, driver_status, task_vars): + """Check if dm storage driver is supported as configured. Return: result dict.""" + if driver_status.get("Data loop file"): + msg = ( + "Use of loopback devices with the Docker devicemapper storage driver\n" + "(the default storage configuration) is unsupported in production.\n" + "Please use docker-storage-setup to configure a backing storage volume.\n" + "See http://red.ht/2rNperO for further information." + ) + return {"failed": True, "msg": msg} + result = self.check_dm_usage(driver_status, task_vars) + return result + + def check_dm_usage(self, driver_status, task_vars): + """Check usage thresholds for Docker dm storage driver. Return: result dict. Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures devicemapper storage. The LV is "thin" because it does not use all available storage @@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): could run out of space first; so we check both. """ vals = dict( - vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars), + vg_free=self.get_vg_free(driver_status.get("Pool Name"), task_vars), data_used=driver_status.get("Data Space Used"), data_total=driver_status.get("Data Space Total"), metadata_used=driver_status.get("Metadata Space Used"), @@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): # convert all human-readable strings to bytes for key, value in vals.copy().items(): try: - vals[key + "_bytes"] = self._convert_to_bytes(value) + vals[key + "_bytes"] = self.convert_to_bytes(value) except ValueError as err: # unlikely to hit this from API info, but just to be safe return { "failed": True, @@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."]) return vals - def _get_vg_free(self, pool, task_vars): - # Determine which VG to examine according to the pool name, the only indicator currently - # available from the Docker API driver info. We assume a name that looks like - # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen. + def get_vg_free(self, pool, task_vars): + """Determine which VG to examine according to the pool name. Return: size vgs reports. + Pool name is the only indicator currently available from the Docker API driver info. + We assume a name that looks like "vg--name-docker--pool"; + vg and lv names with inner hyphens doubled, joined by a hyphen. + """ match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen if not match: # unlikely, but... be clear if we assumed wrong raise OpenShiftCheckException( @@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): return size @staticmethod - def _convert_to_bytes(string): + def convert_to_bytes(string): + """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes.""" units = dict( b=1, k=1024, @@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): raise ValueError("Cannot convert to a byte size: " + string) return float(number) * multiplier + + def check_overlay_support(self, docker_info, driver_status, task_vars): + """Check if overlay storage driver is supported for this host. Return: result dict.""" + # check for xfs as backing store + backing_fs = driver_status.get("Backing Filesystem", "[NONE]") + if backing_fs != "xfs": + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n" + "'xfs' as the backing storage, but this host's storage is type '{fs}'." + ).format(fs=backing_fs) + return {"failed": True, "msg": msg} + + # check support for OS and kernel version + o_s = docker_info.get("OperatingSystem", "[NONE]") + if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s: + # keep it simple, only check enterprise kernel versions; assume everyone else is good + kernel = docker_info.get("KernelVersion", "[NONE]") + kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)] + if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3 + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n" + "kernel version 3.10.0-514; but Docker reports kernel version {version}." + ).format(version=kernel) + return {"failed": True, "msg": msg} + # NOTE: we could check for --selinux-enabled here but docker won't even start with + # that option until it's supported in the kernel so we don't need to. + + return self.check_overlay_usage(docker_info, task_vars) + + def check_overlay_usage(self, docker_info, task_vars): + """Check disk usage on OverlayFS backing store volume. Return: result dict.""" + path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"] + + threshold = get_var(task_vars, "max_overlay_usage_percent", default=self.max_overlay_usage_percent) + try: + threshold = float(threshold) + except ValueError: + return { + "failed": True, + "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold), + } + + mount = self.find_ansible_mount(path, get_var(task_vars, "ansible_mounts")) + try: + free_bytes = mount['size_available'] + total_bytes = mount['size_total'] + usage = 100.0 * (total_bytes - free_bytes) / total_bytes + except (KeyError, ZeroDivisionError): + return { + "failed": True, + "msg": "The ansible_mount found for path {} is invalid.\n" + "This is likely to be an Ansible bug. The record was:\n" + "{}".format(path, json.dumps(mount, indent=2)), + } + + if usage > threshold: + return { + "failed": True, + "msg": ( + "For Docker OverlayFS mount point {path},\n" + "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}." + ).format(path=mount["mount"], pct=usage, thresh=threshold) + } + + return {} + + # TODO(lmeyer): migrate to base class + @staticmethod + def find_ansible_mount(path, ansible_mounts): + """Return the mount point for path from ansible_mounts.""" + + mount_for_path = {mount['mount']: mount for mount in ansible_mounts} + mount_point = path + while mount_point not in mount_for_path: + if mount_point in ["/", ""]: # "/" not in ansible_mounts??? + break + mount_point = os.path.dirname(mount_point) + + try: + return mount_for_path[mount_point] + except KeyError: + known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none' + msg = 'Unable to determine mount point for path "{}". Known mount points: {}.' + raise OpenShiftCheckException(msg.format(path, known_mounts)) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py new file mode 100644 index 000000000..40c87873d --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py @@ -0,0 +1,47 @@ +"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic.""" + +from openshift_checks import OpenShiftCheck, get_var + + +class EtcdTraffic(OpenShiftCheck): + """Check if host is being affected by an increase in etcd traffic.""" + + name = "etcd_traffic" + tags = ["health", "etcd"] + + @classmethod + def is_active(cls, task_vars): + """Skip hosts that do not have etcd in their group names.""" + group_names = get_var(task_vars, "group_names", default=[]) + valid_group_names = "etcd" in group_names + + version = get_var(task_vars, "openshift", "common", "short_version") + valid_version = version in ("3.4", "3.5", "1.4", "1.5") + + return super(EtcdTraffic, cls).is_active(task_vars) and valid_group_names and valid_version + + def run(self, tmp, task_vars): + is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") + unit = "etcd_container" if is_containerized else "etcd" + + log_matchers = [{ + "start_regexp": r"Starting Etcd Server", + "regexp": r"etcd: sync duration of [^,]+, expected less than 1s", + "unit": unit + }] + + match = self.execute_module("search_journalctl", { + "log_matchers": log_matchers, + }, task_vars) + + if match.get("matched"): + msg = ("Higher than normal etcd traffic detected.\n" + "OpenShift 3.4 introduced an increase in etcd traffic.\n" + "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n" + "Please refer to https://access.redhat.com/solutions/2916381 for more information.") + return {"failed": True, "msg": msg} + + if match.get("failed"): + return {"failed": True, "msg": "\n".join(match.get("errors"))} + + return {} diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py index bb25e3f66..99c529054 100644 --- a/roles/openshift_health_checker/test/docker_storage_test.py +++ b/roles/openshift_health_checker/test/docker_storage_test.py @@ -23,7 +23,8 @@ def test_is_active(is_containerized, group_names, is_active): assert DockerStorage.is_active(task_vars=task_vars) == is_active -non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}} +def non_atomic_task_vars(): + return {"openshift": {"common": {"is_atomic": False}}} @pytest.mark.parametrize('docker_info, failed, expect_msg', [ @@ -56,7 +57,7 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}} ( dict(info={ "Driver": "overlay2", - "DriverStatus": [] + "DriverStatus": [("Backing Filesystem", "xfs")], }), False, [], @@ -64,6 +65,27 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}} ( dict(info={ "Driver": "overlay", + "DriverStatus": [("Backing Filesystem", "btrfs")], + }), + True, + ["storage is type 'btrfs'", "only supported with\n'xfs'"], + ), + ( + dict(info={ + "Driver": "overlay2", + "DriverStatus": [("Backing Filesystem", "xfs")], + "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)", + "KernelVersion": "3.10.0-327.22.2.el7.x86_64", + }), + True, + ["Docker reports kernel version 3.10.0-327"], + ), + ( + dict(info={ + "Driver": "overlay", + "DriverStatus": [("Backing Filesystem", "xfs")], + "OperatingSystem": "CentOS", + "KernelVersion": "3.10.0-514", }), False, [], @@ -85,8 +107,9 @@ def test_check_storage_driver(docker_info, failed, expect_msg): return docker_info check = dummy_check(execute_module=execute_module) - check._check_dm_usage = lambda status, task_vars: dict() # stub out for this test - result = check.run(tmp=None, task_vars=non_atomic_task_vars) + check.check_dm_usage = lambda status, task_vars: dict() # stub out for this test + check.check_overlay_usage = lambda info, task_vars: dict() # stub out for this test + result = check.run(tmp=None, task_vars=non_atomic_task_vars()) if failed: assert result["failed"] @@ -146,8 +169,8 @@ not_enough_space = { ]) def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg): check = dummy_check() - check._get_vg_free = lambda pool, task_vars: vg_free - result = check._check_dm_usage(driver_status, task_vars) + check.get_vg_free = lambda pool, task_vars: vg_free + result = check.check_dm_usage(driver_status, task_vars) result_success = not result.get("failed") assert result_success is success @@ -195,10 +218,10 @@ def test_vg_free(pool, command_returns, raises, returns): check = dummy_check(execute_module=execute_module) if raises: with pytest.raises(OpenShiftCheckException) as err: - check._get_vg_free(pool, {}) + check.get_vg_free(pool, {}) assert raises in str(err.value) else: - ret = check._get_vg_free(pool, {}) + ret = check.get_vg_free(pool, {}) assert ret == returns @@ -209,7 +232,7 @@ def test_vg_free(pool, command_returns, raises, returns): ("12g", 12.0 * 1024**3), ]) def test_convert_to_bytes(string, expect_bytes): - got = DockerStorage._convert_to_bytes(string) + got = DockerStorage.convert_to_bytes(string) assert got == expect_bytes @@ -219,6 +242,70 @@ def test_convert_to_bytes(string, expect_bytes): ]) def test_convert_to_bytes_error(string): with pytest.raises(ValueError) as err: - DockerStorage._convert_to_bytes(string) + DockerStorage.convert_to_bytes(string) assert "Cannot convert" in str(err.value) assert string in str(err.value) + + +ansible_mounts_enough = [{ + 'mount': '/var/lib/docker', + 'size_available': 50 * 10**9, + 'size_total': 50 * 10**9, +}] +ansible_mounts_not_enough = [{ + 'mount': '/var/lib/docker', + 'size_available': 0, + 'size_total': 50 * 10**9, +}] +ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')] +ansible_mounts_zero_size = [{ + 'mount': '/var/lib/docker', + 'size_available': 0, + 'size_total': 0, +}] + + +@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [ + ( + ansible_mounts_enough, + None, + False, + [], + ), + ( + ansible_mounts_not_enough, + None, + True, + ["usage percentage", "higher than threshold"], + ), + ( + ansible_mounts_not_enough, + "bogus percent", + True, + ["is not a percentage"], + ), + ( + ansible_mounts_missing_fields, + None, + True, + ["Ansible bug"], + ), + ( + ansible_mounts_zero_size, + None, + True, + ["Ansible bug"], + ), +]) +def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg): + check = dummy_check() + task_vars = non_atomic_task_vars() + task_vars["ansible_mounts"] = ansible_mounts + if threshold is not None: + task_vars["max_overlay_usage_percent"] = threshold + docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay") + result = check.check_overlay_usage(docker_info, task_vars) + + assert expect_fail == bool(result.get("failed")) + for msg in expect_msg: + assert msg in result["msg"] diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py new file mode 100644 index 000000000..287175e29 --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_traffic_test.py @@ -0,0 +1,80 @@ +import pytest + +from openshift_checks.etcd_traffic import EtcdTraffic + + +@pytest.mark.parametrize('group_names,version,is_active', [ + (['masters'], "3.5", False), + (['masters'], "3.6", False), + (['nodes'], "3.4", False), + (['etcd'], "3.4", True), + (['etcd'], "3.5", True), + (['etcd'], "3.1", False), + (['masters', 'nodes'], "3.5", False), + (['masters', 'etcd'], "3.5", True), + ([], "3.4", False), +]) +def test_is_active(group_names, version, is_active): + task_vars = dict( + group_names=group_names, + openshift=dict( + common=dict(short_version=version), + ), + ) + assert EtcdTraffic.is_active(task_vars=task_vars) == is_active + + +@pytest.mark.parametrize('group_names,matched,failed,extra_words', [ + (["masters"], True, True, ["Higher than normal", "traffic"]), + (["masters", "etcd"], False, False, []), + (["etcd"], False, False, []), +]) +def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words): + def execute_module(module_name, args, task_vars): + return { + "matched": matched, + "failed": failed, + } + + task_vars = dict( + group_names=group_names, + openshift=dict( + common=dict(service_type="origin", is_containerized=False), + ) + ) + + check = EtcdTraffic(execute_module=execute_module) + result = check.run(tmp=None, task_vars=task_vars) + + for word in extra_words: + assert word in result.get("msg", "") + + assert result.get("failed", False) == failed + + +@pytest.mark.parametrize('is_containerized,expected_unit_value', [ + (False, "etcd"), + (True, "etcd_container"), +]) +def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value): + task_vars = dict( + openshift=dict( + common=dict(is_containerized=is_containerized), + ) + ) + + def execute_module(module_name, args, task_vars): + assert module_name == "search_journalctl" + matchers = args["log_matchers"] + + for matcher in matchers: + assert matcher["unit"] == expected_unit_value + + return {"failed": False} + + check = EtcdTraffic(execute_module=execute_module) + check.run(tmp=None, task_vars=task_vars) + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py new file mode 100644 index 000000000..724928aa1 --- /dev/null +++ b/roles/openshift_health_checker/test/search_journalctl_test.py @@ -0,0 +1,157 @@ +import pytest +import search_journalctl + + +def canned_search_journalctl(get_log_output=None): + """Create a search_journalctl object with canned get_log_output method""" + module = search_journalctl + if get_log_output: + module.get_log_output = get_log_output + return module + + +DEFAULT_TIMESTAMP = 1496341364 + + +def get_timestamp(modifier=0): + return DEFAULT_TIMESTAMP + modifier + + +def get_timestamp_microseconds(modifier=0): + return get_timestamp(modifier) * 1000000 + + +def create_test_log_object(stamp, msg): + return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg) + + +@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [ + ( + 'test with valid params', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test log message", + "unit": "test", + }, + ], + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + ["test log message"], + [], + ), + ( + 'test with invalid json in log input', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test log message", + "unit": "test-unit", + }, + ], + [ + '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}', + ], + [], + [ + ["invalid json", "test-unit", "test log message"], + ], + ), + ( + 'test with invalid regexp', + [ + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"test [ log message", + "unit": "test", + }, + ], + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + [], + [ + ["invalid regular expression"], + ], + ), +], ids=lambda argval: argval[0]) +def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors): + def get_log_output(matcher): + return log_input + + module = canned_search_journalctl(get_log_output) + matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60) + + assert set(matched_regexp) == set(expected_matches) + assert len(expected_errors) == len(errors) + + for idx, partial_err_set in enumerate(expected_errors): + for partial_err_msg in partial_err_set: + assert partial_err_msg in errors[idx] + + +@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [ + ( + 'test with matching log message, but out of bounds of log_count_lim', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 3, + get_timestamp(-100 * 60 * 60), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), + ], + None, + ), + ( + 'test with matching log message, but with timestamp too old', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 100, + get_timestamp(-10), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), + ], + None, + ), + ( + 'test with matching log message, and timestamp within time limit', + { + "start_regexp": r"Sample Logs Beginning", + "regexp": r"dummy log message", + "unit": "test", + }, + 100, + get_timestamp(-1010), + [ + create_test_log_object(get_timestamp_microseconds(), "test log message"), + create_test_log_object(get_timestamp_microseconds(), "sample log message"), + create_test_log_object(get_timestamp_microseconds(), "fake log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), + ], + create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), + ), +], ids=lambda argval: argval[0]) +def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match): + match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds) + assert match == expected_match diff --git a/roles/openshift_hosted/tasks/registry/storage/glusterfs.yml b/roles/openshift_hosted/tasks/registry/storage/glusterfs.yml index c504bfb80..c2954fde1 100644 --- a/roles/openshift_hosted/tasks/registry/storage/glusterfs.yml +++ b/roles/openshift_hosted/tasks/registry/storage/glusterfs.yml @@ -35,7 +35,7 @@ mount: state: mounted fstype: glusterfs - src: "{% if 'glusterfs_registry' in groups %}{{ groups.glusterfs_registry[0] }}{% else %}{{ groups.glusterfs[0] }}{% endif %}:/{{ openshift.hosted.registry.storage.glusterfs.path }}" + src: "{% if 'glusterfs_registry' in groups %}{% set node = groups.glusterfs_registry[0] %}{% else %}{% set node = groups.glusterfs[0] %}{% endif %}{% if 'glusterfs_hostname' in hostvars[node] %}{{ hostvars[node].glusterfs_hostname }}{% elif 'openshift' in hostvars[node] %}{{ hostvars[node].openshift.node.nodename }}{% else %}{{ node }}{% endif %}:/{{ openshift.hosted.registry.storage.glusterfs.path }}" name: "{{ mktemp.stdout }}" - name: Set registry volume permissions diff --git a/roles/openshift_hosted/templates/registry_config.j2 b/roles/openshift_hosted/templates/registry_config.j2 index 9673841bf..fc9272679 100644 --- a/roles/openshift_hosted/templates/registry_config.j2 +++ b/roles/openshift_hosted/templates/registry_config.j2 @@ -22,7 +22,7 @@ storage: {% endif %} bucket: {{ openshift_hosted_registry_storage_s3_bucket }} encrypt: {{ openshift_hosted_registry_storage_s3_encrypt | default(false) }} -{% if openshift_hosted_registry_storage_s3_kmskeyid %} +{% if openshift_hosted_registry_storage_s3_kmskeyid is defined %} keyid: {{ openshift_hosted_registry_storage_s3_kmskeyid }} {% endif %} secure: true diff --git a/roles/openshift_node/handlers/main.yml b/roles/openshift_node/handlers/main.yml index a6bd12d4e..6b38da7f8 100644 --- a/roles/openshift_node/handlers/main.yml +++ b/roles/openshift_node/handlers/main.yml @@ -4,9 +4,14 @@ name: openvswitch state: restarted when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool + register: l_openshift_node_stop_openvswitch_result + until: not l_openshift_node_stop_openvswitch_result | failed + retries: 3 + delay: 30 notify: - restart openvswitch pause + - name: restart openvswitch pause pause: seconds=15 when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool @@ -15,7 +20,13 @@ systemd: name: "{{ openshift.common.service_type }}-node" state: restarted - when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool) + register: l_openshift_node_restart_node_result + until: not l_openshift_node_restart_node_result | failed + retries: 3 + delay: 30 + when: + - (not skip_node_svc_handlers | default(False) | bool) + - not (node_service_status_changed | default(false) | bool) - name: reload sysctl.conf command: /sbin/sysctl -p diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index 573051504..879f6c207 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -118,8 +118,12 @@ name: openvswitch.service enabled: yes state: started + daemon_reload: yes when: openshift.common.is_containerized | bool and openshift.common.use_openshift_sdn | bool register: ovs_start_result + until: not ovs_start_result | failed + retries: 3 + delay: 30 - set_fact: ovs_service_status_changed: "{{ ovs_start_result | changed }}" @@ -212,15 +216,27 @@ state: started when: openshift.common.is_containerized | bool + - name: Start and enable node systemd: name: "{{ openshift.common.service_type }}-node" enabled: yes state: started + daemon_reload: yes register: node_start_result until: not node_start_result | failed retries: 1 delay: 30 + ignore_errors: true + +- name: Dump logs from node service if it failed + command: journalctl --no-pager -n 100 {{ openshift.common.service_type }}-node + when: node_start_result | failed + +- name: Abort if node failed to start + fail: + msg: Node failed to start please inspect the logs and try again + when: node_start_result | failed - set_fact: node_service_status_changed: "{{ node_start_result | changed }}" diff --git a/roles/openshift_node_certificates/handlers/main.yml b/roles/openshift_node_certificates/handlers/main.yml index 502f80434..4abe8bcaf 100644 --- a/roles/openshift_node_certificates/handlers/main.yml +++ b/roles/openshift_node_certificates/handlers/main.yml @@ -9,3 +9,7 @@ name: "{{ openshift.docker.service_name }}" state: restarted when: not openshift_certificates_redeploy | default(false) | bool + register: l_docker_restart_docker_in_cert_result + until: not l_docker_restart_docker_in_cert_result | failed + retries: 3 + delay: 30 diff --git a/roles/openshift_node_upgrade/README.md b/roles/openshift_node_upgrade/README.md index 8b388cc6a..4e6229bfb 100644 --- a/roles/openshift_node_upgrade/README.md +++ b/roles/openshift_node_upgrade/README.md @@ -84,6 +84,11 @@ Including an example of how to use your role (for instance, with variables passe command: > {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets delegate_to: "{{ groups.oo_first_master.0 }}" + register: l_docker_upgrade_drain_result + until: not l_docker_upgrade_drain_result | failed + retries: 60 + delay: 60 + roles: - openshift_facts diff --git a/roles/openshift_node_upgrade/handlers/main.yml b/roles/openshift_node_upgrade/handlers/main.yml index cb51416d4..110dfe5ce 100644 --- a/roles/openshift_node_upgrade/handlers/main.yml +++ b/roles/openshift_node_upgrade/handlers/main.yml @@ -1,7 +1,13 @@ --- - name: restart openvswitch - systemd: name=openvswitch state=restarted + systemd: + name: openvswitch + state: restarted when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool + register: l_openshift_node_upgrade_stop_openvswitch_result + until: not l_openshift_node_upgrade_stop_openvswitch_result | failed + retries: 3 + delay: 30 notify: - restart openvswitch pause @@ -10,5 +16,13 @@ when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool - name: restart node - systemd: name={{ openshift.common.service_type }}-node state=restarted - when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool) + systemd: + name: "{{ openshift.common.service_type }}-node" + state: restarted + register: l_openshift_node_upgrade_restart_node_result + until: not l_openshift_node_upgrade_restart_node_result | failed + retries: 3 + delay: 30 + when: + - (not skip_node_svc_handlers | default(False) | bool) + - not (node_service_status_changed | default(false) | bool) diff --git a/roles/openshift_node_upgrade/tasks/docker/upgrade.yml b/roles/openshift_node_upgrade/tasks/docker/upgrade.yml index 416cf605a..ebe87d6fd 100644 --- a/roles/openshift_node_upgrade/tasks/docker/upgrade.yml +++ b/roles/openshift_node_upgrade/tasks/docker/upgrade.yml @@ -26,7 +26,13 @@ - debug: var=docker_image_count.stdout when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool -- service: name=docker state=stopped +- service: + name: docker + state: stopped + register: l_openshift_node_upgrade_docker_stop_result + until: not l_openshift_node_upgrade_docker_stop_result | failed + retries: 3 + delay: 30 - name: Upgrade Docker package: name=docker{{ '-' + docker_version }} state=present diff --git a/roles/openshift_node_upgrade/tasks/restart.yml b/roles/openshift_node_upgrade/tasks/restart.yml index 6947223af..f228b6e08 100644 --- a/roles/openshift_node_upgrade/tasks/restart.yml +++ b/roles/openshift_node_upgrade/tasks/restart.yml @@ -19,7 +19,7 @@ state: started register: docker_start_result until: not docker_start_result | failed - retries: 1 + retries: 3 delay: 30 - name: Update docker facts diff --git a/roles/openshift_service_catalog/tasks/install.yml b/roles/openshift_service_catalog/tasks/install.yml index 4d1a38e61..686857d94 100644 --- a/roles/openshift_service_catalog/tasks/install.yml +++ b/roles/openshift_service_catalog/tasks/install.yml @@ -23,7 +23,7 @@ oc_project: state: present name: "kube-service-catalog" -# node_selector: "{{ openshift_service_catalog_nodeselector | default(null) }}" + node_selector: "" - name: Make kube-service-catalog project network global command: > diff --git a/roles/openshift_service_catalog/tasks/wire_aggregator.yml b/roles/openshift_service_catalog/tasks/wire_aggregator.yml index 55e68dc00..d5291a99a 100644 --- a/roles/openshift_service_catalog/tasks/wire_aggregator.yml +++ b/roles/openshift_service_catalog/tasks/wire_aggregator.yml @@ -147,6 +147,12 @@ value: [/etc/origin/master/openshift-ansible-catalog-console.js] - key: kubernetesMasterConfig.apiServerArguments.runtime-config value: [apis/settings.k8s.io/v1alpha1=true] + - key: admissionConfig.pluginConfig.PodPreset.configuration.kind + value: DefaultAdmissionConfig + - key: admissionConfig.pluginConfig.PodPreset.configuration.apiVersion + value: v1 + - key: admissionConfig.pluginConfig.PodPreset.configuration.disable + value: false register: yedit_output #restart master serially here diff --git a/roles/openshift_storage_glusterfs/README.md b/roles/openshift_storage_glusterfs/README.md index 7f4738f1c..b367e7daf 100644 --- a/roles/openshift_storage_glusterfs/README.md +++ b/roles/openshift_storage_glusterfs/README.md @@ -64,7 +64,7 @@ their configuration as GlusterFS nodes: |--------------------|---------------------------|-----------------------------------------| | glusterfs_cluster | 1 | The ID of the cluster this node should belong to. This is useful when a single heketi service is expected to manage multiple distinct clusters. **NOTE:** For natively-hosted clusters, all pods will be in the same OpenShift namespace | glusterfs_hostname | openshift.node.nodename | A hostname (or IP address) that will be used for internal GlusterFS communication -| glusterfs_ip | openshift.common.ip | An IP address that will be used by pods to communicate with the GlusterFS node +| glusterfs_ip | openshift.common.ip | An IP address that will be used by pods to communicate with the GlusterFS node. **NOTE:** Required for external GlusterFS nodes | glusterfs_zone | 1 | A zone number for the node. Zones are used within the cluster for determining how to distribute the bricks of GlusterFS volumes. heketi will try to spread each volumes' bricks as evenly as possible across all zones Role Variables @@ -76,7 +76,7 @@ GlusterFS cluster into a new or existing OpenShift cluster: | Name | Default value | Description | |--------------------------------------------------|-------------------------|-----------------------------------------| | openshift_storage_glusterfs_timeout | 300 | Seconds to wait for pods to become ready -| openshift_storage_glusterfs_namespace | 'default' | Namespace in which to create GlusterFS resources +| openshift_storage_glusterfs_namespace | 'glusterfs' | Namespace in which to create GlusterFS resources | openshift_storage_glusterfs_is_native | True | GlusterFS should be containerized | openshift_storage_glusterfs_name | 'storage' | A name to identify the GlusterFS cluster, which will be used in resource names | openshift_storage_glusterfs_nodeselector | 'glusterfs=storage-host'| Selector to determine which nodes will host GlusterFS pods in native mode. **NOTE:** The label value is taken from the cluster name @@ -85,6 +85,7 @@ GlusterFS cluster into a new or existing OpenShift cluster: | openshift_storage_glusterfs_version | 'latest' | Container image version to use for GlusterFS pods | openshift_storage_glusterfs_wipe | False | Destroy any existing GlusterFS resources and wipe storage devices. **WARNING: THIS WILL DESTROY ANY DATA ON THOSE DEVICES.** | openshift_storage_glusterfs_heketi_is_native | True | heketi should be containerized +| openshift_storage_glusterfs_heketi_cli | 'heketi-cli' | Command/Path to invoke the heketi-cli tool **NOTE:** Change this only for **non-native heketi** if heketi-cli is not in the global `$PATH` of the machine running openshift-ansible | openshift_storage_glusterfs_heketi_image | 'heketi/heketi' | Container image to use for heketi pods, enterprise default is 'rhgs3/rhgs-volmanager-rhel7' | openshift_storage_glusterfs_heketi_version | 'latest' | Container image version to use for heketi pods | openshift_storage_glusterfs_heketi_admin_key | auto-generated | String to use as secret key for performing heketi commands as admin @@ -108,7 +109,7 @@ are an exception: | Name | Default value | Description | |-------------------------------------------------------|-----------------------|-----------------------------------------| -| openshift_storage_glusterfs_registry_namespace | registry namespace | Default is to use the hosted registry's namespace, otherwise 'default' +| openshift_storage_glusterfs_registry_namespace | registry namespace | Default is to use the hosted registry's namespace, otherwise 'glusterfs' | openshift_storage_glusterfs_registry_name | 'registry' | This allows for the logical separation of the registry GlusterFS cluster from other GlusterFS clusters | openshift_storage_glusterfs_registry_storageclass | False | It is recommended to not create a StorageClass for GlusterFS clusters serving registry storage, so as to avoid performance penalties | openshift_storage_glusterfs_registry_heketi_admin_key | auto-generated | Separate from the above diff --git a/roles/openshift_storage_glusterfs/defaults/main.yml b/roles/openshift_storage_glusterfs/defaults/main.yml index 88e122f55..a846889ca 100644 --- a/roles/openshift_storage_glusterfs/defaults/main.yml +++ b/roles/openshift_storage_glusterfs/defaults/main.yml @@ -1,6 +1,6 @@ --- openshift_storage_glusterfs_timeout: 300 -openshift_storage_glusterfs_namespace: 'default' +openshift_storage_glusterfs_namespace: 'glusterfs' openshift_storage_glusterfs_is_native: True openshift_storage_glusterfs_name: 'storage' openshift_storage_glusterfs_nodeselector: "glusterfs={{ openshift_storage_glusterfs_name }}-host" @@ -8,9 +8,10 @@ openshift_storage_glusterfs_storageclass: True openshift_storage_glusterfs_image: "{{ 'rhgs3/rhgs-server-rhel7' | quote if deployment_type == 'openshift-enterprise' else 'gluster/gluster-centos' | quote }}" openshift_storage_glusterfs_version: 'latest' openshift_storage_glusterfs_wipe: False -openshift_storage_glusterfs_heketi_is_native: True +openshift_storage_glusterfs_heketi_is_native: "{{ openshift_storage_glusterfs_is_native }}" openshift_storage_glusterfs_heketi_is_missing: True openshift_storage_glusterfs_heketi_deploy_is_missing: True +openshift_storage_glusterfs_heketi_cli: 'heketi-cli' openshift_storage_glusterfs_heketi_image: "{{ 'rhgs3/rhgs-volmanager-rhel7' | quote if deployment_type == 'openshift-enterprise' else 'heketi/heketi' | quote }}" openshift_storage_glusterfs_heketi_version: 'latest' openshift_storage_glusterfs_heketi_admin_key: "{{ omit }}" @@ -26,7 +27,7 @@ openshift_storage_glusterfs_heketi_ssh_sudo: False openshift_storage_glusterfs_heketi_ssh_keyfile: '/dev/null' openshift_storage_glusterfs_registry_timeout: "{{ openshift_storage_glusterfs_timeout }}" -openshift_storage_glusterfs_registry_namespace: "{{ openshift.hosted.registry.namespace | default('default') }}" +openshift_storage_glusterfs_registry_namespace: "{{ openshift.hosted.registry.namespace | default(openshift_storage_glusterfs_namespace) }}" openshift_storage_glusterfs_registry_is_native: "{{ openshift_storage_glusterfs_is_native }}" openshift_storage_glusterfs_registry_name: 'registry' openshift_storage_glusterfs_registry_nodeselector: "glusterfs={{ openshift_storage_glusterfs_registry_name }}-host" @@ -34,9 +35,10 @@ openshift_storage_glusterfs_registry_storageclass: False openshift_storage_glusterfs_registry_image: "{{ openshift_storage_glusterfs_image }}" openshift_storage_glusterfs_registry_version: "{{ openshift_storage_glusterfs_version }}" openshift_storage_glusterfs_registry_wipe: "{{ openshift_storage_glusterfs_wipe }}" -openshift_storage_glusterfs_registry_heketi_is_native: "{{ openshift_storage_glusterfs_heketi_is_native }}" +openshift_storage_glusterfs_registry_heketi_is_native: "{{ openshift_storage_glusterfs_registry_is_native }}" openshift_storage_glusterfs_registry_heketi_is_missing: "{{ openshift_storage_glusterfs_heketi_is_missing }}" openshift_storage_glusterfs_registry_heketi_deploy_is_missing: "{{ openshift_storage_glusterfs_heketi_deploy_is_missing }}" +openshift_storage_glusterfs_registry_heketi_cli: "{{ openshift_storage_glusterfs_heketi_cli }}" openshift_storage_glusterfs_registry_heketi_image: "{{ openshift_storage_glusterfs_heketi_image }}" openshift_storage_glusterfs_registry_heketi_version: "{{ openshift_storage_glusterfs_heketi_version }}" openshift_storage_glusterfs_registry_heketi_admin_key: "{{ omit }}" @@ -44,9 +46,9 @@ openshift_storage_glusterfs_registry_heketi_user_key: "{{ omit }}" openshift_storage_glusterfs_registry_heketi_topology_load: "{{ openshift_storage_glusterfs_heketi_topology_load }}" openshift_storage_glusterfs_registry_heketi_wipe: "{{ openshift_storage_glusterfs_heketi_wipe }}" openshift_storage_glusterfs_registry_heketi_url: "{{ openshift_storage_glusterfs_heketi_url | default(omit) }}" -openshift_storage_glusterfs_registry_heketi_port: 8080 -openshift_storage_glusterfs_registry_heketi_executor: 'kubernetes' -openshift_storage_glusterfs_registry_heketi_ssh_port: 22 -openshift_storage_glusterfs_registry_heketi_ssh_user: 'root' -openshift_storage_glusterfs_registry_heketi_ssh_sudo: False -openshift_storage_glusterfs_registry_heketi_ssh_keyfile: '/dev/null' +openshift_storage_glusterfs_registry_heketi_port: "{{ openshift_storage_glusterfs_heketi_port }}" +openshift_storage_glusterfs_registry_heketi_executor: "{{ openshift_storage_glusterfs_heketi_executor }}" +openshift_storage_glusterfs_registry_heketi_ssh_port: "{{ openshift_storage_glusterfs_heketi_ssh_port }}" +openshift_storage_glusterfs_registry_heketi_ssh_user: "{{ openshift_storage_glusterfs_heketi_ssh_user }}" +openshift_storage_glusterfs_registry_heketi_ssh_sudo: "{{ openshift_storage_glusterfs_heketi_ssh_sudo }}" +openshift_storage_glusterfs_registry_heketi_ssh_keyfile: "{{ openshift_storage_glusterfs_heketi_ssh_keyfile }}" diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml index 99ad029da..600d8f676 100644 --- a/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml +++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml @@ -1,4 +1,16 @@ --- +- name: Make sure heketi-client is installed + package: name=heketi-client state=present + when: + - not openshift.common.is_atomic | bool + - not glusterfs_heketi_is_native | bool + +- name: Verify heketi-cli is installed + shell: "command -v {{ glusterfs_heketi_cli }} >/dev/null 2>&1 || { echo >&2 'ERROR: Make sure heketi-cli is available, then re-run the installer'; exit 1; }" + changed_when: False + when: + - not glusterfs_heketi_is_native | bool + - name: Verify target namespace exists oc_project: state: present @@ -173,7 +185,7 @@ - name: Set heketi-cli command set_fact: - glusterfs_heketi_client: "{% if glusterfs_heketi_is_native %}{{ openshift.common.client_binary }} rsh --namespace={{ glusterfs_namespace }} {{ heketi_pod.results.results[0]['items'][0]['metadata']['name'] }} {% endif %}heketi-cli -s http://{% if glusterfs_heketi_is_native %}localhost:8080{% else %}{{ glusterfs_heketi_url }}:{{ glusterfs_heketi_port }}{% endif %} --user admin --secret '{{ glusterfs_heketi_admin_key }}'" + glusterfs_heketi_client: "{% if glusterfs_heketi_is_native %}{{ openshift.common.client_binary }} rsh --namespace={{ glusterfs_namespace }} {{ heketi_pod.results.results[0]['items'][0]['metadata']['name'] }} {% endif %}{{ glusterfs_heketi_cli }} -s http://{% if glusterfs_heketi_is_native %}localhost:8080{% else %}{{ glusterfs_heketi_url }}:{{ glusterfs_heketi_port }}{% endif %} --user admin {% if glusterfs_heketi_admin_key is defined %}--secret '{{ glusterfs_heketi_admin_key }}'{% endif %}" - name: Verify heketi service command: "{{ glusterfs_heketi_client }} cluster list" @@ -203,6 +215,7 @@ data: "{{ glusterfs_heketi_admin_key }}" when: - glusterfs_storageclass + - glusterfs_heketi_admin_key is defined - name: Get heketi route oc_obj: diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_config.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_config.yml index 76611d936..b54a8e36c 100644 --- a/roles/openshift_storage_glusterfs/tasks/glusterfs_config.yml +++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_config.yml @@ -12,6 +12,7 @@ glusterfs_heketi_is_native: "{{ openshift_storage_glusterfs_heketi_is_native }}" glusterfs_heketi_is_missing: "{{ openshift_storage_glusterfs_heketi_is_missing }}" glusterfs_heketi_deploy_is_missing: "{{ openshift_storage_glusterfs_heketi_deploy_is_missing }}" + glusterfs_heketi_cli: "{{ openshift_storage_glusterfs_heketi_cli }}" glusterfs_heketi_image: "{{ openshift_storage_glusterfs_heketi_image }}" glusterfs_heketi_version: "{{ openshift_storage_glusterfs_heketi_version }}" glusterfs_heketi_admin_key: "{{ openshift_storage_glusterfs_heketi_admin_key }}" diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml index 280088fe4..0b4d1c82b 100644 --- a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml +++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml @@ -12,6 +12,7 @@ glusterfs_heketi_is_native: "{{ openshift_storage_glusterfs_registry_heketi_is_native }}" glusterfs_heketi_is_missing: "{{ openshift_storage_glusterfs_registry_heketi_is_missing }}" glusterfs_heketi_deploy_is_missing: "{{ openshift_storage_glusterfs_registry_heketi_deploy_is_missing }}" + glusterfs_heketi_cli: "{{ openshift_storage_glusterfs_registry_heketi_cli }}" glusterfs_heketi_image: "{{ openshift_storage_glusterfs_registry_heketi_image }}" glusterfs_heketi_version: "{{ openshift_storage_glusterfs_registry_heketi_version }}" glusterfs_heketi_admin_key: "{{ openshift_storage_glusterfs_registry_heketi_admin_key }}" diff --git a/roles/openshift_storage_glusterfs/tasks/heketi_deploy_part2.yml b/roles/openshift_storage_glusterfs/tasks/heketi_deploy_part2.yml index 5ef4b5c83..37d3e6ba2 100644 --- a/roles/openshift_storage_glusterfs/tasks/heketi_deploy_part2.yml +++ b/roles/openshift_storage_glusterfs/tasks/heketi_deploy_part2.yml @@ -126,7 +126,7 @@ - name: Set heketi-cli command set_fact: - glusterfs_heketi_client: "{{ openshift.common.client_binary }} rsh --namespace={{ glusterfs_namespace }} {{ heketi_pod.results.results[0]['items'][0]['metadata']['name'] }} heketi-cli -s http://localhost:8080 --user admin --secret '{{ glusterfs_heketi_admin_key }}'" + glusterfs_heketi_client: "{{ openshift.common.client_binary }} rsh --namespace={{ glusterfs_namespace }} {{ heketi_pod.results.results[0]['items'][0]['metadata']['name'] }} {{ glusterfs_heketi_cli }} -s http://localhost:8080 --user admin --secret '{{ glusterfs_heketi_admin_key }}'" - name: Verify heketi service command: "{{ glusterfs_heketi_client }} cluster list" diff --git a/roles/openshift_storage_glusterfs/templates/v3.6/glusterfs-storageclass.yml.j2 b/roles/openshift_storage_glusterfs/templates/v3.6/glusterfs-storageclass.yml.j2 index 2ec9a9e9a..095fb780f 100644 --- a/roles/openshift_storage_glusterfs/templates/v3.6/glusterfs-storageclass.yml.j2 +++ b/roles/openshift_storage_glusterfs/templates/v3.6/glusterfs-storageclass.yml.j2 @@ -7,5 +7,7 @@ provisioner: kubernetes.io/glusterfs parameters: resturl: "http://{% if glusterfs_heketi_is_native %}{{ glusterfs_heketi_route }}{% else %}{{ glusterfs_heketi_url }}:{{ glusterfs_heketi_port }}{% endif %}" restuser: "admin" +{% if glusterfs_heketi_admin_key is defined %} secretNamespace: "{{ glusterfs_namespace }}" secretName: "heketi-{{ glusterfs_name }}-admin-secret" +{%- endif -%} diff --git a/roles/openshift_storage_glusterfs/templates/v3.6/topology.json.j2 b/roles/openshift_storage_glusterfs/templates/v3.6/topology.json.j2 index 3aac68e2f..d6c28f6dd 100644 --- a/roles/openshift_storage_glusterfs/templates/v3.6/topology.json.j2 +++ b/roles/openshift_storage_glusterfs/templates/v3.6/topology.json.j2 @@ -17,10 +17,20 @@ "node": { "hostnames": { "manage": [ - "{{ hostvars[node].glusterfs_hostname | default(hostvars[node].openshift.node.nodename) }}" +{%- if 'glusterfs_hostname' in hostvars[node] -%} + "{{ hostvars[node].glusterfs_hostname }}" +{%- elif 'openshift' in hostvars[node] -%} + "{{ hostvars[node].openshift.node.nodename }}" +{%- else -%} + "{{ node }}" +{%- endif -%} ], "storage": [ - "{{ hostvars[node].glusterfs_ip | default(hostvars[node].openshift.common.ip) }}" +{%- if 'glusterfs_ip' in hostvars[node] -%} + "{{ hostvars[node].glusterfs_ip }}" +{%- else -%} + "{{ hostvars[node].openshift.common.ip }}" +{%- endif -%} ] }, "zone": {{ hostvars[node].glusterfs_zone | default(1) }} |