From d0fbf1598a9ee6e52a4fae4c5922c580df06a78d Mon Sep 17 00:00:00 2001 From: Andrew Butcher Date: Thu, 5 Nov 2015 18:09:00 -0500 Subject: Start to handle pacemaker ha during upgrade --- playbooks/adhoc/upgrades/upgrade.yml | 54 +++++++++++++++++++++++++++++++++-- roles/openshift_master/tasks/main.yml | 5 ++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/playbooks/adhoc/upgrades/upgrade.yml b/playbooks/adhoc/upgrades/upgrade.yml index e63add4d1..e807d3fa8 100644 --- a/playbooks/adhoc/upgrades/upgrade.yml +++ b/playbooks/adhoc/upgrades/upgrade.yml @@ -6,12 +6,17 @@ - name: Verify upgrade can proceed hosts: masters[0] + vars: + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" gather_facts: no tasks: # Checking the global deployment type rather than host facts, this is about # what the user is requesting. - fail: msg="Deployment type enterprise not supported for upgrade" when: deployment_type == "enterprise" + # Pacemaker is currently the only supported upgrade path for multiple masters + - fail: msg="openshift_master_cluster_method must be set to 'pacemaker'" + when: openshift_master_ha | bool and ((openshift_master_cluster_method is not defined) or (openshift_master_cluster_method is defined and openshift_master_cluster_method != "pacemaker")) - name: Evaluate etcd_hosts hosts: localhost @@ -182,8 +187,6 @@ command: > tar -czvf {{ master_generated_certs_dir }}/{{ item.master_cert_subdir }}.tgz -C {{ master_generated_certs_dir }}/{{ item.master_cert_subdir }} . - args: - creates: "{{ master_generated_certs_dir }}/{{ item.master_cert_subdir }}.tgz" with_items: masters_needing_certs - name: Retrieve the master cert tarball from the master @@ -195,11 +198,11 @@ validate_checksum: yes with_items: masters_needing_certs - - name: Sync certs and restart masters post configuration change hosts: masters vars: sync_tmpdir: "{{ hostvars.localhost.g_master_mktemp.stdout }}" + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" tasks: - name: Unarchive the tarball on the master unarchive: @@ -209,7 +212,41 @@ - name: Restart master services service: name="{{ openshift.common.service_type}}-master" state=restarted + when: not openshift_master_ha | bool +- name: Destroy cluster + hosts: masters[0] + vars: + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" + openshift_deployment_type: "{{ deployment_type }}" + pre_tasks: + - name: Check for configured cluster + stat: + path: /etc/corosync/corosync.conf + register: corosync_conf + when: openshift_master_ha | bool + - name: Destroy cluster + command: pcs cluster destroy --all + when: openshift_master_ha | bool and corosync_conf.stat.exists == true + +- name: Start pcsd on masters + hosts: masters + vars: + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" + tasks: + - name: Start pcsd + service: name=pcsd state=started + when: openshift_master_ha | bool + +- name: Re-create cluster + hosts: masters[0] + vars: + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" + openshift_deployment_type: "{{ deployment_type }}" + omc_cluster_hosts: "{{ groups.masters | join(' ') }}" + roles: + - role: openshift_master_cluster + when: openshift_master_ha | bool - name: Delete temporary directory on localhost hosts: localhost @@ -255,10 +292,21 @@ - name: Restart masters post reconcile hosts: masters + vars: + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" tasks: - name: Restart master services service: name="{{ openshift.common.service_type}}-master" state=restarted + when: not openshift_master_ha | bool +- name: Restart cluster post reconcile + hosts: masters[0] + vars: + openshift_master_ha: "{{ groups['masters'] | length > 1 }}" + tasks: + - name: Restart master cluster + command: pcs resource restart master + when: openshift_master_ha | bool - name: Upgrade default router and registry hosts: masters[0] diff --git a/roles/openshift_master/tasks/main.yml b/roles/openshift_master/tasks/main.yml index be77fce4a..35570923c 100644 --- a/roles/openshift_master/tasks/main.yml +++ b/roles/openshift_master/tasks/main.yml @@ -140,22 +140,27 @@ src: atomic-openshift-master-api.service.j2 dest: /usr/lib/systemd/system/{{ openshift.common.service_type }}-master-api.service force: no + when: openshift_master_ha | bool and openshift_master_cluster_method == "native" - name: Create the controllers service file template: src: atomic-openshift-master-controllers.service.j2 dest: /usr/lib/systemd/system/{{ openshift.common.service_type }}-master-controllers.service force: no + when: openshift_master_ha | bool and openshift_master_cluster_method == "native" - name: Create the api env file template: src: atomic-openshift-master-api.j2 dest: /etc/sysconfig/{{ openshift.common.service_type }}-master-api force: no + when: openshift_master_ha | bool and openshift_master_cluster_method == "native" - name: Create the controllers env file template: src: atomic-openshift-master-controllers.j2 dest: /etc/sysconfig/{{ openshift.common.service_type }}-master-controllers force: no + when: openshift_master_ha | bool and openshift_master_cluster_method == "native" - command: systemctl daemon-reload + when: openshift_master_ha | bool and openshift_master_cluster_method == "native" # end workaround for missing systemd unit files - name: Create session secrets file -- cgit v1.2.3 From 66791fd954731c7d4286d48683b7fe40288d5000 Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Fri, 6 Nov 2015 09:49:17 -0400 Subject: Add pre-upgrade script to be run on first master. Script currently just checks for port names that are no longer valid. In theory other checks may be added to this script in the future. Script was originally written by Steve Milner and Andy Goldstein. If the script fails, ansible seems to handle this nicely by default, exiting the upgrade and displaying stderr and stdout, both of which contain useful info on what the problem was. --- playbooks/adhoc/upgrades/files/pre-upgrade-check | 185 +++++++++++++++++++++++ playbooks/adhoc/upgrades/upgrade.yml | 7 + 2 files changed, 192 insertions(+) create mode 100644 playbooks/adhoc/upgrades/files/pre-upgrade-check diff --git a/playbooks/adhoc/upgrades/files/pre-upgrade-check b/playbooks/adhoc/upgrades/files/pre-upgrade-check new file mode 100644 index 000000000..c8ecae399 --- /dev/null +++ b/playbooks/adhoc/upgrades/files/pre-upgrade-check @@ -0,0 +1,185 @@ +#!/usr/bin/env python +""" +Pre-upgrade checks that must be run on a master before proceeding with upgrade. +""" +# This is a script not a python module: +# pylint: disable=invalid-name + +# NOTE: This script should not require any python libs other than what is +# in the standard library. + +__license__ = "ASL 2.0" + +import json +import os +import subprocess +import re + +# The maximum length of container.ports.name +ALLOWED_LENGTH = 15 +# The valid structure of container.ports.name +ALLOWED_CHARS = re.compile('^[a-z0-9][a-z0-9\\-]*[a-z0-9]$') +AT_LEAST_ONE_LETTER = re.compile('[a-z]') +# look at OS_PATH for the full path. Default ot 'oc' +OC_PATH = os.getenv('OC_PATH', 'oc') + + +def validate(value): + """ + validate verifies that value matches required conventions + + Rules of container.ports.name validation: + + * must be less that 16 chars + * at least one letter + * only a-z0-9- + * hyphens can not be leading or trailing or next to each other + + :Parameters: + - `value`: Value to validate + """ + if len(value) > ALLOWED_LENGTH: + return False + + if '--' in value: + return False + + # We search since it can be anywhere + if not AT_LEAST_ONE_LETTER.search(value): + return False + + # We match because it must start at the beginning + if not ALLOWED_CHARS.match(value): + return False + return True + + +def list_items(kind): + """ + list_items returns a list of items from the api + + :Parameters: + - `kind`: Kind of item to access + """ + response = subprocess.check_output([OC_PATH, 'get', '--all-namespaces', '-o', 'json', kind]) + items = json.loads(response) + return items.get("items", []) + + +def get(obj, *paths): + """ + Gets an object + + :Parameters: + - `obj`: A dictionary structure + - `path`: All other non-keyword arguments + """ + ret_obj = obj + for path in paths: + if ret_obj.get(path, None) is None: + return [] + ret_obj = ret_obj[path] + return ret_obj + + +# pylint: disable=too-many-arguments +def pretty_print_errors(namespace, kind, item_name, container_name, port_name, valid): + """ + Prints out results in human friendly way. + + :Parameters: + - `namespace`: Namespace of the resource + - `kind`: Kind of the resource + - `item_name`: Name of the resource + - `container_name`: Name of the container. May be "" when kind=Service. + - `port_name`: Name of the port + - `valid`: True if the port is valid + """ + if not valid: + if len(container_name) > 0: + print('%s/%s -n %s (Container="%s" Port="%s")' % ( + kind, item_name, namespace, container_name, port_name)) + else: + print('%s/%s -n %s (Port="%s")' % ( + kind, item_name, namespace, port_name)) + + +def print_validation_header(): + """ + Prints the error header. Should run on the first error to avoid + overwhelming the user. + """ + print """\ +At least one port name does not validate. Valid port names: + + * must be less that 16 chars + * have at least one letter + * only a-z0-9- + * do not start or end with - + * Dashes may not be next to eachother ('--') +""" + + +def main(): + """ + main is the main entry point to this script + """ + try: + # the comma at the end suppresses the newline + print "Checking for oc ...", + subprocess.check_output([OC_PATH, 'whoami']) + print "found" + except: + print( + 'Can not find oc (%s). Override the path with the ' + 'OC_PATH environment variable. Exiting...' % OC_PATH) + raise SystemExit(1) + + # Where the magic happens + first_error = True + for kind, path in [ + ('replicationcontrollers', ("spec", "template", "spec", "containers")), + ('pods', ("spec", "containers")), + ('deploymentconfigs', ("spec", "template", "spec", "containers"))]: + for item in list_items(kind): + namespace = item["metadata"]["namespace"] + item_name = item["metadata"]["name"] + for container in get(item, *path): + container_name = container["name"] + for port in get(container, "ports"): + port_name = port.get("name", None) + if not port_name: + # Unnamed ports are OK + continue + valid = validate(port_name) + if not valid and first_error: + first_error = False + print_validation_header() + pretty_print_errors( + namespace, kind, item_name, + container_name, port_name, valid) + + # Services follow a different flow + for item in list_items('services'): + namespace = item["metadata"]["namespace"] + item_name = item["metadata"]["name"] + for port in get(item, "spec", "ports"): + port_name = port.get("targetPort", None) + if isinstance(port_name, int) or port_name is None: + # Integer only or unnamed ports are OK + continue + valid = validate(port_name) + if not valid and first_error: + first_error = False + print_validation_header() + pretty_print_errors( + namespace, "services", item_name, "", port_name, valid) + + # If we had at least 1 error then exit with 1 + if not first_error: + raise SystemExit(1) + + +if __name__ == '__main__': + main() + diff --git a/playbooks/adhoc/upgrades/upgrade.yml b/playbooks/adhoc/upgrades/upgrade.yml index e63add4d1..4ca9b94ac 100644 --- a/playbooks/adhoc/upgrades/upgrade.yml +++ b/playbooks/adhoc/upgrades/upgrade.yml @@ -13,6 +13,13 @@ - fail: msg="Deployment type enterprise not supported for upgrade" when: deployment_type == "enterprise" +- name: Run pre-upgrade checks on first master + hosts: masters[0] + tasks: + # If this script errors out ansible will show the default stdout/stderr + # which contains details for the user: + - script: files/pre-upgrade-check + - name: Evaluate etcd_hosts hosts: localhost tasks: -- cgit v1.2.3 From 98b69946496d0b214c5bd0d384e1cea0856c4cbb Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Fri, 6 Nov 2015 11:43:25 -0400 Subject: Fix pylint errors with getting hosts to run on. --- utils/src/ooinstall/cli_installer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/src/ooinstall/cli_installer.py b/utils/src/ooinstall/cli_installer.py index 9f0861b77..e63f14816 100644 --- a/utils/src/ooinstall/cli_installer.py +++ b/utils/src/ooinstall/cli_installer.py @@ -323,6 +323,8 @@ def get_installed_hosts(hosts, callback_facts): installed_hosts.append(host) return installed_hosts +# pylint: disable=too-many-branches +# This pylint error will be corrected shortly in separate PR. def get_hosts_to_run_on(oo_cfg, callback_facts, unattended, force, verbose): # Copy the list of existing hosts so we can remove any already installed nodes. @@ -383,7 +385,7 @@ def get_hosts_to_run_on(oo_cfg, callback_facts, unattended, force, verbose): openshift_ansible.set_config(oo_cfg) click.echo('Gathering information from hosts...') - callback_facts, error = openshift_ansible.default_facts(oo_cfg.hosts) + callback_facts, error = openshift_ansible.default_facts(oo_cfg.hosts, verbose) if error: click.echo("There was a problem fetching the required information. " \ "See {} for details.".format(oo_cfg.settings['ansible_log_path'])) -- cgit v1.2.3