From 82f4e4eaeaaf3059013e9ea23d87dcf89fd8455e Mon Sep 17 00:00:00 2001 From: Devan Goodwin Date: Tue, 21 Jun 2016 15:01:01 -0300 Subject: Refactor 3.2 upgrade to avoid killing nodes without evac. We now handle the two pieces of upgrade that require a node evac in the same play. (docker, and node itself) --- .../openshift-cluster/upgrades/docker/upgrade.yml | 52 ++++++++++++++++ .../upgrades/docker/upgrade_check.yml | 37 +++++++++++ .../upgrades/files/nuke_images.sh | 23 +++++++ .../v3_1_to_v3_2/containerized_node_upgrade.yml | 11 ++++ .../v3_1_to_v3_2/containerized_upgrade.yml | 11 ---- .../upgrades/v3_1_to_v3_2/docker_upgrade.yml | 14 ----- .../upgrades/v3_1_to_v3_2/node_upgrade.yml | 24 -------- .../upgrades/v3_1_to_v3_2/pre.yml | 2 +- .../upgrades/v3_1_to_v3_2/upgrade.yml | 72 +++++++++++----------- 9 files changed, 161 insertions(+), 85 deletions(-) create mode 100644 playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml create mode 100644 playbooks/common/openshift-cluster/upgrades/docker/upgrade_check.yml create mode 100644 playbooks/common/openshift-cluster/upgrades/files/nuke_images.sh create mode 100644 playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_node_upgrade.yml delete mode 100644 playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_upgrade.yml delete mode 100644 playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/docker_upgrade.yml delete mode 100644 playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/node_upgrade.yml (limited to 'playbooks/common') diff --git a/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml new file mode 100644 index 000000000..78b123881 --- /dev/null +++ b/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml @@ -0,0 +1,52 @@ +--- +# We need docker service up to remove all the images, but these services will keep +# trying to re-start and thus re-pull the images we're trying to delete. +- name: stop containerized services + service: name={{ item }} state=stopped + with_items: + - "{{ openshift.common.service_type }}-master" + - "{{ openshift.common.service_type }}-master-api" + - "{{ openshift.common.service_type }}-master-controllers" + - "{{ openshift.common.service_type }}-node" + - etcd_container + - openvswitch + failed_when: false + when: docker_upgrade is defined and docker_upgrade | bool and openshift.common.is_containerized | bool + +- name: remove all containers and images + script: nuke_images.sh docker + register: nuke_images_result + when: docker_upgrade is defined and docker_upgrade | bool + +# todo: should we use the docker role to actually do the upgrade? +- name: upgrade to specified docker version + action: "{{ ansible_pkg_mgr }} name=docker{{ '-' + docker_version }} state=present" + register: docker_upgrade_result + when: docker_upgrade is defined and docker_upgrade | bool and docker_version is defined + +- name: upgrade to latest docker version + action: "{{ ansible_pkg_mgr }} name=docker state=latest" + register: docker_upgrade_result + when: docker_upgrade is defined and docker_upgrade | bool and docker_version is not defined + +- name: restart containerized services + service: name={{ item }} state=started + with_items: + - etcd_container + - openvswitch + - "{{ openshift.common.service_type }}-master" + - "{{ openshift.common.service_type }}-master-api" + - "{{ openshift.common.service_type }}-master-controllers" + - "{{ openshift.common.service_type }}-node" + failed_when: false + when: docker_upgrade is defined and docker_upgrade | bool and openshift.common.is_containerized | bool + +- name: wait for master api to come back online + become: no + local_action: + module: wait_for + host="{{ inventory_hostname }}" + state=started + delay=10 + port="{{ openshift.master.api_port }}" + when: docker_upgrade is defined and docker_upgrade | bool and inventory_hostname in groups.oo_masters_to_config diff --git a/playbooks/common/openshift-cluster/upgrades/docker/upgrade_check.yml b/playbooks/common/openshift-cluster/upgrades/docker/upgrade_check.yml new file mode 100644 index 000000000..928913ef3 --- /dev/null +++ b/playbooks/common/openshift-cluster/upgrades/docker/upgrade_check.yml @@ -0,0 +1,37 @@ +--- +- name: Determine available Docker version + script: ../../../../common/openshift-cluster/upgrades/files/rpm_versions.sh docker + register: g_docker_version_result + +- name: Check if Docker is installed + command: rpm -q docker + register: pkg_check + failed_when: pkg_check.rc > 1 + changed_when: no + +- name: Get current version of Docker + command: "{{ repoquery_cmd }} --installed --qf '%{version}' docker" + register: curr_docker_version + changed_when: false + +- name: Get latest available version of Docker + command: > + {{ repoquery_cmd }} --qf '%{version}' "docker" + register: avail_docker_version + failed_when: false + changed_when: false + +- fail: + msg: This playbook requires access to Docker 1.10 or later + # Disable the 1.10 requirement if the user set a specific Docker version + when: avail_docker_version.stdout | version_compare('1.10','<') and docker_version is not defined + +- name: Flag for upgrade if Docker version does not equal latest + set_fact: + docker_upgrade: true + when: docker_version is not defined and pkg_check.rc == 0 and curr_docker_version.stdout | version_compare(avail_docker_version.stdout,'<') + +- name: Flag for upgrade if Docker version does not equal requested version + set_fact: + docker_upgrade: true + when: docker_version is defined and pkg_check.rc == 0 and curr_docker_version.stdout | version_compare(docker_version,'<') diff --git a/playbooks/common/openshift-cluster/upgrades/files/nuke_images.sh b/playbooks/common/openshift-cluster/upgrades/files/nuke_images.sh new file mode 100644 index 000000000..9a5ee2276 --- /dev/null +++ b/playbooks/common/openshift-cluster/upgrades/files/nuke_images.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Stop any running containers +running_container_count=`docker ps -q | wc -l` +if test $running_container_count -gt 0 +then + docker stop $(docker ps -q) +fi + +# Delete all containers +container_count=`docker ps -a -q | wc -l` +if test $container_count -gt 0 +then + docker rm -f -v $(docker ps -a -q) +fi + +# Delete all images (forcefully) +image_count=`docker images -q | wc -l` +if test $image_count -gt 0 +then + # Taken from: https://gist.github.com/brianclements/f72b2de8e307c7b56689#gistcomment-1443144 + docker rmi $(docker images | grep "$2/\|/$2 \| $2 \|$2 \|$2-\|$2_" | awk '{print $1 ":" $2}') 2>/dev/null || echo "No images matching \"$2\" left to purge." +fi diff --git a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_node_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_node_upgrade.yml new file mode 100644 index 000000000..319758a06 --- /dev/null +++ b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_node_upgrade.yml @@ -0,0 +1,11 @@ +- include_vars: ../../../../../roles/openshift_node/vars/main.yml + +- name: Update systemd units + include: ../../../../../roles/openshift_node/tasks/systemd_units.yml openshift_version=v{{ g_new_version }} + +- name: Verifying the correct version was configured + shell: grep {{ verify_upgrade_version }} {{ item }} + with_items: + - /etc/sysconfig/openvswitch + - /etc/sysconfig/{{ openshift.common.service_type }}* + when: verify_upgrade_version is defined diff --git a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_upgrade.yml deleted file mode 100644 index 319758a06..000000000 --- a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_upgrade.yml +++ /dev/null @@ -1,11 +0,0 @@ -- include_vars: ../../../../../roles/openshift_node/vars/main.yml - -- name: Update systemd units - include: ../../../../../roles/openshift_node/tasks/systemd_units.yml openshift_version=v{{ g_new_version }} - -- name: Verifying the correct version was configured - shell: grep {{ verify_upgrade_version }} {{ item }} - with_items: - - /etc/sysconfig/openvswitch - - /etc/sysconfig/{{ openshift.common.service_type }}* - when: verify_upgrade_version is defined diff --git a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/docker_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/docker_upgrade.yml deleted file mode 100644 index c7b18f51b..000000000 --- a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/docker_upgrade.yml +++ /dev/null @@ -1,14 +0,0 @@ -- name: Check if Docker is installed - command: rpm -q docker - register: pkg_check - failed_when: pkg_check.rc > 1 - changed_when: no - -- name: Upgrade Docker - command: "{{ ansible_pkg_mgr}} update -y docker" - when: pkg_check.rc == 0 and g_docker_version.curr_version | version_compare('1.9','<') - register: docker_upgrade - -- name: Restart Docker - command: systemctl restart docker - when: docker_upgrade | changed diff --git a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/node_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/node_upgrade.yml deleted file mode 100644 index a911f12be..000000000 --- a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/node_upgrade.yml +++ /dev/null @@ -1,24 +0,0 @@ -- name: Prepare for Node evacuation - command: > - {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=false - delegate_to: "{{ groups.oo_first_master.0 }}" - -- name: Evacuate Node for Kubelet upgrade - command: > - {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --evacuate --force - delegate_to: "{{ groups.oo_first_master.0 }}" - -- include: rpm_upgrade.yml - vars: - component: "node" - openshift_version: "{{ openshift_pkg_version | default('') }}" - when: not openshift.common.is_containerized | bool - -- include: containerized_upgrade.yml - when: openshift.common.is_containerized | bool - -- name: Set node schedulability - command: > - {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=true - delegate_to: "{{ groups.oo_first_master.0 }}" - when: openshift.node.schedulable | bool diff --git a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/pre.yml b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/pre.yml index ec07f0a60..55ede13f0 100644 --- a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/pre.yml +++ b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/pre.yml @@ -3,7 +3,7 @@ # Evaluate host groups and gather facts ############################################################################### -- include: ../../common/openshift-cluster/initialize_facts.yml +- include: ../../initialize_facts.yml - name: Update repos hosts: oo_masters_to_config:oo_nodes_to_config:oo_etcd_to_config:oo_lb_to_config diff --git a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/upgrade.yml index 66f6f8e71..8eeb652a7 100644 --- a/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/upgrade.yml +++ b/playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/upgrade.yml @@ -3,19 +3,6 @@ # The restart playbook should be run after this playbook completes. ############################################################################### -- name: Upgrade docker - hosts: oo_masters_to_config:oo_nodes_to_config:oo_etcd_to_config - roles: - - openshift_facts - tasks: - - include: docker_upgrade.yml - when: not openshift.common.is_atomic | bool - - name: Set post docker install facts - openshift_facts: - role: "{{ item.role }}" - with_items: - - role: docker - ############################################################################### # Upgrade Masters ############################################################################### @@ -68,36 +55,51 @@ ############################################################################### # Upgrade Nodes ############################################################################### -- name: Upgrade nodes - hosts: oo_nodes_to_config + +# Here we handle all tasks that might require a node evac. (upgrading docker, and the node service) +- name: Perform upgrades that may require node evacuation + hosts: oo_masters_to_config:oo_etcd_to_config:oo_nodes_to_config serial: 1 + any_errors_fatal: true roles: - openshift_facts handlers: - include: ../../../../../roles/openshift_node/handlers/main.yml tasks: - - include: node_upgrade.yml + # TODO: To better handle re-trying failed upgrades, it would be nice to check if the node + # or docker actually needs an upgrade before proceeding. + - name: Mark unschedulable if host is a node + command: > + {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=false + delegate_to: "{{ groups.oo_first_master.0 }}" + when: inventory_hostname in groups.oo_nodes_to_config - - set_fact: - node_update_complete: True + - name: Evacuate Node for Kubelet upgrade + command: > + {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --evacuate --force + delegate_to: "{{ groups.oo_first_master.0 }}" + when: inventory_hostname in groups.oo_nodes_to_config + + - include: ../docker/upgrade_check.yml + + - include: ../docker/upgrade.yml + when: docker_upgrade is defined and docker_upgrade | bool + + - include: rpm_upgrade.yml + vars: + component: "node" + openshift_version: "{{ openshift_pkg_version | default('') }}" + when: inventory_hostname in groups.oo_nodes_to_config and not openshift.common.is_containerized | bool + + - include: containerized_node_upgrade.yml + when: inventory_hostname in groups.oo_nodes_to_config and openshift.common.is_containerized | bool + + - name: Set node schedulability + command: > + {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=true + delegate_to: "{{ groups.oo_first_master.0 }}" + when: inventory_hostname in groups.oo_nodes_to_config and openshift.node.schedulable | bool -############################################################################## -# Gate on nodes update -############################################################################## -- name: Gate on nodes update - hosts: localhost - connection: local - become: no - tasks: - - set_fact: - node_update_completed: "{{ hostvars - | oo_select_keys(groups.oo_nodes_to_config) - | oo_collect('inventory_hostname', {'node_update_complete': true}) }}" - - set_fact: - node_update_failed: "{{ groups.oo_nodes_to_config | difference(node_update_completed) }}" - - fail: - msg: "Upgrade cannot continue. The following nodes did not finish updating: {{ node_update_failed | join(',') }}" - when: node_update_failed | length > 0 ############################################################################### # Reconcile Cluster Roles, Cluster Role Bindings and Security Context Constraints -- cgit v1.2.3