Merge pull request #5080 from sdodson/drain-timeouts

Automatic merge from submit-queue. Add the ability to specify a timeout for node drain operations. A timeout to wait for nodes to drain pods can be specified to ensure that the upgrade continues even if nodes fail to drain pods in the allowed time. The default value of 0 will wait indefinitely, allowing the admin to investigate the root cause and ensuring that disruption budgets are respected. In practice, the `oc adm drain` command will eventually error out; at least that's what we've seen in our large online clusters. When that happens, a second attempt will be made to drain the nodes; if it fails again, it will abort the upgrade for that node or for the entire cluster, based on your defined `openshift_upgrade_nodes_max_fail_percentage`. `openshift_upgrade_nodes_drain_timeout=0` is the default and will wait until all pods have been drained successfully. `openshift_upgrade_nodes_drain_timeout=600` would wait for 600s before moving on to the tasks that forcefully stop pods, such as stopping docker, node, and openvswitch.
author: OpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com> 2018-01-10 17:58:44 -0800
committer: GitHub <noreply@github.com> 2018-01-10 17:58:44 -0800
commit: 693769209936849a6f83c4ef85bda39dabfb8800 (patch)
tree: b3e17a9559c7dea02accb8e75865aad7eee3f764 /playbooks/common/openshift-cluster/upgrades
parent: e45ef801051202f9d79a0dc814d4a3e056b257d2 (diff)
parent: 0841917f05cfad2701164edbb271167c277d3300 (diff)
4 files changed, 29 insertions, 11 deletions
diff --git a/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml
index ffb11670d..8392e21ee 100644
--- a/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml
+++ b/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml
@@ -51,13 +51,19 @@
 
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ openshift_client_binary }} adm drain {{ openshift.node.nodename }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade
     register: l_docker_upgrade_drain_result
     until: not (l_docker_upgrade_drain_result is failed)
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
+    delay: 5
+    failed_when:
+    - l_docker_upgrade_drain_result is failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
   - include_tasks: tasks/upgrade.yml
     when: l_docker_upgrade is defined and l_docker_upgrade | bool
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
index ecb7c360c..e89f06f17 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
@@ -291,12 +291,18 @@
 
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_control_plane_drain_result
     until: not (l_upgrade_control_plane_drain_result is failed)
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
+    delay: 5
+    failed_when:
+    - l_upgrade_control_plane_drain_result is failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
   roles:
   - openshift_facts
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
index 464af3ae6..850442b3b 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
@@ -33,12 +33,18 @@
 
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_nodes_drain_result
     until: not (l_upgrade_nodes_drain_result is failed)
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
+    delay: 5
+    failed_when:
+    - l_upgrade_nodes_drain_result is failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
   post_tasks:
   - import_role:
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_scale_group.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_scale_group.yml
index 6d59bfd0b..e259b5d09 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_scale_group.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_scale_group.yml
@@ -50,11 +50,11 @@
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_nodes_drain_result
     until: not (l_upgrade_nodes_drain_result is failed)
-    retries: "{{ 1 if openshift_upgrade_nodes_drain_timeout | default(0) == '0' else 0  | int }}"
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
     delay: 5
     failed_when:
     - l_upgrade_nodes_drain_result is failed
-    - openshift_upgrade_nodes_drain_timeout | default(0) == '0'
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
 # Alright, let's clean up!
 - name: clean up the old scale group
author	OpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com>	2018-01-10 17:58:44 -0800
committer	GitHub <noreply@github.com>	2018-01-10 17:58:44 -0800
commit	693769209936849a6f83c4ef85bda39dabfb8800 (patch)
tree	b3e17a9559c7dea02accb8e75865aad7eee3f764 /playbooks/common/openshift-cluster/upgrades
parent	e45ef801051202f9d79a0dc814d4a3e056b257d2 (diff)
parent	0841917f05cfad2701164edbb271167c277d3300 (diff)