From 1a868e61fbab8f1e2095c0952031656c47926220 Mon Sep 17 00:00:00 2001
From: Scott Dodson <sdodson@redhat.com>
Date: Fri, 19 May 2017 16:01:03 -0400
Subject: Tolerate failures in the node upgrade playbook

---
 inventory/byo/hosts.origin.example                 | 25 ++++++++++++++++++++++
 inventory/byo/hosts.ose.example                    | 25 ++++++++++++++++++++++
 .../upgrades/upgrade_control_plane.yml             |  2 +-
 .../openshift-cluster/upgrades/upgrade_nodes.yml   |  2 +-
 4 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/inventory/byo/hosts.origin.example b/inventory/byo/hosts.origin.example
index 20f342023..310b8ab44 100644
--- a/inventory/byo/hosts.origin.example
+++ b/inventory/byo/hosts.origin.example
@@ -788,6 +788,31 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
 #
 #etcd_ca_default_days=1825
 
+# Upgrade Control
+#
+# By default nodes are upgraded in a serial manner one at a time and all failures
+# are fatal
+#openshift_upgrade_nodes_serial=1
+#openshift_upgrade_nodes_max_fail_percentage=0
+#
+# You can specify the number of nodes to upgrade at once. We do not currently
+# attempt to verify that you have capacity to drain this many nodes at once
+# so please be careful when specifying these values. You should also verify that
+# the expected number of nodes are all schedulable and ready before starting an
+# upgrade. If it's not possible to drain the requested nodes the upgrade will
+# stall indefinitely until the drain is successful.
+#
+# If you're upgrading more than one node at a time you can specify the maximum
+# percentage of failure within the batch before the upgrade is aborted. Any
+# nodes that do fail are ignored for the rest of the playbook run and you should
+# take care to investigate the failure and return the node to service so that
+# your cluster regains its full capacity.
+#
+# The failure percentage must exceed this value, so this aborts on two failures:
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=49
+# whereas this would not:
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=50
+
 # host group for masters
 [masters]
 ose3-master[1:3]-ansible.test.example.com
diff --git a/inventory/byo/hosts.ose.example b/inventory/byo/hosts.ose.example
index f75a47bb8..e126bbcab 100644
--- a/inventory/byo/hosts.ose.example
+++ b/inventory/byo/hosts.ose.example
@@ -785,6 +785,31 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
 #
 #etcd_ca_default_days=1825
 
+# Upgrade Control
+#
+# By default nodes are upgraded in a serial manner one at a time and all failures
+# are fatal
+#openshift_upgrade_nodes_serial=1
+#openshift_upgrade_nodes_max_fail_percentage=0
+#
+# You can specify the number of nodes to upgrade at once. We do not currently
+# attempt to verify that you have capacity to drain this many nodes at once
+# so please be careful when specifying these values. You should also verify that
+# the expected number of nodes are all schedulable and ready before starting an
+# upgrade. If it's not possible to drain the requested nodes the upgrade will
+# stall indefinitely until the drain is successful.
+#
+# If you're upgrading more than one node at a time you can specify the maximum
+# percentage of failure within the batch before the upgrade is aborted. Any
+# nodes that do fail are ignored for the rest of the playbook run and you should
+# take care to investigate the failure and return the node to service so that
+# your cluster regains its full capacity.
+#
+# The failure percentage must exceed this value, so this aborts on two failures:
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=49
+# whereas this would not:
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=50
+
 # host group for masters
 [masters]
 ose3-master[1:3]-ansible.test.example.com
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
index 0ad934d2d..a335c1072 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
@@ -248,7 +248,7 @@
   # This var must be set with -e on invocation, as it is not a per-host inventory var
   # and is evaluated early. Values such as "20%" can also be used.
   serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
-  any_errors_fatal: true
+  max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
 
   pre_tasks:
   - name: Load lib_openshift modules
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
index 4d455fe0a..91dbc2cd4 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
@@ -4,7 +4,7 @@
   # This var must be set with -e on invocation, as it is not a per-host inventory var
   # and is evaluated early. Values such as "20%" can also be used.
   serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
-  any_errors_fatal: true
+  max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
 
   pre_tasks:
   - name: Load lib_openshift modules
-- 
cgit v1.2.3


From 2d8e3d2b28ce19569c76c56102e9639a6f26b0c2 Mon Sep 17 00:00:00 2001
From: Scott Dodson <sdodson@redhat.com>
Date: Mon, 22 May 2017 14:51:50 -0400
Subject: Add separate variables for control plane nodes

---
 inventory/byo/hosts.origin.example                                  | 6 +++++-
 inventory/byo/hosts.ose.example                                     | 6 +++++-
 .../common/openshift-cluster/upgrades/upgrade_control_plane.yml     | 4 ++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/inventory/byo/hosts.origin.example b/inventory/byo/hosts.origin.example
index 310b8ab44..023930954 100644
--- a/inventory/byo/hosts.origin.example
+++ b/inventory/byo/hosts.origin.example
@@ -791,9 +791,13 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
 # Upgrade Control
 #
 # By default nodes are upgraded in a serial manner one at a time and all failures
-# are fatal
+# are fatal. There is one set of variables for normal nodes and one set for
+# nodes that are part of the control plane, as the number of hosts may differ
+# between those two groups.
 #openshift_upgrade_nodes_serial=1
 #openshift_upgrade_nodes_max_fail_percentage=0
+#openshift_upgrade_control_plane_nodes_serial=1
+#openshift_upgrade_control_plane_nodes_max_fail_percentage=0
 #
 # You can specify the number of nodes to upgrade at once. We do not currently
 # attempt to verify that you have capacity to drain this many nodes at once
diff --git a/inventory/byo/hosts.ose.example b/inventory/byo/hosts.ose.example
index e126bbcab..407ef19fb 100644
--- a/inventory/byo/hosts.ose.example
+++ b/inventory/byo/hosts.ose.example
@@ -788,9 +788,13 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
 # Upgrade Control
 #
 # By default nodes are upgraded in a serial manner one at a time and all failures
-# are fatal
+# are fatal. There is one set of variables for normal nodes and one set for
+# nodes that are part of the control plane, as the number of hosts may differ
+# between those two groups.
 #openshift_upgrade_nodes_serial=1
 #openshift_upgrade_nodes_max_fail_percentage=0
+#openshift_upgrade_control_plane_nodes_serial=1
+#openshift_upgrade_control_plane_nodes_max_fail_percentage=0
 #
 # You can specify the number of nodes to upgrade at once. We do not currently
 # attempt to verify that you have capacity to drain this many nodes at once
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
index a335c1072..275596582 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
@@ -247,8 +247,8 @@
   hosts: oo_masters_to_config:&oo_nodes_to_upgrade
   # This var must be set with -e on invocation, as it is not a per-host inventory var
   # and is evaluated early. Values such as "20%" can also be used.
-  serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
-  max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
+  serial: "{{ openshift_upgrade_control_plane_nodes_serial | default(1) }}"
+  max_fail_percentage: "{{ openshift_upgrade_control_plane_nodes_max_fail_percentage | default(0) }}"
 
   pre_tasks:
   - name: Load lib_openshift modules
-- 
cgit v1.2.3