Switch to migrating one host and forming a new cluster

With large datasets that contain many keys with TTLs, the expiry of those keys was creating a data inconsistency problem. The hope is that performing the migration once, on a single host, and then forming a new cluster avoids the problem. Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1475351
author: Scott Dodson <sdodson@redhat.com> 2017-08-01 12:55:47 -0400
committer: Scott Dodson <sdodson@redhat.com> 2017-08-22 12:57:57 -0400
commit: 4b5d8d2dc25dbca20be59f3d5d111d737fd865bc (patch)
tree: b2707928f22b8a308ff8b110b2ce466e0a9b3c95 /playbooks/common
parent: b107f677a3563f6db9a1ef17fe10518d321d495c (diff)
2 files changed, 68 insertions, 10 deletions
diff --git a/playbooks/common/openshift-etcd/migrate.yml b/playbooks/common/openshift-etcd/migrate.yml
index 3e7a48669..311ff84b6 100644
--- a/playbooks/common/openshift-etcd/migrate.yml
+++ b/playbooks/common/openshift-etcd/migrate.yml
@@ -17,18 +17,14 @@
   tags:
   - always
 
+# TODO: This will be different for release-3.6 branch
 - name: Prepare masters for etcd data migration
   hosts: oo_masters_to_config
   tasks:
   - set_fact:
       master_services:
-      - "{{ openshift.common.service_type + '-master' }}"
-  - set_fact:
-      master_services:
       - "{{ openshift.common.service_type + '-master-controllers' }}"
       - "{{ openshift.common.service_type + '-master-api' }}"
-    when:
-    - (openshift_master_cluster_method is defined and openshift_master_cluster_method == "native") or openshift.common.is_master_system_container | bool
   - debug:
       msg: "master service name: {{ master_services }}"
   - name: Stop masters
@@ -67,16 +63,59 @@
     when:
     - etcd_backup_failed | length > 0
 
-- name: Migrate etcd data from v2 to v3
+- name: Stop etcd
   hosts: oo_etcd_to_migrate
   gather_facts: no
   tags:
   - always
+  pre_tasks:
+  - set_fact:
+      l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}"
+  - name: Disable etcd members
+    service:
+      name: "{{ l_etcd_service }}"
+      state: stopped
+
+- name: Migrate data on first etcd
+  hosts: oo_etcd_to_migrate[0]
+  gather_facts: no
+  tags:
+  - always
   roles:
   - role: etcd_migrate
     r_etcd_migrate_action: migrate
     r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"
     etcd_peer: "{{ ansible_default_ipv4.address }}"
+    etcd_url_scheme: "https"
+    etcd_peer_url_scheme: "https"
+
+- name: Clean data stores on remaining etcd hosts
+  hosts: oo_etcd_to_migrate[1:]
+  gather_facts: no
+  tags:
+  - always
+  roles:
+  - role: etcd_migrate
+    r_etcd_migrate_action: clean_data
+    r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"
+    etcd_peer: "{{ ansible_default_ipv4.address }}"
+    etcd_url_scheme: "https"
+    etcd_peer_url_scheme: "https"
+  post_tasks:
+  - name: Add etcd hosts
+    delegate_to: localhost
+    add_host:
+      name: "{{ item }}"
+      groups: oo_new_etcd_to_config
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_become: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_etcd_to_migrate[1:] | default([]) }}"
+    changed_when: no
+  - name: Set success
+    set_fact:
+      r_etcd_migrate_success: true
+
+- include: ./scaleup.yml
 
 - name: Gate on etcd migration
   hosts: oo_masters_to_config
@@ -89,6 +128,16 @@
   - set_fact:
       etcd_migration_failed: "{{ groups.oo_etcd_to_migrate | difference(etcd_migration_completed) }}"
 
+- name: Add TTLs on the first master
+  hosts: oo_first_master[0]
+  roles:
+  - role: etcd_migrate
+    r_etcd_migrate_action: add_ttls
+    etcd_peer: "{{ hostvars[groups.oo_etcd_to_migrate.0].ansible_default_ipv4.address }}"
+    etcd_url_scheme: "https"
+    etcd_peer_url_scheme: "https"
+    when: etcd_migration_failed | length == 0
+
 - name: Configure masters if etcd data migration is succesfull
   hosts: oo_masters_to_config
   roles:
@@ -100,10 +149,6 @@
       msg: "Skipping master re-configuration since migration failed."
     when:
     - etcd_migration_failed | length > 0
-
-- name: Start masters after etcd data migration
-  hosts: oo_masters_to_config
-  tasks:
   - name: Start master services
     service:
       name: "{{ item }}"
diff --git a/playbooks/common/openshift-etcd/scaleup.yml b/playbooks/common/openshift-etcd/scaleup.yml
index 192305bc8..52b90daca 100644
--- a/playbooks/common/openshift-etcd/scaleup.yml
+++ b/playbooks/common/openshift-etcd/scaleup.yml
@@ -24,6 +24,9 @@
                        member add {{ etcd_hostname }} {{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }}
     delegate_to: "{{ etcd_ca_host }}"
     register: etcd_add_check
+    retries: 3
+    delay: 10
+    until: etcd_add_check.rc == 0
   roles:
   - role: openshift_etcd
     when: etcd_add_check.rc == 0
@@ -36,3 +39,13 @@
     r_etcd_common_etcd_runtime: "{{ openshift.common.etcd_runtime }}"
   - role: nickhammond.logrotate
     when: etcd_add_check.rc == 0
+  post_tasks:
+  - name: Verify cluster is stable
+    command: >
+      /usr/bin/etcdctl --cert-file {{ etcd_peer_cert_file }}
+                       --key-file {{ etcd_peer_key_file }}
+                       --ca-file {{ etcd_peer_ca_file }}
+                       -C {{ etcd_peer_url_scheme }}://{{ hostvars[etcd_ca_host].etcd_hostname }}:{{ etcd_client_port }}
+                       cluster-health
+    retries: 1
+    delay: 30
author	Scott Dodson <sdodson@redhat.com>	2017-08-01 12:55:47 -0400
committer	Scott Dodson <sdodson@redhat.com>	2017-08-22 12:57:57 -0400
commit	4b5d8d2dc25dbca20be59f3d5d111d737fd865bc (patch)
tree	b2707928f22b8a308ff8b110b2ce466e0a9b3c95 /playbooks/common
parent	b107f677a3563f6db9a1ef17fe10518d321d495c (diff)