From 4b5d8d2dc25dbca20be59f3d5d111d737fd865bc Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Tue, 1 Aug 2017 12:55:47 -0400 Subject: Switch to migrating one host and forming a new cluster With large datasets where there are many keys with TTLs the expiry was creating a data inconsistency problem. The hope is that by performing the migration once and then forming a new cluster this is avoided. Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1475351 --- playbooks/common/openshift-etcd/migrate.yml | 65 ++++++++++++++++++++++++----- playbooks/common/openshift-etcd/scaleup.yml | 13 ++++++ roles/etcd_common/defaults/main.yml | 10 +++++ roles/etcd_migrate/tasks/add_ttls.yml | 33 +++++++++++++++ roles/etcd_migrate/tasks/check.yml | 3 -- roles/etcd_migrate/tasks/clean_data.yml | 5 +++ roles/etcd_migrate/tasks/main.yml | 4 +- roles/etcd_migrate/tasks/migrate.yml | 49 +++++++--------------- 8 files changed, 134 insertions(+), 48 deletions(-) create mode 100644 roles/etcd_migrate/tasks/add_ttls.yml create mode 100644 roles/etcd_migrate/tasks/clean_data.yml diff --git a/playbooks/common/openshift-etcd/migrate.yml b/playbooks/common/openshift-etcd/migrate.yml index 3e7a48669..311ff84b6 100644 --- a/playbooks/common/openshift-etcd/migrate.yml +++ b/playbooks/common/openshift-etcd/migrate.yml @@ -17,18 +17,14 @@ tags: - always +# TODO: This will be different for release-3.6 branch - name: Prepare masters for etcd data migration hosts: oo_masters_to_config tasks: - - set_fact: - master_services: - - "{{ openshift.common.service_type + '-master' }}" - set_fact: master_services: - "{{ openshift.common.service_type + '-master-controllers' }}" - "{{ openshift.common.service_type + '-master-api' }}" - when: - - (openshift_master_cluster_method is defined and openshift_master_cluster_method == "native") or openshift.common.is_master_system_container | bool - debug: msg: "master service name: {{ master_services }}" - name: Stop masters @@ -67,16 +63,59 @@ when: - etcd_backup_failed | length > 0 -- name: Migrate etcd data from v2 to v3 +- name: Stop etcd hosts: oo_etcd_to_migrate gather_facts: no tags: - always + pre_tasks: + - set_fact: + l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}" + - name: Disable etcd members + service: + name: "{{ l_etcd_service }}" + state: stopped + +- name: Migrate data on first etcd + hosts: oo_etcd_to_migrate[0] + gather_facts: no + tags: + - always roles: - role: etcd_migrate r_etcd_migrate_action: migrate r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}" etcd_peer: "{{ ansible_default_ipv4.address }}" + etcd_url_scheme: "https" + etcd_peer_url_scheme: "https" + +- name: Clean data stores on remaining etcd hosts + hosts: oo_etcd_to_migrate[1:] + gather_facts: no + tags: + - always + roles: + - role: etcd_migrate + r_etcd_migrate_action: clean_data + r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}" + etcd_peer: "{{ ansible_default_ipv4.address }}" + etcd_url_scheme: "https" + etcd_peer_url_scheme: "https" + post_tasks: + - name: Add etcd hosts + delegate_to: localhost + add_host: + name: "{{ item }}" + groups: oo_new_etcd_to_config + ansible_ssh_user: "{{ g_ssh_user | default(omit) }}" + ansible_become: "{{ g_sudo | default(omit) }}" + with_items: "{{ groups.oo_etcd_to_migrate[1:] | default([]) }}" + changed_when: no + - name: Set success + set_fact: + r_etcd_migrate_success: true + +- include: ./scaleup.yml - name: Gate on etcd migration hosts: oo_masters_to_config @@ -89,6 +128,16 @@ - set_fact: etcd_migration_failed: "{{ groups.oo_etcd_to_migrate | difference(etcd_migration_completed) }}" +- name: Add TTLs on the first master + hosts: oo_first_master[0] + roles: + - role: etcd_migrate + r_etcd_migrate_action: add_ttls + etcd_peer: "{{ hostvars[groups.oo_etcd_to_migrate.0].ansible_default_ipv4.address }}" + etcd_url_scheme: "https" + etcd_peer_url_scheme: "https" + when: etcd_migration_failed | length == 0 + - name: Configure masters if etcd data migration is succesfull hosts: oo_masters_to_config roles: @@ -100,10 +149,6 @@ msg: "Skipping master re-configuration since migration failed." when: - etcd_migration_failed | length > 0 - -- name: Start masters after etcd data migration - hosts: oo_masters_to_config - tasks: - name: Start master services service: name: "{{ item }}" diff --git a/playbooks/common/openshift-etcd/scaleup.yml b/playbooks/common/openshift-etcd/scaleup.yml index 192305bc8..52b90daca 100644 --- a/playbooks/common/openshift-etcd/scaleup.yml +++ b/playbooks/common/openshift-etcd/scaleup.yml @@ -24,6 +24,9 @@ member add {{ etcd_hostname }} {{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }} delegate_to: "{{ etcd_ca_host }}" register: etcd_add_check + retries: 3 + delay: 10 + until: etcd_add_check.rc == 0 roles: - role: openshift_etcd when: etcd_add_check.rc == 0 @@ -36,3 +39,13 @@ r_etcd_common_etcd_runtime: "{{ openshift.common.etcd_runtime }}" - role: nickhammond.logrotate when: etcd_add_check.rc == 0 + post_tasks: + - name: Verify cluster is stable + command: > + /usr/bin/etcdctl --cert-file {{ etcd_peer_cert_file }} + --key-file {{ etcd_peer_key_file }} + --ca-file {{ etcd_peer_ca_file }} + -C {{ etcd_peer_url_scheme }}://{{ hostvars[etcd_ca_host].etcd_hostname }}:{{ etcd_client_port }} + cluster-health + retries: 1 + delay: 30 diff --git a/roles/etcd_common/defaults/main.yml b/roles/etcd_common/defaults/main.yml index b1bfa4592..89993f7ea 100644 --- a/roles/etcd_common/defaults/main.yml +++ b/roles/etcd_common/defaults/main.yml @@ -63,3 +63,13 @@ etcd_client_port: 2379 etcd_peer_port: 2380 etcd_url_scheme: http etcd_peer_url_scheme: http + +etcd_initial_cluster_state: new +etcd_initial_cluster_token: etcd-cluster-1 + +etcd_initial_advertise_peer_urls: "{{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }}" +etcd_listen_peer_urls: "{{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }}" +etcd_advertise_client_urls: "{{ etcd_url_scheme }}://{{ etcd_ip }}:{{ etcd_client_port }}" +etcd_listen_client_urls: "{{ etcd_url_scheme }}://{{ etcd_ip }}:{{ etcd_client_port }}" + +etcd_systemd_dir: "/etc/systemd/system/{{ etcd_service }}.service.d" diff --git a/roles/etcd_migrate/tasks/add_ttls.yml b/roles/etcd_migrate/tasks/add_ttls.yml new file mode 100644 index 000000000..c10465af9 --- /dev/null +++ b/roles/etcd_migrate/tasks/add_ttls.yml @@ -0,0 +1,33 @@ +--- +# To be executed on first master +- slurp: + src: "{{ openshift.common.config_base }}/master/master-config.yaml" + register: g_master_config_output + +- set_fact: + accessTokenMaxAgeSeconds: "{{ (g_master_config_output.content|b64decode|from_yaml).oauthConfig.tokenConfig.accessTokenMaxAgeSeconds | default(86400) }}" + authroizeTokenMaxAgeSeconds: "{{ (g_master_config_output.content|b64decode|from_yaml).oauthConfig.tokenConfig.authroizeTokenMaxAgeSeconds | default(500) }}" + controllerLeaseTTL: "{{ (g_master_config_output.content|b64decode|from_yaml).controllerLeaseTTL | default(30) }}" +- name: Re-introduce leases (as a replacement for key TTLs) + command: > + oadm migrate etcd-ttl \ + --cert {{ r_etcd_common_master_peer_cert_file }} \ + --key {{ r_etcd_common_master_peer_key_file }} \ + --cacert {{ r_etcd_common_master_peer_ca_file }} \ + --etcd-address 'https://{{ etcd_peer }}:{{ etcd_client_port }}' \ + --ttl-keys-prefix {{ item.keys }} \ + --lease-duration {{ item.ttl }} + environment: + ETCDCTL_API: 3 + PATH: "/usr/local/bin:/var/usrlocal/bin:{{ ansible_env.PATH }}" + with_items: + - keys: "/kubernetes.io/events" + ttl: "1h" + - keys: "/kubernetes.io/masterleases" + ttl: "10s" + - keys: "/openshift.io/oauth/accesstokens" + ttl: "{{ accessTokenMaxAgeSeconds }}s" + - keys: "/openshift.io/oauth/authorizetokens" + ttl: "{{ authroizeTokenMaxAgeSeconds }}s" + - keys: "/openshift.io/leases/controllers" + ttl: "{{ controllerLeaseTTL }}s" diff --git a/roles/etcd_migrate/tasks/check.yml b/roles/etcd_migrate/tasks/check.yml index b66696b55..0804d9e1c 100644 --- a/roles/etcd_migrate/tasks/check.yml +++ b/roles/etcd_migrate/tasks/check.yml @@ -1,7 +1,4 @@ --- -- fail: - msg: "Currently etcd v3 migration is unsupported while we test it more thoroughly" - when: not openshift_enable_unsupported_configurations | default(false) | bool # Check the cluster is healthy - include: check_cluster_health.yml diff --git a/roles/etcd_migrate/tasks/clean_data.yml b/roles/etcd_migrate/tasks/clean_data.yml new file mode 100644 index 000000000..95a0e7c0a --- /dev/null +++ b/roles/etcd_migrate/tasks/clean_data.yml @@ -0,0 +1,5 @@ +--- +- name: Remove member data + file: + path: /var/lib/etcd/member + state: absent diff --git a/roles/etcd_migrate/tasks/main.yml b/roles/etcd_migrate/tasks/main.yml index 409b0b613..e82f6a6b4 100644 --- a/roles/etcd_migrate/tasks/main.yml +++ b/roles/etcd_migrate/tasks/main.yml @@ -1,8 +1,8 @@ --- - name: Fail if invalid r_etcd_migrate_action provided fail: - msg: "etcd_migrate role can only be called with 'check' or 'migrate' or 'configure'" - when: r_etcd_migrate_action not in ['check', 'migrate', 'configure'] + msg: "etcd_migrate role can only be called with 'check', 'migrate', 'configure', 'add_ttls', or 'clean_data'" + when: r_etcd_migrate_action not in ['check', 'migrate', 'configure', 'add_ttls', 'clean_data'] - name: Include main action task file include: "{{ r_etcd_migrate_action }}.yml" diff --git a/roles/etcd_migrate/tasks/migrate.yml b/roles/etcd_migrate/tasks/migrate.yml index b2cf6d20a..173de77f4 100644 --- a/roles/etcd_migrate/tasks/migrate.yml +++ b/roles/etcd_migrate/tasks/migrate.yml @@ -3,62 +3,45 @@ - set_fact: l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}" -- name: Disable etcd members - service: - name: "{{ l_etcd_service }}" - state: stopped - -# Should we skip all TTL keys? https://bugzilla.redhat.com/show_bug.cgi?id=1389773 - name: Migrate etcd data command: > etcdctl migrate --data-dir={{ etcd_data_dir }} environment: ETCDCTL_API: 3 register: l_etcdctl_migrate - # TODO(jchaloup): If any of the members fails, we need to restore all members to v2 from the pre-migrate backup - name: Check the etcd v2 data are correctly migrated fail: msg: "Failed to migrate a member" when: "'finished transforming keys' not in l_etcdctl_migrate.stdout and 'no v2 keys to migrate' not in l_etcdctl_migrate.stdout" - - name: Migration message debug: msg: "Etcd migration finished with: {{ l_etcdctl_migrate.stdout }}" - -- name: Enable etcd member - service: +- name: Set ETCD_FORCE_NEW_CLUSTER=true on first etcd host + lineinfile: + line: "ETCD_FORCE_NEW_CLUSTER=true" + dest: /etc/etcd/etcd.conf +- name: Start etcd + systemd: name: "{{ l_etcd_service }}" state: started +- name: Unset ETCD_FORCE_NEW_CLUSTER=true on first etcd host + lineinfile: + line: "ETCD_FORCE_NEW_CLUSTER=true" + dest: /etc/etcd/etcd.conf + state: absent +- name: Restart first etcd host + systemd: + name: "{{ l_etcd_service }}" + state: restarted -- name: Wait for cluster to become healthy after migration +- name: Wait for cluster to become healthy after bringing up first member command: > etcdctl --cert-file {{ etcd_peer_cert_file }} --key-file {{ etcd_peer_key_file }} --ca-file {{ etcd_peer_ca_file }} --endpoint https://{{ etcd_peer }}:{{ etcd_client_port }} cluster-health register: l_etcd_migrate_health until: l_etcd_migrate_health.rc == 0 retries: 3 delay: 30 - run_once: true - -# NOTE: /usr/local/bin may be removed from the PATH by ansible hence why -# it's added to the environment in this task. -- name: Re-introduce leases (as a replacement for key TTLs) - command: > - oadm migrate etcd-ttl \ - --cert {{ r_etcd_common_master_peer_cert_file }} \ - --key {{ r_etcd_common_master_peer_key_file }} \ - --cacert {{ r_etcd_common_master_peer_ca_file }} \ - --etcd-address 'https://{{ etcd_peer }}:{{ etcd_client_port }}' \ - --ttl-keys-prefix {{ item }} \ - --lease-duration 1h - environment: - ETCDCTL_API: 3 - PATH: "/usr/local/bin:/var/usrlocal/bin:{{ ansible_env.PATH }}" - with_items: - - "/kubernetes.io/events" - - "/kubernetes.io/masterleases" - delegate_to: "{{ groups.oo_first_master[0] }}" - run_once: true - set_fact: r_etcd_migrate_success: true -- cgit v1.2.3