6 files changed, 268 insertions, 0 deletions
diff --git a/playbooks/adhoc/docker_loopback_to_lvm/docker-storage-setup b/playbooks/adhoc/docker_loopback_to_lvm/docker-storage-setup
new file mode 100644
index 000000000..059058823
--- /dev/null
+++ b/playbooks/adhoc/docker_loopback_to_lvm/docker-storage-setup
@@ -0,0 +1,2 @@
+DEVS=/dev/xvdb
+VG=docker_vg
diff --git a/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml b/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml
new file mode 100644
index 000000000..c9ae923bb
--- /dev/null
+++ b/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml
@@ -0,0 +1,151 @@
+---
+# This playbook converts docker from loopback to direct-lvm (the Red Hat recommended way to run docker)
+# in AWS. It adds an additional EBS volume and creates the volume group on that EBS volume for docker to use.
+#
+# To run:
+# 1. Source your AWS credentials (make sure it's the corresponding AWS account) into your environment
+#    export AWS_ACCESS_KEY_ID='XXXXX'
+#    export AWS_SECRET_ACCESS_KEY='XXXXXX'
+#
+# 2. Run the playbook:
+#    ansible-playbook -e 'cli_tag_name=<tag-name>' -e "cli_volume_size=30" docker_loopback_to_direct_lvm.yml
+#
+# Example:
+#    ansible-playbook -e 'cli_tag_name=ops-master-12345' -e "cli_volume_size=30" docker_loopback_to_direct_lvm.yml
+#
+# Notes:
+# * By default this will create a 30GB volume.
+# * iops are calculated as disk size * 30, e.g. (30GB * 30) = 900 iops
+# * This will remove /var/lib/docker!
+# * You may need to re-deploy docker images after this is run (like monitoring)
+#
+
+- name: Fix docker to have a provisioned iops drive
+  hosts: "tag_Name_{{ cli_tag_name }}"
+  user: root
+  connection: ssh
+  gather_facts: no
+
+  vars:
+    cli_volume_type: io1
+    cli_volume_size: 30
+    cli_volume_iops: "{{ 30 * cli_volume_size }}"
+
+  pre_tasks:
+  - fail:
+      msg: "This playbook requires {{ item }} to be set."
+    when: "{{ item }} is not defined or {{ item }} == ''"
+    with_items:
+    - cli_tag_name
+    - cli_volume_size
+
+  - debug:
+      var: hosts
+
+  - name: start docker
+    service:
+      name: docker
+      state: started
+
+  - name: Determine if loopback
+    shell: docker info | grep 'Data file:.*loop'
+    register: loop_device_check
+    ignore_errors: yes
+
+  - debug:
+      var: loop_device_check
+
+  - name: fail if we don't detect loopback
+    fail:
+      msg: loopback not detected! Please investigate manually.
+    when: loop_device_check.rc == 1
+
+  - name: stop zagg client monitoring container
+    service:
+      name: oso-rhel7-zagg-client
+      state: stopped
+    ignore_errors: yes
+
+  - name: stop pcp client monitoring container
+    service:
+      name: oso-f22-host-monitoring
+      state: stopped
+    ignore_errors: yes
+
+  - name: stop docker
+    service:
+      name: docker
+      state: stopped
+
+  - name: delete /var/lib/docker
+    command: rm -rf /var/lib/docker
+
+  - name: remove /var/lib/docker
+    command: rm -rf /var/lib/docker
+
+  - name: check to see if /dev/xvdb exists
+    command: test -e /dev/xvdb
+    register: xvdb_check
+    ignore_errors: yes
+
+  - debug: var=xvdb_check
+
+  - name: fail if /dev/xvdb already exists
+    fail:
+      msg: /dev/xvdb already exists. Please investigate
+    when: xvdb_check.rc == 0
+
+  - name: Create a volume and attach it
+    delegate_to: localhost
+    ec2_vol:
+      state: present
+      instance: "{{ ec2_id }}"
+      region: "{{ ec2_region }}"
+      volume_size: "{{ cli_volume_size | default(30, True) }}"
+      volume_type: "{{ cli_volume_type }}"
+      device_name: /dev/xvdb
+      iops: "{{ 30 * cli_volume_size }}"
+    register: vol
+
+  - debug: var=vol
+
+  - name: tag the vol with a name
+    delegate_to: localhost
+    ec2_tag: region={{ ec2_region }} resource={{ vol.volume_id }}
+    args:
+      tags:
+        Name: "{{ ec2_tag_Name }}"
+        env: "{{ ec2_tag_environment }}"
+    register: voltags
+
+  - name: Wait for volume to attach
+    pause:
+      seconds: 30
+
+  - name: copy the docker-storage-setup config file
+    copy:
+      src: docker-storage-setup
+      dest: /etc/sysconfig/docker-storage-setup
+      owner: root
+      group: root
+      mode: 0664
+
+  - name: docker storage setup
+    command: docker-storage-setup
+    register: setup_output
+
+  - debug: var=setup_output
+
+  - name: start docker
+    command: systemctl start docker.service
+    register: dockerstart
+
+  - debug: var=dockerstart
+
+  - name: Wait for docker to stabilize
+    pause:
+      seconds: 30
+
+  # leaving off the '-t' for docker exec; with it, docker exec doesn't work with ansible and tty support
+  - name: update zabbix docker items
+    command: docker exec -i oso-rhel7-zagg-client /usr/local/bin/cron-send-docker-metrics.py
diff --git a/playbooks/adhoc/docker_storage_cleanup/docker_storage_cleanup.yml b/playbooks/adhoc/docker_storage_cleanup/docker_storage_cleanup.yml
new file mode 100644
index 000000000..1946a5f4f
--- /dev/null
+++ b/playbooks/adhoc/docker_storage_cleanup/docker_storage_cleanup.yml
@@ -0,0 +1,69 @@
+---
+# This playbook attempts to clean up unwanted docker files to help alleviate docker disk space issues.
+#
+# To run:
+#
+# 1. Run the playbook:
+#
+#    ansible-playbook -e 'cli_tag_name=<tag-name>' docker_storage_cleanup.yml
+#
+# Example:
+#
+#    ansible-playbook -e 'cli_tag_name=ops-node-compute-12345' docker_storage_cleanup.yml
+#
+# Notes:
+# * This *should* not interfere with running docker images
+#
+
+- name: Clean up Docker Storage
+  gather_facts: no
+  hosts: "tag_Name_{{ cli_tag_name }}"
+  user: root
+  connection: ssh
+
+  pre_tasks:
+
+  - fail:
+      msg: "This playbook requires {{ item }} to be set."
+    when: "{{ item }} is not defined or {{ item }} == ''"
+    with_items:
+    - cli_tag_name
+
+  - name: Ensure docker is running
+    service:
+      name: docker
+      state: started
+      enabled: yes
+
+  - name: Get docker info
+    command: docker info
+    register: docker_info
+
+  - name: Show docker info
+    debug:
+      var: docker_info.stdout_lines
+
+  - name: Remove exited and dead containers
+    shell: "docker ps -a | awk '/Exited|Dead/ {print $1}' | xargs --no-run-if-empty docker rm"
+    ignore_errors: yes
+
+  - name: Remove dangling docker images
+    shell: "docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi"
+    ignore_errors: yes
+
+  - name: Remove non-running docker images
+    shell: "docker images -aq | xargs --no-run-if-empty docker rmi 2>/dev/null"
+    ignore_errors: yes
+
+  # leaving off the '-t' for docker exec; with it, docker exec doesn't work with ansible and tty support
+  - name: update zabbix docker items
+    command: docker exec -i oso-rhel7-zagg-client /usr/local/bin/cron-send-docker-metrics.py
+
+  # Get and show docker info again.
+  - name: Get docker info
+    command: docker info
+    register: docker_info
+
+  - name: Show docker info
+    debug:
+      var: docker_info.stdout_lines
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index f9672b9c4..8347e9a61 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -11,6 +11,7 @@
 - include_vars: template_os_linux.yml
 - include_vars: template_docker.yml
 - include_vars: template_openshift_master.yml
+- include_vars: template_openshift_node.yml
 
 - name: Include Template Heartbeat
   include: ../../lib_zabbix/tasks/create_template.yml
@@ -43,3 +44,11 @@
     server: "{{ ozb_server }}"
     user: "{{ ozb_user }}"
     password: "{{ ozb_password }}"
+
+- name: Include Template Openshift Node
+  include: ../../lib_zabbix/tasks/create_template.yml
+  vars:
+    template: "{{ g_template_openshift_node }}"
+    server: "{{ ozb_server }}"
+    user: "{{ ozb_user }}"
+    password: "{{ ozb_password }}"
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 728423ac1..c71e07910 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -6,8 +6,25 @@ g_template_openshift_master:
     applications:
     - Openshift Master
     key: create_app
+
+  - key: openshift.master.process.count
+    description: Shows number of master processes running
+    type: int
+    applications:
+    - Openshift Master
+
   ztriggers:
   - name: 'Application creation has failed on {HOST.NAME}'
     expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
     priority: avg
+
+  - name: 'Openshift Master process not running on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    priority: high
+
+  - name: 'Too many Openshift Master processes running on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    priority: high
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
new file mode 100644
index 000000000..36f9cc4a3
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -0,0 +1,20 @@
+---
+g_template_openshift_node:
+  name: Template Openshift Node
+  zitems:
+  - key: openshift.node.process.count
+    description: Shows number of OpenShift Node processes running
+    type: int
+    applications:
+    - Openshift Node
+
+  ztriggers:
+  - name: 'Openshift Node process not running on {HOST.NAME}'
+    expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+    priority: high
+
+  - name: 'Too many Openshift Node processes running on {HOST.NAME}'
+    expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+    priority: high
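
After the conversion playbook finishes, it is worth confirming that docker really came up on direct-lvm rather than silently falling back to loopback. Below is a minimal follow-up check, assuming the same tag-based host pattern and the docker_vg volume group defined in docker-storage-setup; this verification playbook is illustrative and not part of the commit:

    ---
    # verify_direct_lvm.yml -- hypothetical follow-up check, not in this commit
    - name: Verify docker is on direct-lvm
      hosts: "tag_Name_{{ cli_tag_name }}"
      user: root
      connection: ssh
      gather_facts: no

      tasks:
      - name: Check whether docker still reports a loopback data file
        shell: docker info | grep 'Data file:.*loop'
        register: loop_check
        ignore_errors: yes

      - name: Fail if loopback is still in use
        fail:
          msg: docker is still using a loopback data file
        when: loop_check.rc == 0

      - name: Confirm the docker_vg volume group exists
        command: vgs docker_vg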
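
Since docker_storage_cleanup.yml deletes containers and images outright, a dry run that only lists the removal candidates can be a useful first step. A sketch using the same filters as the committed tasks; the playbook itself is hypothetical:

    ---
    # docker_storage_cleanup_dryrun.yml -- hypothetical, lists but does not remove
    - name: List docker cleanup candidates
      hosts: "tag_Name_{{ cli_tag_name }}"
      user: root
      connection: ssh
      gather_facts: no

      tasks:
      - name: Collect exited and dead containers
        shell: "docker ps -a | awk '/Exited|Dead/ {print $1}'"
        register: dead_containers
        ignore_errors: yes

      - debug: var=dead_containers.stdout_lines

      - name: Collect dangling images
        shell: docker images -q -f dangling=true
        register: dangling_images
        ignore_errors: yes

      - debug: var=dangling_images.stdout_lines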
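
The new openshift.master.process.count and openshift.node.process.count items only define where the integer lands in Zabbix; something on each host still has to collect and push the value, and that collector is not part of this commit. One possible shape for the node side, assuming a reachable Zabbix trapper at zagg_server and that node processes match 'openshift start node':

    ---
    # send_node_process_count.yml -- hypothetical collector for the new item
    - name: Report openshift node process count to zabbix
      hosts: "tag_Name_{{ cli_tag_name }}"
      user: root
      connection: ssh
      gather_facts: no

      tasks:
      - name: Count openshift node processes
        shell: pgrep -f 'openshift start node' | wc -l
        register: node_count

      - name: Push the count to the trapper item
        command: >
          zabbix_sender -z {{ zagg_server }} -s {{ inventory_hostname }}
          -k openshift.node.process.count -o {{ node_count.stdout }}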
