diff options
42 files changed, 942 insertions, 147 deletions
diff --git a/README_GCE.md b/README_GCE.md index f6c5138c1..50f8ade70 100644 --- a/README_GCE.md +++ b/README_GCE.md @@ -39,6 +39,13 @@ Create a gce.ini file for GCE * gce_service_account_pem_file_path - Full path from previous steps * gce_project_id - Found in "Projects", it list all the gce projects you are associated with. The page lists their "Project Name" and "Project ID". You want the "Project ID" +Mandatory customization variables (check the values according to your tenant): +* zone = europe-west1-d +* network = default +* gce_machine_type = n1-standard-2 +* gce_machine_image = preinstalled-slave-50g-v5 + + 1. vi ~/.gce/gce.ini 1. make the contents look like this: ``` @@ -46,11 +53,15 @@ Create a gce.ini file for GCE gce_service_account_email_address = long...@developer.gserviceaccount.com gce_service_account_pem_file_path = /full/path/to/project_id-gce_key_hash.pem gce_project_id = project_id +zone = europe-west1-d +network = default +gce_machine_type = n1-standard-2 +gce_machine_image = preinstalled-slave-50g-v5 + ``` -1. Setup a sym link so that gce.py will pick it up (link must be in same dir as gce.py) +1. Define the environment variable GCE_INI_PATH so gce.py can pick it up and bin/cluster can also read it ``` - cd openshift-ansible/inventory/gce - ln -s ~/.gce/gce.ini gce.ini +export GCE_INI_PATH=~/.gce/gce.ini ``` diff --git a/README_libvirt.md b/README_libvirt.md index 1a710ff3b..18ec66f2a 100644 --- a/README_libvirt.md +++ b/README_libvirt.md @@ -75,7 +75,7 @@ In order to fix that issue, you have several possibilities: * accessible by the qemu user. * Grant the qemu user access to the storage pool. -On Arch: +On Arch or Fedora 22+: ``` setfacl -m g:kvm:--x ~ @@ -94,7 +94,8 @@ dns=dnsmasq - Configure dnsmasq to use the Virtual Network router for example.com: ```sh -sudo vi /etc/NetworkManager/dnsmasq.d/libvirt_dnsmasq.conf server=/example.com/192.168.55.1 +sudo vi /etc/NetworkManager/dnsmasq.d/libvirt_dnsmasq.conf +server=/example.com/192.168.55.1 ``` Test The Setup diff --git a/bin/cluster b/bin/cluster index 582327415..59a6755d3 100755 --- a/bin/cluster +++ b/bin/cluster @@ -5,6 +5,7 @@ import argparse import ConfigParser import os import sys +import subprocess import traceback @@ -53,7 +54,6 @@ class Cluster(object): """ Create an OpenShift cluster for given provider :param args: command line arguments provided by user - :return: exit status from run command """ env = {'cluster_id': args.cluster_id, 'deployment_type': self.get_deployment_type(args)} @@ -65,65 +65,60 @@ class Cluster(object): env['num_infra'] = args.infra env['num_etcd'] = args.etcd - return self.action(args, inventory, env, playbook) + self.action(args, inventory, env, playbook) def terminate(self, args): """ Destroy OpenShift cluster :param args: command line arguments provided by user - :return: exit status from run command """ env = {'cluster_id': args.cluster_id, 'deployment_type': self.get_deployment_type(args)} playbook = "playbooks/{}/openshift-cluster/terminate.yml".format(args.provider) inventory = self.setup_provider(args.provider) - return self.action(args, inventory, env, playbook) + self.action(args, inventory, env, playbook) def list(self, args): """ List VMs in cluster :param args: command line arguments provided by user - :return: exit status from run command """ env = {'cluster_id': args.cluster_id, 'deployment_type': self.get_deployment_type(args)} playbook = "playbooks/{}/openshift-cluster/list.yml".format(args.provider) inventory = self.setup_provider(args.provider) - return self.action(args, inventory, env, playbook) + self.action(args, inventory, env, playbook) def config(self, args): """ Configure or reconfigure OpenShift across clustered VMs :param args: command line arguments provided by user - :return: exit status from run command """ env = {'cluster_id': args.cluster_id, 'deployment_type': self.get_deployment_type(args)} playbook = "playbooks/{}/openshift-cluster/config.yml".format(args.provider) inventory = self.setup_provider(args.provider) - return self.action(args, inventory, env, playbook) + self.action(args, inventory, env, playbook) def update(self, args): """ Update to latest OpenShift across clustered VMs :param args: command line arguments provided by user - :return: exit status from run command """ env = {'cluster_id': args.cluster_id, 'deployment_type': self.get_deployment_type(args)} playbook = "playbooks/{}/openshift-cluster/update.yml".format(args.provider) inventory = self.setup_provider(args.provider) - return self.action(args, inventory, env, playbook) + self.action(args, inventory, env, playbook) def service(self, args): """ Make the same service call across all nodes in the cluster :param args: command line arguments provided by user - :return: exit status from run command """ env = {'cluster_id': args.cluster_id, 'deployment_type': self.get_deployment_type(args), @@ -132,7 +127,7 @@ class Cluster(object): playbook = "playbooks/{}/openshift-cluster/service.yml".format(args.provider) inventory = self.setup_provider(args.provider) - return self.action(args, inventory, env, playbook) + self.action(args, inventory, env, playbook) def setup_provider(self, provider): """ @@ -142,10 +137,14 @@ class Cluster(object): """ config = ConfigParser.ConfigParser() if 'gce' == provider: - config.readfp(open('inventory/gce/hosts/gce.ini')) + gce_ini_default_path = os.path.join( + 'inventory/gce/hosts/gce.ini') + gce_ini_path = os.environ.get('GCE_INI_PATH', gce_ini_default_path) + if os.path.exists(gce_ini_path): + config.readfp(open(gce_ini_path)) - for key in config.options('gce'): - os.environ[key] = config.get('gce', key) + for key in config.options('gce'): + os.environ[key] = config.get('gce', key) inventory = '-i inventory/gce/hosts' elif 'aws' == provider: @@ -183,7 +182,6 @@ class Cluster(object): :param inventory: derived provider library :param env: environment variables for kubernetes :param playbook: ansible playbook to execute - :return: exit status from ansible-playbook command """ verbose = '' @@ -213,7 +211,18 @@ class Cluster(object): sys.stderr.write('RUN [{}]\n'.format(command)) sys.stderr.flush() - return os.system(command) + try: + subprocess.check_call(command, shell=True) + except subprocess.CalledProcessError as exc: + raise ActionFailed("ACTION [{}] failed: {}" + .format(args.action, exc)) + + +class ActionFailed(Exception): + """ + Raised when action failed. + """ + pass if __name__ == '__main__': @@ -328,14 +337,11 @@ if __name__ == '__main__': sys.stderr.write('\nACTION [update] aborted by user!\n') exit(1) - status = 1 try: - status = args.func(args) - if status != 0: - sys.stderr.write("ACTION [{}] failed with exit status {}\n".format(args.action, status)) - except Exception, e: + args.func(args) + except Exception as exc: if args.verbose: traceback.print_exc(file=sys.stderr) else: - sys.stderr.write("{}\n".format(e)) - exit(status) + print >>sys.stderr, exc + exit(1) diff --git a/inventory/byo/hosts.example b/inventory/byo/hosts.example index df1bae49f..f554cc660 100644 --- a/inventory/byo/hosts.example +++ b/inventory/byo/hosts.example @@ -44,6 +44,12 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true', # Configure Fluentd #use_fluentd=true +# Enable cockpit +#osm_use_cockpit=true +# +# Set cockpit plugins +#osm_cockpit_plugins=['cockpit-kubernetes'] + # master cluster ha variables using pacemaker or RHEL HA #openshift_master_cluster_password=openshift_cluster #openshift_master_cluster_vip=192.168.133.25 diff --git a/inventory/gce/hosts/gce.py b/inventory/gce/hosts/gce.py index 3403f735e..6ed12e011 100755 --- a/inventory/gce/hosts/gce.py +++ b/inventory/gce/hosts/gce.py @@ -120,6 +120,7 @@ class GceInventory(object): os.path.dirname(os.path.realpath(__file__)), "gce.ini") gce_ini_path = os.environ.get('GCE_INI_PATH', gce_ini_default_path) + # Create a ConfigParser. # This provides empty defaults to each key, so that environment # variable configuration (as opposed to INI configuration) is able @@ -173,6 +174,7 @@ class GceInventory(object): args[1] = os.environ.get('GCE_PEM_FILE_PATH', args[1]) kwargs['project'] = os.environ.get('GCE_PROJECT', kwargs['project']) + # Retrieve and return the GCE driver. gce = get_driver(Provider.GCE)(*args, **kwargs) gce.connection.user_agent_append( @@ -211,7 +213,8 @@ class GceInventory(object): 'gce_image': inst.image, 'gce_machine_type': inst.size, 'gce_private_ip': inst.private_ips[0], - 'gce_public_ip': inst.public_ips[0], + # Hosts don't always have a public IP name + #'gce_public_ip': inst.public_ips[0], 'gce_name': inst.name, 'gce_description': inst.extra['description'], 'gce_status': inst.extra['status'], @@ -219,8 +222,8 @@ class GceInventory(object): 'gce_tags': inst.extra['tags'], 'gce_metadata': md, 'gce_network': net, - # Hosts don't have a public name, so we add an IP - 'ansible_ssh_host': inst.public_ips[0] + # Hosts don't always have a public IP name + #'ansible_ssh_host': inst.public_ips[0] } def get_instance(self, instance_name): diff --git a/inventory/openstack/hosts/nova.py b/inventory/openstack/hosts/nova.py index d5bd8d1ee..3197a57bc 100755 --- a/inventory/openstack/hosts/nova.py +++ b/inventory/openstack/hosts/nova.py @@ -34,7 +34,7 @@ except ImportError: # executed with no parameters, return the list of # all groups and hosts -NOVA_CONFIG_FILES = [os.getcwd() + "/nova.ini", +NOVA_CONFIG_FILES = [os.path.join(os.path.dirname(os.path.realpath(__file__)), "nova.ini"), os.path.expanduser(os.environ.get('ANSIBLE_CONFIG', "~/nova.ini")), "/etc/ansible/nova.ini"] diff --git a/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml b/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml index c9ae923bb..b6a2d2f26 100644 --- a/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml +++ b/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml @@ -27,9 +27,8 @@ gather_facts: no vars: - cli_volume_type: io1 + cli_volume_type: gp2 cli_volume_size: 30 - cli_volume_iops: "{{ 30 * cli_volume_size }}" pre_tasks: - fail: @@ -104,7 +103,6 @@ volume_size: "{{ cli_volume_size | default(30, True)}}" volume_type: "{{ cli_volume_type }}" device_name: /dev/xvdb - iops: "{{ 30 * cli_volume_size }}" register: vol - debug: var=vol @@ -142,10 +140,3 @@ - debug: var=dockerstart - - name: Wait for docker to stabilize - pause: - seconds: 30 - - # leaving off the '-t' for docker exec. With it, it doesn't work with ansible and tty support - - name: update zabbix docker items - command: docker exec -i oso-rhel7-zagg-client /usr/local/bin/cron-send-docker-metrics.py diff --git a/playbooks/adhoc/docker_loopback_to_lvm/ops-docker-loopback-to-direct-lvm.yml b/playbooks/adhoc/docker_loopback_to_lvm/ops-docker-loopback-to-direct-lvm.yml new file mode 100755 index 000000000..614b2537a --- /dev/null +++ b/playbooks/adhoc/docker_loopback_to_lvm/ops-docker-loopback-to-direct-lvm.yml @@ -0,0 +1,104 @@ +#!/usr/bin/ansible-playbook +--- +# This playbook coverts docker to go from loopback to direct-lvm (the Red Hat recommended way to run docker). +# +# It requires the block device to be already provisioned and attached to the host. This is a generic playbook, +# meant to be used for manual conversion. For AWS specific conversions, use the other playbook in this directory. +# +# To run: +# ./ops-docker-loopback-to-direct-lvm.yml -e cli_host=<host to run on> -e cli_docker_device=<path to device> +# +# Example: +# ./ops-docker-loopback-to-direct-lvm.yml -e cli_host=twiesttest-master-fd32 -e cli_docker_device=/dev/sdb +# +# Notes: +# * This will remove /var/lib/docker! +# * You may need to re-deploy docker images after this is run (like monitoring) + +- name: Fix docker to have a provisioned iops drive + hosts: "{{ cli_name }}" + user: root + connection: ssh + gather_facts: no + + pre_tasks: + - fail: + msg: "This playbook requires {{item}} to be set." + when: "{{ item }} is not defined or {{ item }} == ''" + with_items: + - cli_docker_device + + - name: start docker + service: + name: docker + state: started + + - name: Determine if loopback + shell: docker info | grep 'Data file:.*loop' + register: loop_device_check + ignore_errors: yes + + - debug: + var: loop_device_check + + - name: fail if we don't detect loopback + fail: + msg: loopback not detected! Please investigate manually. + when: loop_device_check.rc == 1 + + - name: stop zagg client monitoring container + service: + name: oso-rhel7-zagg-client + state: stopped + ignore_errors: yes + + - name: stop pcp client monitoring container + service: + name: oso-f22-host-monitoring + state: stopped + ignore_errors: yes + + - name: "check to see if {{ cli_docker_device }} exists" + command: "test -e {{ cli_docker_device }}" + register: docker_dev_check + ignore_errors: yes + + - debug: var=docker_dev_check + + - name: "fail if {{ cli_docker_device }} doesn't exist" + fail: + msg: "{{ cli_docker_device }} doesn't exist. Please investigate" + when: docker_dev_check.rc != 0 + + - name: stop docker + service: + name: docker + state: stopped + + - name: delete /var/lib/docker + command: rm -rf /var/lib/docker + + - name: remove /var/lib/docker + command: rm -rf /var/lib/docker + + - name: copy the docker-storage-setup config file + copy: + content: > + DEVS={{ cli_docker_device }} + VG=docker_vg + dest: /etc/sysconfig/docker-storage-setup + owner: root + group: root + mode: 0664 + + - name: docker storage setup + command: docker-storage-setup + register: setup_output + + - debug: var=setup_output + + - name: start docker + command: systemctl start docker.service + register: dockerstart + + - debug: var=dockerstart diff --git a/playbooks/adhoc/grow_docker_vg/grow_docker_vg.yml b/playbooks/adhoc/grow_docker_vg/grow_docker_vg.yml index ef9b45abd..63d473146 100644 --- a/playbooks/adhoc/grow_docker_vg/grow_docker_vg.yml +++ b/playbooks/adhoc/grow_docker_vg/grow_docker_vg.yml @@ -172,7 +172,7 @@ - name: pvmove onto new volume command: "pvmove {{ docker_pv_name.stdout }} /dev/xvdc1" - async: 3600 + async: 43200 poll: 10 - name: Remove the old docker drive from the volume group diff --git a/playbooks/adhoc/s3_registry/s3_registry.j2 b/playbooks/adhoc/s3_registry/s3_registry.j2 index 026b24456..acfa89515 100644 --- a/playbooks/adhoc/s3_registry/s3_registry.j2 +++ b/playbooks/adhoc/s3_registry/s3_registry.j2 @@ -7,8 +7,8 @@ storage: cache: layerinfo: inmemory s3: - accesskey: {{ accesskey }} - secretkey: {{ secretkey }} + accesskey: {{ aws_access_key }} + secretkey: {{ aws_secret_key }} region: us-east-1 bucket: {{ clusterid }}-docker encrypt: true diff --git a/playbooks/adhoc/s3_registry/s3_registry.yml b/playbooks/adhoc/s3_registry/s3_registry.yml index 30b873db3..5dc1abf17 100644 --- a/playbooks/adhoc/s3_registry/s3_registry.yml +++ b/playbooks/adhoc/s3_registry/s3_registry.yml @@ -1,7 +1,7 @@ --- # This playbook creates an S3 bucket named after your cluster and configures the docker-registry service to use the bucket as its backend storage. # Usage: -# ansible-playbook s3_registry.yml -e accesskey="S3 aws access key" -e secretkey="S3 aws secret key" -e clusterid="mycluster" +# ansible-playbook s3_registry.yml -e clusterid="mycluster" # # The AWS access/secret keys should be the keys of a separate user (not your main user), containing only the necessary S3 access role. # The 'clusterid' is the short name of your cluster. @@ -10,11 +10,22 @@ remote_user: root gather_facts: False + vars: + aws_access_key: "{{ lookup('env', 'AWS_ACCESS_KEY_ID') }}" + aws_secret_key: "{{ lookup('env', 'AWS_SECRET_ACCESS_KEY') }}" tasks: + - name: Check for AWS creds + fail: + msg: "Couldn't find {{ item }} creds in ENV" + when: "{{ item }} == ''" + with_items: + - aws_access_key + - aws_secret_key + - name: Create S3 bucket local_action: - module: s3 bucket="{{ clusterid }}-docker" mode=create aws_access_key={{ accesskey|quote }} aws_secret_key={{ secretkey|quote }} + module: s3 bucket="{{ clusterid }}-docker" mode=create - name: Generate docker registry config template: src="s3_registry.j2" dest="/root/config.yml" owner=root mode=0600 diff --git a/playbooks/common/openshift-master/config.yml b/playbooks/common/openshift-master/config.yml index 64cf7a65b..14ec82e85 100644 --- a/playbooks/common/openshift-master/config.yml +++ b/playbooks/common/openshift-master/config.yml @@ -221,6 +221,15 @@ - role: openshift_cluster_metrics when: openshift.common.use_cluster_metrics | bool +- name: Enable cockpit + hosts: oo_first_master + vars: + cockpit_plugins: "{{ osm_cockpit_plugins | default(['cockpit-kubernetes']) }}" + roles: + - role: cockpit + when: ( deployment_type in ['atomic-enterprise','openshift-enterprise'] ) and + (osm_use_cockpit | bool or osm_use_cockpit is undefined ) + # Additional instance config for online deployments - name: Additional instance config hosts: oo_masters_deployment_type_online diff --git a/playbooks/gce/openshift-cluster/config.yml b/playbooks/gce/openshift-cluster/config.yml index fd5dfcc72..6ca4f7395 100644 --- a/playbooks/gce/openshift-cluster/config.yml +++ b/playbooks/gce/openshift-cluster/config.yml @@ -10,6 +10,8 @@ - set_fact: g_ssh_user_tmp: "{{ deployment_vars[deployment_type].ssh_user }}" g_sudo_tmp: "{{ deployment_vars[deployment_type].sudo }}" + use_sdn: "{{ do_we_use_openshift_sdn }}" + sdn_plugin: "{{ sdn_network_plugin }}" - include: ../../common/openshift-cluster/config.yml vars: @@ -18,7 +20,10 @@ g_nodes_group: "{{ 'tag_env-host-type-' ~ cluster_id ~ '-openshift-node' }}" g_ssh_user: "{{ hostvars.localhost.g_ssh_user_tmp }}" g_sudo: "{{ hostvars.localhost.g_sudo_tmp }}" + g_nodeonmaster: true openshift_cluster_id: "{{ cluster_id }}" openshift_debug_level: 2 openshift_deployment_type: "{{ deployment_type }}" openshift_hostname: "{{ gce_private_ip }}" + openshift_use_openshift_sdn: "{{ hostvars.localhost.use_sdn }}" + os_sdn_network_plugin_name: "{{ hostvars.localhost.sdn_plugin }}" diff --git a/playbooks/gce/openshift-cluster/join_node.yml b/playbooks/gce/openshift-cluster/join_node.yml new file mode 100644 index 000000000..0dfa3e9d7 --- /dev/null +++ b/playbooks/gce/openshift-cluster/join_node.yml @@ -0,0 +1,49 @@ +--- +- name: Populate oo_hosts_to_update group + hosts: localhost + gather_facts: no + vars_files: + - vars.yml + tasks: + - name: Evaluate oo_hosts_to_update + add_host: + name: "{{ node_ip }}" + groups: oo_hosts_to_update + ansible_ssh_user: "{{ deployment_vars[deployment_type].ssh_user }}" + ansible_sudo: "{{ deployment_vars[deployment_type].sudo }}" + +- include: ../../common/openshift-cluster/update_repos_and_packages.yml + +- name: Populate oo_masters_to_config host group + hosts: localhost + gather_facts: no + vars_files: + - vars.yml + tasks: + - name: Evaluate oo_nodes_to_config + add_host: + name: "{{ node_ip }}" + ansible_ssh_user: "{{ deployment_vars[deployment_type].ssh_user }}" + ansible_sudo: "{{ deployment_vars[deployment_type].sudo }}" + groups: oo_nodes_to_config + + - name: Evaluate oo_first_master + add_host: + name: "{{ groups['tag_env-host-type-' ~ cluster_id ~ '-openshift-master'][0] }}" + ansible_ssh_user: "{{ deployment_vars[deployment_type].ssh_user }}" + ansible_sudo: "{{ deployment_vars[deployment_type].sudo }}" + groups: oo_first_master + when: "'tag_env-host-type-{{ cluster_id }}-openshift-master' in groups" + +#- include: config.yml +- include: ../../common/openshift-node/config.yml + vars: + openshift_cluster_id: "{{ cluster_id }}" + openshift_debug_level: 4 + openshift_deployment_type: "{{ deployment_type }}" + openshift_hostname: "{{ ansible_default_ipv4.address }}" + openshift_use_openshift_sdn: true + openshift_node_labels: "{{ lookup('oo_option', 'openshift_node_labels') }} " + os_sdn_network_plugin_name: "redhat/openshift-ovs-subnet" + osn_cluster_dns_domain: "{{ hostvars[groups.oo_first_master.0].openshift.dns.domain }}" + osn_cluster_dns_ip: "{{ hostvars[groups.oo_first_master.0].openshift.dns.ip }}" diff --git a/playbooks/gce/openshift-cluster/launch.yml b/playbooks/gce/openshift-cluster/launch.yml index 7a3b80da0..c22b897d5 100644 --- a/playbooks/gce/openshift-cluster/launch.yml +++ b/playbooks/gce/openshift-cluster/launch.yml @@ -34,27 +34,28 @@ count: "{{ num_infra }}" - include: tasks/launch_instances.yml vars: - instances: "{{ infra_names }}" + instances: "{{ node_names }}" cluster: "{{ cluster_id }}" type: "{{ k8s_type }}" g_sub_host_type: "{{ sub_host_type }}" - - set_fact: - a_infra: "{{ infra_names[0] }}" - - add_host: name={{ a_infra }} groups=service_master + - add_host: + name: "{{ master_names.0 }}" + groups: service_master + when: master_names is defined and master_names.0 is defined - include: update.yml - -- name: Deploy OpenShift Services - hosts: service_master - connection: ssh - gather_facts: yes - roles: - - openshift_registry - - openshift_router - -- include: ../../common/openshift-cluster/create_services.yml - vars: - g_svc_master: "{{ service_master }}" +# +#- name: Deploy OpenShift Services +# hosts: service_master +# connection: ssh +# gather_facts: yes +# roles: +# - openshift_registry +# - openshift_router +# +#- include: ../../common/openshift-cluster/create_services.yml +# vars: +# g_svc_master: "{{ service_master }}" - include: list.yml diff --git a/playbooks/gce/openshift-cluster/list.yml b/playbooks/gce/openshift-cluster/list.yml index 5ba0f5a48..53b2b9a5e 100644 --- a/playbooks/gce/openshift-cluster/list.yml +++ b/playbooks/gce/openshift-cluster/list.yml @@ -14,11 +14,11 @@ groups: oo_list_hosts ansible_ssh_user: "{{ deployment_vars[deployment_type].ssh_user | default(ansible_ssh_user, true) }}" ansible_sudo: "{{ deployment_vars[deployment_type].sudo }}" - with_items: groups[scratch_group] | default([]) | difference(['localhost']) | difference(groups.status_terminated) + with_items: groups[scratch_group] | default([], true) | difference(['localhost']) | difference(groups.status_terminated | default([], true)) - name: List instance(s) hosts: oo_list_hosts gather_facts: no tasks: - debug: - msg: "public ip:{{ hostvars[inventory_hostname].gce_public_ip }} private ip:{{ hostvars[inventory_hostname].gce_private_ip }}" + msg: "private ip:{{ hostvars[inventory_hostname].gce_private_ip }}" diff --git a/playbooks/gce/openshift-cluster/tasks/launch_instances.yml b/playbooks/gce/openshift-cluster/tasks/launch_instances.yml index 6307ecc27..c428cb465 100644 --- a/playbooks/gce/openshift-cluster/tasks/launch_instances.yml +++ b/playbooks/gce/openshift-cluster/tasks/launch_instances.yml @@ -10,14 +10,33 @@ service_account_email: "{{ lookup('env', 'gce_service_account_email_address') }}" pem_file: "{{ lookup('env', 'gce_service_account_pem_file_path') }}" project_id: "{{ lookup('env', 'gce_project_id') }}" + zone: "{{ lookup('env', 'zone') }}" + network: "{{ lookup('env', 'network') }}" +# unsupported in 1.9.+ + #service_account_permissions: "datastore,logging-write" tags: - created-by-{{ lookup('env', 'LOGNAME') |default(cluster, true) }} - env-{{ cluster }} - host-type-{{ type }} - - sub-host-type-{{ sub_host_type }} + - sub-host-type-{{ g_sub_host_type }} - env-host-type-{{ cluster }}-openshift-{{ type }} + when: instances |length > 0 register: gce +- set_fact: + node_label: + # There doesn't seem to be a way to get the region directly, so parse it out of the zone. + region: "{{ gce.zone | regex_replace('^(.*)-.*$', '\\\\1') }}" + type: "{{ g_sub_host_type }}" + when: instances |length > 0 and type == "node" + +- set_fact: + node_label: + # There doesn't seem to be a way to get the region directly, so parse it out of the zone. + region: "{{ gce.zone | regex_replace('^(.*)-.*$', '\\\\1') }}" + type: "{{ type }}" + when: instances |length > 0 and type != "node" + - name: Add new instances to groups and set variables needed add_host: hostname: "{{ item.name }}" @@ -27,16 +46,17 @@ groups: "{{ item.tags | oo_prepend_strings_in_list('tag_') | join(',') }}" gce_public_ip: "{{ item.public_ip }}" gce_private_ip: "{{ item.private_ip }}" - with_items: gce.instance_data + openshift_node_labels: "{{ node_label }}" + with_items: gce.instance_data | default([], true) - name: Wait for ssh wait_for: port=22 host={{ item.public_ip }} - with_items: gce.instance_data + with_items: gce.instance_data | default([], true) - name: Wait for user setup command: "ssh -o StrictHostKeyChecking=no -o PasswordAuthentication=no -o ConnectTimeout=10 -o UserKnownHostsFile=/dev/null {{ hostvars[item.name].ansible_ssh_user }}@{{ item.public_ip }} echo {{ hostvars[item.name].ansible_ssh_user }} user is setup" register: result until: result.rc == 0 - retries: 20 - delay: 10 - with_items: gce.instance_data + retries: 30 + delay: 5 + with_items: gce.instance_data | default([], true) diff --git a/playbooks/gce/openshift-cluster/terminate.yml b/playbooks/gce/openshift-cluster/terminate.yml index 098b0df73..e20e0a8bc 100644 --- a/playbooks/gce/openshift-cluster/terminate.yml +++ b/playbooks/gce/openshift-cluster/terminate.yml @@ -1,25 +1,18 @@ --- - name: Terminate instance(s) hosts: localhost + connection: local gather_facts: no vars_files: - vars.yml tasks: - - set_fact: scratch_group=tag_env-host-type-{{ cluster_id }}-openshift-node + - set_fact: scratch_group=tag_env-{{ cluster_id }} - add_host: name: "{{ item }}" - groups: oo_hosts_to_terminate, oo_nodes_to_terminate + groups: oo_hosts_to_terminate ansible_ssh_user: "{{ deployment_vars[deployment_type].ssh_user | default(ansible_ssh_user, true) }}" ansible_sudo: "{{ deployment_vars[deployment_type].sudo }}" - with_items: groups[scratch_group] | default([]) | difference(['localhost']) | difference(groups.status_terminated) - - - set_fact: scratch_group=tag_env-host-type-{{ cluster_id }}-openshift-master - - add_host: - name: "{{ item }}" - groups: oo_hosts_to_terminate, oo_masters_to_terminate - ansible_ssh_user: "{{ deployment_vars[deployment_type].ssh_user | default(ansible_ssh_user, true) }}" - ansible_sudo: "{{ deployment_vars[deployment_type].sudo }}" - with_items: groups[scratch_group] | default([]) | difference(['localhost']) | difference(groups.status_terminated) + with_items: groups[scratch_group] | default([], true) | difference(['localhost']) | difference(groups.status_terminated | default([], true)) - name: Unsubscribe VMs hosts: oo_hosts_to_terminate @@ -32,14 +25,34 @@ lookup('oo_option', 'rhel_skip_subscription') | default(rhsub_skip, True) | default('no', True) | lower in ['no', 'false'] -- include: ../openshift-node/terminate.yml - vars: - gce_service_account_email: "{{ lookup('env', 'gce_service_account_email_address') }}" - gce_pem_file: "{{ lookup('env', 'gce_service_account_pem_file_path') }}" - gce_project_id: "{{ lookup('env', 'gce_project_id') }}" +- name: Terminate instances(s) + hosts: localhost + connection: local + gather_facts: no + vars_files: + - vars.yml + tasks: + + - name: Terminate instances that were previously launched + local_action: + module: gce + state: 'absent' + name: "{{ item }}" + service_account_email: "{{ lookup('env', 'gce_service_account_email_address') }}" + pem_file: "{{ lookup('env', 'gce_service_account_pem_file_path') }}" + project_id: "{{ lookup('env', 'gce_project_id') }}" + zone: "{{ lookup('env', 'zone') }}" + with_items: groups['oo_hosts_to_terminate'] | default([], true) + when: item is defined -- include: ../openshift-master/terminate.yml - vars: - gce_service_account_email: "{{ lookup('env', 'gce_service_account_email_address') }}" - gce_pem_file: "{{ lookup('env', 'gce_service_account_pem_file_path') }}" - gce_project_id: "{{ lookup('env', 'gce_project_id') }}" +#- include: ../openshift-node/terminate.yml +# vars: +# gce_service_account_email: "{{ lookup('env', 'gce_service_account_email_address') }}" +# gce_pem_file: "{{ lookup('env', 'gce_service_account_pem_file_path') }}" +# gce_project_id: "{{ lookup('env', 'gce_project_id') }}" +# +#- include: ../openshift-master/terminate.yml +# vars: +# gce_service_account_email: "{{ lookup('env', 'gce_service_account_email_address') }}" +# gce_pem_file: "{{ lookup('env', 'gce_service_account_pem_file_path') }}" +# gce_project_id: "{{ lookup('env', 'gce_project_id') }}" diff --git a/playbooks/gce/openshift-cluster/vars.yml b/playbooks/gce/openshift-cluster/vars.yml index ae33083b9..6de007807 100644 --- a/playbooks/gce/openshift-cluster/vars.yml +++ b/playbooks/gce/openshift-cluster/vars.yml @@ -1,8 +1,11 @@ --- +do_we_use_openshift_sdn: true +sdn_network_plugin: redhat/openshift-ovs-subnet +# os_sdn_network_plugin_name can be ovssubnet or multitenant, see https://docs.openshift.org/latest/architecture/additional_concepts/sdn.html#ovssubnet-plugin-operation deployment_vars: origin: - image: centos-7 - ssh_user: + image: preinstalled-slave-50g-v5 + ssh_user: root sudo: yes online: image: libra-rhel7 @@ -12,4 +15,3 @@ deployment_vars: image: rhel-7 ssh_user: sudo: yes - diff --git a/playbooks/libvirt/openshift-cluster/tasks/launch_instances.yml b/playbooks/libvirt/openshift-cluster/tasks/launch_instances.yml index 2a0c90b46..4b91c6da8 100644 --- a/playbooks/libvirt/openshift-cluster/tasks/launch_instances.yml +++ b/playbooks/libvirt/openshift-cluster/tasks/launch_instances.yml @@ -64,7 +64,7 @@ register: nb_allocated_ips until: nb_allocated_ips.stdout == '{{ instances | length }}' retries: 60 - delay: 1 + delay: 3 when: instances | length != 0 - name: Collect IP addresses of the VMs diff --git a/playbooks/libvirt/openshift-cluster/templates/user-data b/playbooks/libvirt/openshift-cluster/templates/user-data index 77b788109..eacae7c7e 100644 --- a/playbooks/libvirt/openshift-cluster/templates/user-data +++ b/playbooks/libvirt/openshift-cluster/templates/user-data @@ -19,5 +19,5 @@ system_info: ssh_authorized_keys: - {{ lookup('file', '~/.ssh/id_rsa.pub') }} -bootcmd: +runcmd: - NETWORK_CONFIG=/etc/sysconfig/network-scripts/ifcfg-eth0; if ! grep DHCP_HOSTNAME ${NETWORK_CONFIG}; then echo 'DHCP_HOSTNAME="{{ item[0] }}.example.com"' >> ${NETWORK_CONFIG}; fi; pkill -9 dhclient; service network restart diff --git a/roles/cockpit/defaults/main.yml b/roles/cockpit/defaults/main.yml new file mode 100644 index 000000000..ffd55f1dd --- /dev/null +++ b/roles/cockpit/defaults/main.yml @@ -0,0 +1,5 @@ +--- +os_firewall_use_firewalld: false +os_firewall_allow: +- service: cockpit-ws + port: 9090/tcp diff --git a/roles/cockpit/meta/main.yml b/roles/cockpit/meta/main.yml new file mode 100644 index 000000000..1e3948b19 --- /dev/null +++ b/roles/cockpit/meta/main.yml @@ -0,0 +1,15 @@ +--- +galaxy_info: + author: Scott Dodson + description: Deploy and Enable cockpit-ws plus optional plugins + company: Red Hat, Inc. + license: Apache License, Version 2.0 + min_ansible_version: 1.7 + platforms: + - name: EL + versions: + - 7 + categories: + - cloud +dependencies: + - { role: os_firewall } diff --git a/roles/cockpit/tasks/main.yml b/roles/cockpit/tasks/main.yml new file mode 100644 index 000000000..875cbad21 --- /dev/null +++ b/roles/cockpit/tasks/main.yml @@ -0,0 +1,16 @@ +--- +- name: Install cockpit-ws + yum: + name: "{{ item }}" + state: present + with_items: + - cockpit-ws + - cockpit-shell + - cockpit-bridge + - "{{ cockpit_plugins }}" + +- name: Enable cockpit-ws + service: + name: cockpit.socket + enabled: true + state: started diff --git a/roles/lib_zabbix/library/zbx_item.py b/roles/lib_zabbix/library/zbx_item.py index 2ccc21292..6faa82dfc 100644 --- a/roles/lib_zabbix/library/zbx_item.py +++ b/roles/lib_zabbix/library/zbx_item.py @@ -53,6 +53,8 @@ def get_value_type(value_type): vtype = 0 if 'int' in value_type: vtype = 3 + elif 'log' in value_type: + vtype = 2 elif 'char' in value_type: vtype = 1 elif 'str' in value_type: diff --git a/roles/lib_zabbix/library/zbx_itemprototype.py b/roles/lib_zabbix/library/zbx_itemprototype.py index 4ec1b8e02..e7fd6fa21 100644 --- a/roles/lib_zabbix/library/zbx_itemprototype.py +++ b/roles/lib_zabbix/library/zbx_itemprototype.py @@ -128,12 +128,12 @@ def get_status(status): return _status -def get_app_ids(zapi, application_names): +def get_app_ids(zapi, application_names, templateid): ''' get application ids from names ''' app_ids = [] for app_name in application_names: - content = zapi.get_content('application', 'get', {'search': {'name': app_name}}) + content = zapi.get_content('application', 'get', {'filter': {'name': app_name}, 'templateids': templateid}) if content.has_key('result'): app_ids.append(content['result'][0]['applicationid']) return app_ids @@ -212,7 +212,7 @@ def main(): 'ruleid': get_rule_id(zapi, module.params['discoveryrule_key'], template['templateid']), 'type': get_type(module.params['ztype']), 'value_type': get_value_type(module.params['value_type']), - 'applications': get_app_ids(zapi, module.params['applications']), + 'applications': get_app_ids(zapi, module.params['applications'], template['templateid']), 'description': module.params['description'], } diff --git a/roles/lib_zabbix/library/zbx_trigger.py b/roles/lib_zabbix/library/zbx_trigger.py index 21d0fcfd2..ab7731faa 100644 --- a/roles/lib_zabbix/library/zbx_trigger.py +++ b/roles/lib_zabbix/library/zbx_trigger.py @@ -86,6 +86,24 @@ def get_trigger_status(inc_status): return r_status +def get_template_id(zapi, template_name): + ''' + get related templates + ''' + template_ids = [] + app_ids = {} + # Fetch templates by name + content = zapi.get_content('template', + 'get', + {'search': {'host': template_name}, + 'selectApplications': ['applicationid', 'name']}) + if content.has_key('result'): + template_ids.append(content['result'][0]['templateid']) + for app in content['result'][0]['applications']: + app_ids[app['name']] = app['applicationid'] + + return template_ids, app_ids + def main(): ''' Create a trigger in zabbix @@ -117,6 +135,7 @@ def main(): url=dict(default=None, type='str'), status=dict(default=None, type='str'), state=dict(default='present', type='str'), + template_name=dict(default=None, type='str'), ), #supports_check_mode=True ) @@ -132,11 +151,16 @@ def main(): state = module.params['state'] tname = module.params['name'] + templateid = None + if module.params['template_name']: + templateid, _ = get_template_id(zapi, module.params['template_name']) + content = zapi.get_content(zbx_class_name, 'get', {'filter': {'description': tname}, 'expandExpression': True, 'selectDependencies': 'triggerid', + 'templateids': templateid, }) # Get diff --git a/roles/lib_zabbix/library/zbx_user_media.py b/roles/lib_zabbix/library/zbx_user_media.py index 9ed838f81..8895c78c3 100644 --- a/roles/lib_zabbix/library/zbx_user_media.py +++ b/roles/lib_zabbix/library/zbx_user_media.py @@ -260,6 +260,9 @@ def main(): for user in params['users']: diff['users']['userid'] = user['userid'] + # Medias have no real unique key so therefore we need to make it like the incoming user's request + diff['medias'] = medias + # We have differences and need to update content = zapi.get_content(zbx_class_name, 'updatemedia', diff) diff --git a/roles/openshift_facts/library/openshift_facts.py b/roles/openshift_facts/library/openshift_facts.py index 987f7f7da..69bb49c9b 100755 --- a/roles/openshift_facts/library/openshift_facts.py +++ b/roles/openshift_facts/library/openshift_facts.py @@ -480,11 +480,11 @@ def set_deployment_facts_if_unset(facts): if role in facts: deployment_type = facts['common']['deployment_type'] if 'registry_url' not in facts[role]: - registry_url = 'aos3/aos-${component}:${version}' - if deployment_type in ['enterprise', 'online']: + registry_url = 'openshift/origin-${component}:${version}' + if deployment_type in ['enterprise', 'online', 'openshift-enterprise']: registry_url = 'openshift3/ose-${component}:${version}' - elif deployment_type == 'origin': - registry_url = 'openshift/origin-${component}:${version}' + elif deployment_type == 'atomic-enterprise': + registry_url = 'aep3/aep-${component}:${version}' facts[role]['registry_url'] = registry_url return facts diff --git a/roles/openshift_facts/tasks/main.yml b/roles/openshift_facts/tasks/main.yml index fd3d20800..6301d4fc0 100644 --- a/roles/openshift_facts/tasks/main.yml +++ b/roles/openshift_facts/tasks/main.yml @@ -1,5 +1,5 @@ --- -- name: Verify Ansible version is greater than 1.8.0 and not 1.9.0 +- name: Verify Ansible version is greater than 1.8.0 and not 1.9.0 and not 1.9.0.1 assert: that: - ansible_version | version_compare('1.8.0', 'ge') diff --git a/roles/openshift_manage_node/tasks/main.yml b/roles/openshift_manage_node/tasks/main.yml index 7c4f45ce6..637e494ea 100644 --- a/roles/openshift_manage_node/tasks/main.yml +++ b/roles/openshift_manage_node/tasks/main.yml @@ -1,21 +1,21 @@ - name: Wait for Node Registration command: > - {{ openshift.common.client_binary }} get node {{ item }} + {{ openshift.common.client_binary }} get node {{ item | lower }} register: omd_get_node until: omd_get_node.rc == 0 - retries: 10 + retries: 20 delay: 5 with_items: openshift_nodes - name: Set node schedulability command: > - {{ openshift.common.admin_binary }} manage-node {{ item.openshift.common.hostname }} --schedulable={{ 'true' if item.openshift.node.schedulable | bool else 'false' }} + {{ openshift.common.admin_binary }} manage-node {{ item.openshift.common.hostname | lower }} --schedulable={{ 'true' if item.openshift.node.schedulable | bool else 'false' }} with_items: - "{{ openshift_node_vars }}" - name: Label nodes command: > - {{ openshift.common.client_binary }} label --overwrite node {{ item.openshift.common.hostname }} {{ item.openshift.node.labels | oo_combine_dict }} + {{ openshift.common.client_binary }} label --overwrite node {{ item.openshift.common.hostname | lower }} {{ item.openshift.node.labels | oo_combine_dict }} with_items: - "{{ openshift_node_vars }}" when: "'labels' in item.openshift.node and item.openshift.node.labels != {}" diff --git a/roles/openshift_master/tasks/main.yml b/roles/openshift_master/tasks/main.yml index fa12005ab..73c04cb08 100644 --- a/roles/openshift_master/tasks/main.yml +++ b/roles/openshift_master/tasks/main.yml @@ -169,13 +169,17 @@ shell: echo {{ openshift_master_cluster_password | quote }} | passwd --stdin hacluster when: install_result | changed +- name: Lookup default group for ansible_ssh_user + command: "/usr/bin/id -g {{ ansible_ssh_user }}" + register: _ansible_ssh_user_gid + - name: Create the client config dir(s) file: path: "~{{ item }}/.kube" state: directory mode: 0700 owner: "{{ item }}" - group: "{{ item }}" + group: "{{ 'root' if item == 'root' else _ansible_ssh_user_gid.stdout }}" with_items: - root - "{{ ansible_ssh_user }}" @@ -196,7 +200,7 @@ state: file mode: 0700 owner: "{{ item }}" - group: "{{ item }}" + group: "{{ 'root' if item == 'root' else _ansible_ssh_user_gid.stdout }}" with_items: - root - "{{ ansible_ssh_user }}" diff --git a/roles/openshift_master/templates/master.yaml.v1.j2 b/roles/openshift_master/templates/master.yaml.v1.j2 index 500690523..6e45eaad7 100644 --- a/roles/openshift_master/templates/master.yaml.v1.j2 +++ b/roles/openshift_master/templates/master.yaml.v1.j2 @@ -87,7 +87,9 @@ masterPublicURL: {{ openshift.master.public_api_url }} networkConfig: clusterNetworkCIDR: {{ openshift.master.sdn_cluster_network_cidr }} hostSubnetLength: {{ openshift.master.sdn_host_subnet_length }} +{% if openshift.common.use_openshift_sdn %} networkPluginName: {{ openshift.common.sdn_network_plugin_name }} +{% endif %} # serviceNetworkCIDR must match kubernetesMasterConfig.servicesSubnet serviceNetworkCIDR: {{ openshift.master.portal_net }} {% include 'v1_partials/oauthConfig.j2' %} diff --git a/roles/openshift_master/templates/v1_partials/oauthConfig.j2 b/roles/openshift_master/templates/v1_partials/oauthConfig.j2 index 72889bc29..8a4f5a746 100644 --- a/roles/openshift_master/templates/v1_partials/oauthConfig.j2 +++ b/roles/openshift_master/templates/v1_partials/oauthConfig.j2 @@ -80,6 +80,7 @@ oauthConfig: provider: {{ identity_provider_config(identity_provider) }} {%- endfor %} + masterCA: ca.crt masterPublicURL: {{ openshift.master.public_api_url }} masterURL: {{ openshift.master.api_url }} sessionConfig: diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index e8cc499c0..d45dd8073 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -22,7 +22,7 @@ deployment_type: "{{ openshift_deployment_type }}" - role: node local_facts: - labels: "{{ openshift_node_labels | default(none) }}" + labels: "{{ lookup('oo_option', 'openshift_node_labels') | default( openshift_node_labels | default(none), true) }}" annotations: "{{ openshift_node_annotations | default(none) }}" registry_url: "{{ oreg_url | default(none) }}" debug_level: "{{ openshift_node_debug_level | default(openshift.common.debug_level) }}" diff --git a/roles/openshift_node/templates/node.yaml.v1.j2 b/roles/openshift_node/templates/node.yaml.v1.j2 index 07d80f99b..4931d127e 100644 --- a/roles/openshift_node/templates/node.yaml.v1.j2 +++ b/roles/openshift_node/templates/node.yaml.v1.j2 @@ -12,13 +12,17 @@ kind: NodeConfig kubeletArguments: {{ openshift.node.kubelet_args | to_json }} {% endif %} masterKubeConfig: system:node:{{ openshift.common.hostname }}.kubeconfig +{% if openshift.common.use_openshift_sdn %} networkPluginName: {{ openshift.common.sdn_network_plugin_name }} +{% endif %} # networkConfig struct introduced in origin 1.0.6 and OSE 3.0.2 which # deprecates networkPluginName above. The two should match. networkConfig: mtu: {{ openshift.node.sdn_mtu }} +{% if openshift.common.use_openshift_sdn %} networkPluginName: {{ openshift.common.sdn_network_plugin_name }} -nodeName: {{ openshift.common.hostname }} +{% endif %} +nodeName: {{ openshift.common.hostname | lower }} podManifestConfig: servingInfo: bindAddress: 0.0.0.0:10250 diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml index 28e900255..a503b24d7 100644 --- a/roles/os_zabbix/tasks/main.yml +++ b/roles/os_zabbix/tasks/main.yml @@ -13,6 +13,8 @@ - include_vars: template_openshift_master.yml - include_vars: template_openshift_node.yml - include_vars: template_ops_tools.yml +- include_vars: template_app_zabbix_server.yml +- include_vars: template_app_zabbix_agent.yml - name: Include Template Heartbeat include: ../../lib_zabbix/tasks/create_template.yml @@ -61,3 +63,19 @@ server: "{{ ozb_server }}" user: "{{ ozb_user }}" password: "{{ ozb_password }}" + +- name: Include Template App Zabbix Server + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_app_zabbix_server }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template App Zabbix Agent + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_app_zabbix_agent }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml new file mode 100644 index 000000000..06c4eda8b --- /dev/null +++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml @@ -0,0 +1,23 @@ +--- +g_template_app_zabbix_agent: + name: Template App Zabbix Agent + zitems: + - key: agent.hostname + applications: + - Zabbix agent + value_type: character + zabbix_type: '0' + + - key: agent.ping + applications: + - Zabbix agent + description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check. + value_type: int + zabbix_type: '0' + + ztriggers: + - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes' + description: Zabbix agent is unreachable for 15 minutes. + expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1' + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml new file mode 100644 index 000000000..dace2aa29 --- /dev/null +++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml @@ -0,0 +1,408 @@ +--- +g_template_app_zabbix_server: + name: Template App Zabbix Server + zitems: + - key: housekeeper_creates + applications: + - Zabbix server + description: A simple count of the number of partition creates output by the housekeeper script. + units: '' + value_type: int + zabbix_type: '2' + + - key: housekeeper_drops + applications: + - Zabbix server + description: A simple count of the number of partition drops output by the housekeeper script. + units: '' + value_type: int + zabbix_type: '2' + + - key: housekeeper_errors + applications: + - Zabbix server + description: A simple count of the number of errors output by the housekeeper script. + units: '' + value_type: int + zabbix_type: '2' + + - key: housekeeper_total + applications: + - Zabbix server + description: A simple count of the total number of lines output by the housekeeper + script. + units: '' + value_type: int + zabbix_type: '2' + + - key: zabbix[process,alerter,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,configuration syncer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,db watchdog,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,discoverer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,escalator,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,history syncer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,housekeeper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,http poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,icmp pinger,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,ipmi poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,java poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,node watcher,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,proxy poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,self-monitoring,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,snmp trapper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,timer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,trapper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,unreachable poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[queue,10m] + applications: + - Zabbix server + description: '' + units: '' + value_type: int + zabbix_type: '5' + + - key: zabbix[queue] + applications: + - Zabbix server + description: '' + units: '' + value_type: int + zabbix_type: '5' + + - key: zabbix[rcache,buffer,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,history,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,text,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,trend,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,values] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + ztriggers: + - description: "There has been unexpected output while running the housekeeping script\ + \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\ + \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\ + \ for more details." + expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}' + name: Unexpected output in Zabbix DB Housekeeping + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc + + - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details. + expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0' + name: Errors during Zabbix DB Housekeeping + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75' + name: Zabbix alerter processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75' + name: Zabbix configuration syncer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75' + name: Zabbix db watchdog processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75' + name: Zabbix discoverer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75' + name: Zabbix escalator processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75' + name: Zabbix history syncer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75' + name: Zabbix housekeeper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75' + name: Zabbix http poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75' + name: Zabbix icmp pinger processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75' + name: Zabbix ipmi poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75' + name: Zabbix java poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75' + name: Zabbix node watcher processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75' + name: Zabbix poller processes more than 75% busy + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75' + name: Zabbix proxy poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75' + name: Zabbix self-monitoring processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75' + name: Zabbix snmp trapper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: Timer processes usually are busy because they have to process time + based trigger functions + expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75' + name: Zabbix timer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75' + name: Zabbix trapper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75' + name: Zabbix unreachable poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: "This alert generally indicates a performance problem or a problem\ + \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\ + \ is Administration > Queue. Be sure to check the general view and the per-proxy\ + \ view." + expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000' + name: More than 1000 items having missing data for more than 10 minutes + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc + + - description: Consider increasing CacheSize in the zabbix_server.conf configuration + file + expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5' + name: Less than 5% free in the configuration cache + priority: info + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25' + name: Less than 25% free in the history cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25' + name: Less than 25% free in the text history cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25' + name: Less than 25% free in the trends cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index c71e07910..1de4fefbb 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -13,6 +13,24 @@ g_template_openshift_master: applications: - Openshift Master + - key: openshift.master.user.count + description: Shows number of users in a cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pod.running.count + description: Shows number of pods running + type: int + applications: + - Openshift Master + + - key: openshift.project.counter + description: Shows number of projects on a cluster + type: int + applications: + - Openshift Master + ztriggers: - name: 'Application creation has failed on {HOST.NAME}' expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' @@ -28,3 +46,13 @@ g_template_openshift_master: expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' priority: high + + - name: 'Number of users for Openshift Master on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.user.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info + + - name: 'There are no projects running on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.project.counter.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml index 36f9cc4a3..ce28b1048 100644 --- a/roles/os_zabbix/vars/template_openshift_node.yml +++ b/roles/os_zabbix/vars/template_openshift_node.yml @@ -8,13 +8,37 @@ g_template_openshift_node: applications: - Openshift Node + - key: openshift.node.ovs.pids.count + description: Shows number of ovs process ids running + type: int + applications: + - Openshift Node + + - key: openshift.node.ovs.ports.count + description: Shows number of OVS ports defined + type: int + applications: + - Openshift Node + ztriggers: - name: 'Openshift Node process not running on {HOST.NAME}' expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' priority: high - name: 'Too many Openshift Node processes running on {HOST.NAME}' expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + - name: 'OVS may not be running on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' priority: high + + - name: 'Number of OVS ports is 0 on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml index 70c3809bd..69432273f 100644 --- a/roles/os_zabbix/vars/template_os_linux.yml +++ b/roles/os_zabbix/vars/template_os_linux.yml @@ -188,18 +188,6 @@ g_template_os_linux: multiplier: 1024 units: B - # Disk items - - key: filesys.full.xvda2 - applications: - - Disk - value_type: float - - - key: filesys.full.xvda3 - applications: - - Disk - value_type: float - - zdiscoveryrules: - name: disc.filesys key: disc.filesys @@ -215,38 +203,36 @@ g_template_os_linux: applications: - Disk - ztriggerprototypes: - - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: warn - - - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>95' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: high + - discoveryrule_key: disc.filesys + name: "Percentage of used inodes on {#OSO_FILESYS}" + key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]" + value_type: float + description: "PCP derived value of percentage of used inodes on a filesystem." + applications: + - Disk - ztriggers: - - name: 'Filesystem: / has less than 10% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda2.last()}>90' + ztriggerprototypes: + - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn - - name: 'Filesystem: / has less than 5% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda2.last()}>95' + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high - - name: 'Filesystem: /var has less than 10% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda3.last()}>90' + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn - - name: 'Filesystem: /var has less than 5% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda3.last()}>95' + - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high + ztriggers: - name: 'Too many TOTAL processes on {HOST.NAME}' expression: '{Template OS Linux:proc.nprocs.last()}>5000' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc' |