summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r--roles/os_zabbix/tasks/main.yml112
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_agent.yml23
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_server.yml412
-rw-r--r--roles/os_zabbix/vars/template_aws.yml25
-rw-r--r--roles/os_zabbix/vars/template_config_loop.yml14
-rw-r--r--roles/os_zabbix/vars/template_docker.yml27
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml374
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml54
-rw-r--r--roles/os_zabbix/vars/template_ops_tools.yml54
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml134
-rw-r--r--roles/os_zabbix/vars/template_performance_copilot.yml14
-rw-r--r--roles/os_zabbix/vars/template_zagg_server.yml46
12 files changed, 1258 insertions, 31 deletions
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index 8347e9a61..1c8d88854 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -8,10 +8,42 @@
register: templates
- include_vars: template_heartbeat.yml
+ tags:
+ - heartbeat
- include_vars: template_os_linux.yml
+ tags:
+ - linux
- include_vars: template_docker.yml
+ tags:
+ - docker
- include_vars: template_openshift_master.yml
+ tags:
+ - openshift_master
- include_vars: template_openshift_node.yml
+ tags:
+ - openshift_node
+- include_vars: template_ops_tools.yml
+ tags:
+ - ops_tools
+- include_vars: template_app_zabbix_server.yml
+ tags:
+ - zabbix_server
+- include_vars: template_app_zabbix_agent.yml
+ tags:
+ - zabbix_agent
+- include_vars: template_performance_copilot.yml
+ tags:
+ - pcp
+- include_vars: template_aws.yml
+ tags:
+ - aws
+- include_vars: template_zagg_server.yml
+ tags:
+ - zagg_server
+
+- include_vars: template_config_loop.yml
+ tags:
+ - config_loop
- name: Include Template Heartbeat
include: ../../lib_zabbix/tasks/create_template.yml
@@ -20,6 +52,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - heartbeat
- name: Include Template os_linux
include: ../../lib_zabbix/tasks/create_template.yml
@@ -28,6 +62,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - linux
- name: Include Template docker
include: ../../lib_zabbix/tasks/create_template.yml
@@ -36,6 +72,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - docker
- name: Include Template Openshift Master
include: ../../lib_zabbix/tasks/create_template.yml
@@ -44,6 +82,8 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - openshift_master
- name: Include Template Openshift Node
include: ../../lib_zabbix/tasks/create_template.yml
@@ -52,3 +92,75 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+ tags:
+ - openshift_node
+
+- name: Include Template Ops Tools
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_ops_tools }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - ops_tools
+
+- name: Include Template App Zabbix Server
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_app_zabbix_server }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - zabbix_server
+
+- name: Include Template App Zabbix Agent
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_app_zabbix_agent }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - zabbix_agent
+
+- name: Include Template Performance Copilot
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_performance_copilot }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - pcp
+
+- name: Include Template AWS
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_aws }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - aws
+
+- name: Include Template Zagg Server
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_zagg_server }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - zagg_server
+
+- name: Include Template Config Loop
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_config_loop }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - config_loop
diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
new file mode 100644
index 000000000..d636d4822
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
@@ -0,0 +1,23 @@
+---
+g_template_app_zabbix_agent:
+ name: Template App Zabbix Agent
+ zitems:
+ - key: agent.hostname
+ applications:
+ - Zabbix agent
+ value_type: character
+ zabbix_type: agent
+
+ - key: agent.ping
+ applications:
+ - Zabbix agent
+ description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check.
+ value_type: int
+ zabbix_type: agent
+
+ ztriggers:
+ - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes'
+ description: Zabbix agent is unreachable for 15 minutes.
+ expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1'
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc
diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml
new file mode 100644
index 000000000..43517113b
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml
@@ -0,0 +1,412 @@
+---
+g_template_app_zabbix_server:
+ name: Template App Zabbix Server
+ zitems:
+ - key: housekeeper_creates
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition creates output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_drops
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition drops output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_errors
+ applications:
+ - Zabbix server
+ description: A simple count of the number of errors output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_total
+ applications:
+ - Zabbix server
+ description: A simple count of the total number of lines output by the housekeeper
+ script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: zabbix[process,alerter,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,configuration syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,db watchdog,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,discoverer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,escalator,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,history syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,housekeeper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,http poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,icmp pinger,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,ipmi poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,java poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,node watcher,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,proxy poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,self-monitoring,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,snmp trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,timer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,unreachable poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[queue,10m]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: internal
+ interval: 600
+
+ - key: zabbix[queue]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: internal
+ interval: 600
+
+ - key: zabbix[rcache,buffer,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,history,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,text,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,trend,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,values]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+ delta: 1 # speed per second
+
+ ztriggers:
+ - description: "There has been unexpected output while running the housekeeping script\
+ \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\
+ \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\
+ \ for more details."
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}'
+ name: Unexpected output in Zabbix DB Housekeeping
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc
+
+ - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details.
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0'
+ name: Errors during Zabbix DB Housekeeping
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75'
+ name: Zabbix alerter processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75'
+ name: Zabbix configuration syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75'
+ name: Zabbix db watchdog processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75'
+ name: Zabbix discoverer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75'
+ name: Zabbix escalator processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75'
+ name: Zabbix history syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75'
+ name: Zabbix housekeeper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75'
+ name: Zabbix http poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75'
+ name: Zabbix icmp pinger processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75'
+ name: Zabbix ipmi poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75'
+ name: Zabbix java poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75'
+ name: Zabbix node watcher processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75'
+ name: Zabbix poller processes more than 75% busy
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75'
+ name: Zabbix proxy poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75'
+ name: Zabbix self-monitoring processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75'
+ name: Zabbix snmp trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: Timer processes usually are busy because they have to process time
+ based trigger functions
+ expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75'
+ name: Zabbix timer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75'
+ name: Zabbix trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75'
+ name: Zabbix unreachable poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: "This alert generally indicates a performance problem or a problem\
+ \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\
+ \ is Administration > Queue. Be sure to check the general view and the per-proxy\
+ \ view."
+ expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000'
+ name: More than 1000 items having missing data for more than 10 minutes
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc
+
+ - description: Consider increasing CacheSize in the zabbix_server.conf configuration
+ file
+ expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5'
+ name: Less than 5% free in the configuration cache
+ priority: info
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25'
+ name: Less than 25% free in the history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25'
+ name: Less than 25% free in the text history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25'
+ name: Less than 25% free in the trends cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
diff --git a/roles/os_zabbix/vars/template_aws.yml b/roles/os_zabbix/vars/template_aws.yml
new file mode 100644
index 000000000..57832a3fe
--- /dev/null
+++ b/roles/os_zabbix/vars/template_aws.yml
@@ -0,0 +1,25 @@
+---
+g_template_aws:
+ name: Template AWS
+ zdiscoveryrules:
+ - name: disc.aws
+ key: disc.aws
+ lifetime: 14
+ description: "Dynamically register AWS bucket info"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.aws
+ name: "S3 bucket size (GB) [{#S3_BUCKET}]"
+ key: "disc.aws.size[{#S3_BUCKET}]"
+ value_type: int
+ description: "Size of S3 bucket"
+ applications:
+ - AWS
+
+ - discoveryrule_key: disc.aws
+ name: "S3 bucket object count [{#S3_BUCKET}]"
+ key: "disc.aws.objects[{#S3_BUCKET}]"
+ value_type: int
+ description: "Objects in S3 bucket"
+ applications:
+ - AWS
diff --git a/roles/os_zabbix/vars/template_config_loop.yml b/roles/os_zabbix/vars/template_config_loop.yml
new file mode 100644
index 000000000..823da1868
--- /dev/null
+++ b/roles/os_zabbix/vars/template_config_loop.yml
@@ -0,0 +1,14 @@
+---
+g_template_config_loop:
+ name: Template Config Loop
+ zitems:
+ - key: config_loop.run.exit_code
+ applications:
+ - Config Loop
+ value_type: int
+
+ ztriggers:
+ - name: 'config_loop.run.exit_code not zero on {HOST.NAME}'
+ expression: '{Template Config Loop:config_loop.run.exit_code.min(#2)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_config_loop.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
index 395e054de..dd13e76f7 100644
--- a/roles/os_zabbix/vars/template_docker.yml
+++ b/roles/os_zabbix/vars/template_docker.yml
@@ -7,6 +7,21 @@ g_template_docker:
- Docker Daemon
value_type: int
+ - key: docker.info_elapsed_ms
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.container.dns.resolution
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.container.existing.dns.resolution.failed
+ applications:
+ - Docker Daemon
+ value_type: int
+
- key: docker.storage.is_loopback
applications:
- Docker Storage
@@ -57,6 +72,18 @@ g_template_docker:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
priority: high
+ # Re-enable for OpenShift 3.1.1 (https://bugzilla.redhat.com/show_bug.cgi?id=1292971#c6)
+ - name: 'docker.container.dns.resolution failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.container.dns.resolution.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc'
+ priority: average
+ status: disabled
+
+ - name: 'docker.container.existing.dns.resolution.failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.container.existing.dns.resolution.failed.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc'
+ priority: average
+
- name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index c71e07910..e36f23a2b 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -2,10 +2,10 @@
g_template_openshift_master:
name: Template Openshift Master
zitems:
- - name: create_app
+ - name: openshift.master.app.create
applications:
- Openshift Master
- key: create_app
+ key: openshift.master.app.create
- key: openshift.master.process.count
description: Shows number of master processes running
@@ -13,12 +13,255 @@ g_template_openshift_master:
applications:
- Openshift Master
- ztriggers:
- - name: 'Application creation has failed on {HOST.NAME}'
- expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- priority: avg
+ - key: openshift.master.api.ping
+ description: "Verify that the Openshift API is up (uses the cluster API URL)"
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.local.api.ping
+ description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)"
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.api.healthz
+ description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz"
+ type: int
+ data_type: bool
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.local.api.healthz
+ description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz"
+ type: int
+ data_type: bool
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.user.count
+ description: Shows number of users in a cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pod.running.count
+ description: Shows number of pods running
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pod.user.running.count
+ description: Shows number of user pods running (non infrastructure pods)
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pod.total.count
+ description: Shows total number of pods (running and non running)
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.node.count
+ description: Shows the total number of nodes found in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.project.count
+ description: Shows number of projects on a cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.total.count
+ description: Total number of Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.available.count
+ description: Total number of Available Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.released.count
+ description: Total number of Released Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.bound.count
+ description: Total number of Bound Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pv.failed.count
+ description: Total number of Failed Persistent Volumes in the Openshift Cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.skydns.port.open
+ description: State of the SkyDNS port open and listening
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.skydns.query
+ description: SkyDNS can be queried or not
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.etcd.create.success
+ description: Show number of successful create actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.create.fail
+ description: Show number of failed create actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.delete.success
+ description: Show number of successful delete actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.delete.fail
+ description: Show number of failed delete actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.get.success
+ description: Show number of successful get actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.get.fail
+ description: Show number of failed get actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.set.success
+ description: Show number of successful set actions
+ type: int
+ applications:
+ - Openshift Etcd
+ - key: openshift.master.etcd.set.fail
+ description: Show number of failed set actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.update.success
+ description: Show number of successful update actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.update.fail
+ description: Show number of failed update actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.watchers
+ description: Show number of etcd watchers
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.ping
+ description: etcd ping
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.metric.ping
+ description: "This check verifies that the https://master/metrics check is alive and communicating properly."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.nodesnotready.count
+ description: "This check shows how many nodes in a cluster are in NotReady state."
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.nodesnotschedulable.count
+ description: "This check shows how many nodes in a cluster are not schedulable."
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
+ description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed."
+ type: int
+ applications:
+ - Openshift Master Metrics
+
+ ztriggers:
- name: 'Openshift Master process not running on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
@@ -28,3 +271,120 @@ g_template_openshift_master:
expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: high
+
+ - name: 'Etcd ping failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: high
+
+ - name: 'Number of users for Openshift Master on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
+
+ - name: 'There are no projects running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.project.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
+
+ # Put triggers that depend on other triggers here (deps must be created first)
+ - name: 'Application creation has failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: avg
+
+ - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ description: The application create loop has failed 4 or more times in the last hour
+ priority: avg
+
+ - name: 'Openshift Master API health check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Openshift Master Local API health check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: avg
+
+ - name: 'SkyDNS port not listening on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ - name: 'SkyDNS query failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master API health check is failing on {HOST.NAME}'
+ priority: high
+
+ - name: 'Hosts not ready according to {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ zgraphs:
+ - name: Openshift Master API Server Latency Pods LIST Quantiles
+ width: 900
+ height: 200
+ graph_items:
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5
+ color: red
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9
+ color: blue
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99
+ color: orange
+
+ - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles
+ width: 900
+ height: 200
+ graph_items:
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
+ color: red
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
+ color: blue
+ - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
+ color: orange
+
+ - name: Openshift Master Scheduler End to End Latency Quantiles
+ width: 900
+ height: 200
+ graph_items:
+ - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
+ color: red
+ - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
+ color: blue
+ - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
+ color: orange
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
index 36f9cc4a3..66bd3a147 100644
--- a/roles/os_zabbix/vars/template_openshift_node.yml
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -8,13 +8,63 @@ g_template_openshift_node:
applications:
- Openshift Node
+ - key: openshift.node.ovs.pids.count
+ description: Shows number of ovs process ids running
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.ports.count
+ description: Shows number of OVS ports defined
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.stray.rules
+ description: Number of OVS stray rules found/removed
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.registry-pods.healthy_pct
+ description: Shows the percentage of healthy registries in the cluster
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.registry.service.ping
+ description: Ping docker-registry service from node
+ type: int
+ applications:
+ - Openshift Node
+
ztriggers:
+ - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100 and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
+ priority: avg
+
+ - name: 'Docker Registry service is unhealthy according to {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1 and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
+ priority: avg
+
- name: 'Openshift Node process not running on {HOST.NAME}'
expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high
- name: 'Too many Openshift Node processes running on {HOST.NAME}'
expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: '[Heal] OVS may not be running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Number of OVS ports is 0 on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high
diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml
new file mode 100644
index 000000000..a0a5a4d03
--- /dev/null
+++ b/roles/os_zabbix/vars/template_ops_tools.yml
@@ -0,0 +1,54 @@
+---
+g_template_ops_tools:
+ name: Template Operations Tools
+ zdiscoveryrules:
+ - name: disc.ops.runner
+ key: disc.ops.runner
+ lifetime: 1
+ description: "Dynamically register operations runner items"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.ops.runner
+ name: "Exit code of ops-runner[{#OSO_COMMAND}]"
+ key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]"
+ value_type: int
+ description: "The exit code of the command run from ops-runner"
+ applications:
+ - Ops Runner
+
+ ztriggerprototypes:
+ - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}'
+ expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc'
+ priority: average
+
+ zactions:
+ - name: 'Remote command for [Heal] triggers'
+ status: enabled
+ escalation_time: 60
+ conditions_filter:
+ calculation_type: "and/or"
+ conditions:
+ - conditiontype: maintenance status
+ operator: not in
+ - conditiontype: trigger name
+ operator: like
+ value: "[Heal]"
+ - conditiontype: trigger value
+ operator: "="
+ value: PROBLEM
+ operations:
+ - esc_step_from: 1
+ esc_step_to: 1
+ esc_period: 0
+ operationtype: remote command
+ opcommand:
+ command: 'ssh -i /etc/openshift_tools/scriptrunner_id_rsa {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }} remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\" --trigger-val \"{TRIGGER.VALUE}\"'
+ execute_on: "zabbix server"
+ type: 'custom script'
+ target_hosts:
+ - target_type: 'zabbix server'
+ opconditions:
+ - conditiontype: 'event acknowledged'
+ operator: '='
+ value: 'not acknowledged'
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 3173c79b2..c6e557f12 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -10,17 +10,20 @@ g_template_os_linux:
- key: kernel.all.cpu.wait.total
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.cpu.irq.hard
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.cpu.idle
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.uname.distro
applications:
@@ -35,7 +38,8 @@ g_template_os_linux:
- key: kernel.all.cpu.irq.soft
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.15_minute
applications:
@@ -45,7 +49,8 @@ g_template_os_linux:
- key: kernel.all.cpu.sys
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.5_minute
applications:
@@ -55,7 +60,8 @@ g_template_os_linux:
- key: kernel.all.cpu.nice
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.1_minute
applications:
@@ -75,7 +81,8 @@ g_template_os_linux:
- key: kernel.all.cpu.user
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.uname.machine
applications:
@@ -90,7 +97,8 @@ g_template_os_linux:
- key: kernel.all.cpu.steal
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.pswitch
applications:
@@ -180,38 +188,105 @@ g_template_os_linux:
multiplier: 1024
units: B
- # Disk items
- - key: filesys.full.xvda2
+ zdiscoveryrules:
+ - name: disc.filesys
+ key: disc.filesys
+ lifetime: 1
+ description: "Dynamically register the filesystems"
+
+ - name: disc.disk
+ key: disc.disk
+ lifetime: 1
+ description: "Dynamically register disks on a node"
+
+ - name: disc.network
+ key: disc.network
+ lifetime: 1
+ description: "Dynamically register network interfaces on a node"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.filesys
+ name: "disc.filesys.full.{#OSO_FILESYS}"
+ key: "disc.filesys.full[{#OSO_FILESYS}]"
+ value_type: float
+ description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
applications:
- Disk
+
+ - discoveryrule_key: disc.filesys
+ name: "Percentage of used inodes on {#OSO_FILESYS}"
+ key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
value_type: float
+ description: "PCP derived value of percentage of used inodes on a filesystem."
+ applications:
+ - Disk
- - key: filesys.full.xvda3
+ - discoveryrule_key: disc.disk
+ name: "TPS (IOPS) for disk {#OSO_DISK}"
+ key: "disc.disk.tps[{#OSO_DISK}]"
+ value_type: int
+ description: "PCP disk.dev.totals metric measured over a period of time. This shows how many disk transactions per second the disk is using"
applications:
- Disk
+
+ - discoveryrule_key: disc.disk
+ name: "Percent Utilized for disk {#OSO_DISK}"
+ key: "disc.disk.putil[{#OSO_DISK}]"
value_type: float
+ description: "PCP disk.dev.avactive metric measured over a period of time. This is the '%util' in the iostat command"
+ applications:
+ - Disk
- ztriggers:
- - name: 'Filesystem: / has less than 10% free on {HOST.NAME}'
- expression: '{Template OS Linux:filesys.full.xvda2.last()}>90'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
- priority: warn
+ - discoveryrule_key: disc.network
+ name: "Bytes per second IN on network interface {#OSO_NET_INTERFACE}"
+ key: "disc.network.in.bytes[{#OSO_NET_INTERFACE}]"
+ value_type: int
+ units: B
+ delta: 1
+ description: "PCP network.interface.in.bytes metric. This is setup as a delta in Zabbix to measure the speed per second"
+ applications:
+ - Network
- - name: 'Filesystem: / has less than 5% free on {HOST.NAME}'
- expression: '{Template OS Linux:filesys.full.xvda2.last()}>95'
+ - discoveryrule_key: disc.network
+ name: "Bytes per second OUT on network interface {#OSO_NET_INTERFACE}"
+ key: "disc.network.out.bytes[{#OSO_NET_INTERFACE}]"
+ value_type: int
+ units: B
+ delta: 1
+ description: "PCP network.interface.out.bytes metric. This is setup as a delta in Zabbix to measure the speed per second"
+ applications:
+ - Network
+
+ ztriggerprototypes:
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: high
- - name: 'Filesystem: /var has less than 10% free on {HOST.NAME}'
- expression: '{Template OS Linux:filesys.full.xvda3.last()}>90'
+ # This has a dependency on the previous trigger
+ # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: warn
+ dependencies:
+ - 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
- - name: 'Filesystem: /var has less than 5% free on {HOST.NAME}'
- expression: '{Template OS Linux:filesys.full.xvda3.last()}>95'
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: high
+ # This has a dependency on the previous trigger
+ # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+ dependencies:
+ - 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
+
+ ztriggers:
- name: 'Too many TOTAL processes on {HOST.NAME}'
expression: '{Template OS Linux:proc.nprocs.last()}>5000'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
@@ -222,3 +297,18 @@ g_template_os_linux:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
priority: warn
description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
+
+ # CPU Utilization #
+ - name: 'CPU idle less than 5% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: average
+ description: 'CPU is less than 5% idle'
+
+ - name: 'CPU idle less than 10% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: average
+ description: 'CPU is less than 10% idle'
+ dependencies:
+ - 'CPU idle less than 5% on {HOST.NAME}'
diff --git a/roles/os_zabbix/vars/template_performance_copilot.yml b/roles/os_zabbix/vars/template_performance_copilot.yml
new file mode 100644
index 000000000..b62fa0228
--- /dev/null
+++ b/roles/os_zabbix/vars/template_performance_copilot.yml
@@ -0,0 +1,14 @@
+---
+g_template_performance_copilot:
+ name: Template Performance Copilot
+ zitems:
+ - key: pcp.ping
+ applications:
+ - Performance Copilot
+ value_type: int
+
+ ztriggers:
+ - name: 'pcp.ping failed on {HOST.NAME}'
+ expression: '{Template Performance Copilot:pcp.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_pcp_ping.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_zagg_server.yml b/roles/os_zabbix/vars/template_zagg_server.yml
new file mode 100644
index 000000000..db5665993
--- /dev/null
+++ b/roles/os_zabbix/vars/template_zagg_server.yml
@@ -0,0 +1,46 @@
+---
+g_template_zagg_server:
+ name: Template Zagg Server
+ zitems:
+ - key: zagg.server.metrics.count
+ applications:
+ - Zagg Server
+ value_type: int
+
+ - key: zagg.server.metrics.errors
+ applications:
+ - Zagg Server
+ value_type: int
+
+ - key: zagg.server.heartbeat.errors
+ applications:
+ - Zagg Server
+ value_type: int
+
+ - key: zagg.server.heartbeat.count
+ applications:
+ - Zagg Server
+ value_type: int
+
+ ztriggers:
+ - name: 'Error processing metrics on {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.metrics.errors.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ priority: average
+
+ - name: 'Error processing heartbeats on {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.heartbeat.errors.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ priority: average
+
+ - name: 'Critically High number of metrics in Zagg queue {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>10000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ priority: high
+
+ - name: 'High number of metrics in Zagg queue {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>5000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ dependencies:
+ - 'Critically High number of metrics in Zagg queue {HOST.NAME}'
+ priority: average