summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r--roles/os_zabbix/tasks/main.yml55
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_agent.yml23
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_server.yml408
-rw-r--r--roles/os_zabbix/vars/template_docker.yml89
-rw-r--r--roles/os_zabbix/vars/template_heartbeat.yml3
-rw-r--r--roles/os_zabbix/vars/template_host.yml27
-rw-r--r--roles/os_zabbix/vars/template_master.yml27
-rw-r--r--roles/os_zabbix/vars/template_node.yml27
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml58
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml44
-rw-r--r--roles/os_zabbix/vars/template_ops_tools.yml23
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml199
-rw-r--r--roles/os_zabbix/vars/template_router.yml27
13 files changed, 843 insertions, 167 deletions
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index 7111c778b..a503b24d7 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -7,10 +7,14 @@
state: list
register: templates
-- debug: var=templates
-
- include_vars: template_heartbeat.yml
- include_vars: template_os_linux.yml
+- include_vars: template_docker.yml
+- include_vars: template_openshift_master.yml
+- include_vars: template_openshift_node.yml
+- include_vars: template_ops_tools.yml
+- include_vars: template_app_zabbix_server.yml
+- include_vars: template_app_zabbix_agent.yml
- name: Include Template Heartbeat
include: ../../lib_zabbix/tasks/create_template.yml
@@ -28,3 +32,50 @@
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+- name: Include Template docker
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_docker }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template Openshift Master
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_openshift_master }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template Openshift Node
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_openshift_node }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template Ops Tools
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_ops_tools }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template App Zabbix Server
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_app_zabbix_server }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+
+- name: Include Template App Zabbix Agent
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_app_zabbix_agent }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
new file mode 100644
index 000000000..06c4eda8b
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
@@ -0,0 +1,23 @@
+---
+g_template_app_zabbix_agent:
+ name: Template App Zabbix Agent
+ zitems:
+ - key: agent.hostname
+ applications:
+ - Zabbix agent
+ value_type: character
+ zabbix_type: '0'
+
+ - key: agent.ping
+ applications:
+ - Zabbix agent
+ description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check.
+ value_type: int
+ zabbix_type: '0'
+
+ ztriggers:
+ - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes'
+ description: Zabbix agent is unreachable for 15 minutes.
+ expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1'
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc
diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml
new file mode 100644
index 000000000..dace2aa29
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml
@@ -0,0 +1,408 @@
+---
+g_template_app_zabbix_server:
+ name: Template App Zabbix Server
+ zitems:
+ - key: housekeeper_creates
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition creates output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: housekeeper_drops
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition drops output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: housekeeper_errors
+ applications:
+ - Zabbix server
+ description: A simple count of the number of errors output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: housekeeper_total
+ applications:
+ - Zabbix server
+ description: A simple count of the total number of lines output by the housekeeper
+ script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: zabbix[process,alerter,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,configuration syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,db watchdog,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,discoverer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,escalator,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,history syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,housekeeper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,http poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,icmp pinger,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,ipmi poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,java poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,node watcher,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,proxy poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,self-monitoring,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,snmp trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,timer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,unreachable poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[queue,10m]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: '5'
+
+ - key: zabbix[queue]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: '5'
+
+ - key: zabbix[rcache,buffer,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,history,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,text,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,trend,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,values]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+ ztriggers:
+ - description: "There has been unexpected output while running the housekeeping script\
+ \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\
+ \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\
+ \ for more details."
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}'
+ name: Unexpected output in Zabbix DB Housekeeping
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc
+
+ - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details.
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0'
+ name: Errors during Zabbix DB Housekeeping
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75'
+ name: Zabbix alerter processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75'
+ name: Zabbix configuration syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75'
+ name: Zabbix db watchdog processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75'
+ name: Zabbix discoverer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75'
+ name: Zabbix escalator processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75'
+ name: Zabbix history syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75'
+ name: Zabbix housekeeper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75'
+ name: Zabbix http poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75'
+ name: Zabbix icmp pinger processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75'
+ name: Zabbix ipmi poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75'
+ name: Zabbix java poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75'
+ name: Zabbix node watcher processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75'
+ name: Zabbix poller processes more than 75% busy
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75'
+ name: Zabbix proxy poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75'
+ name: Zabbix self-monitoring processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75'
+ name: Zabbix snmp trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: Timer processes usually are busy because they have to process time
+ based trigger functions
+ expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75'
+ name: Zabbix timer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75'
+ name: Zabbix trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75'
+ name: Zabbix unreachable poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: "This alert generally indicates a performance problem or a problem\
+ \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\
+ \ is Administration > Queue. Be sure to check the general view and the per-proxy\
+ \ view."
+ expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000'
+ name: More than 1000 items having missing data for more than 10 minutes
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc
+
+ - description: Consider increasing CacheSize in the zabbix_server.conf configuration
+ file
+ expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5'
+ name: Less than 5% free in the configuration cache
+ priority: info
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25'
+ name: Less than 25% free in the history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25'
+ name: Less than 25% free in the text history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25'
+ name: Less than 25% free in the trends cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
new file mode 100644
index 000000000..395e054de
--- /dev/null
+++ b/roles/os_zabbix/vars/template_docker.yml
@@ -0,0 +1,89 @@
+---
+g_template_docker:
+ name: Template Docker
+ zitems:
+ - key: docker.ping
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.storage.is_loopback
+ applications:
+ - Docker Storage
+ value_type: int
+
+ - key: docker.storage.data.space.total
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.used
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.percent_available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.total
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.used
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.percent_available
+ applications:
+ - Docker Storage
+ value_type: float
+ ztriggers:
+ - name: 'docker.ping failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
+ priority: high
+
+ - name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
+ priority: high
+
+ - name: 'Critically low docker storage data space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ priority: high
+
+ - name: 'Critically low docker storage metadata space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ priority: high
+
+ # Put triggers that depend on other triggers here (deps must be created first)
+ - name: 'Low docker storage data space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ dependencies:
+ - 'Critically low docker storage data space on {HOST.NAME}'
+ priority: average
+
+ - name: 'Low docker storage metadata space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ dependencies:
+ - 'Critically low docker storage metadata space on {HOST.NAME}'
+ priority: average
+
diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml
index 3d0f3d51a..8dbe0d0d6 100644
--- a/roles/os_zabbix/vars/template_heartbeat.yml
+++ b/roles/os_zabbix/vars/template_heartbeat.yml
@@ -7,6 +7,7 @@ g_template_heartbeat:
- Heartbeat
key: heartbeat.ping
ztriggers:
- - description: 'Heartbeat.ping has failed on {HOST.NAME}'
+ - name: 'Heartbeat.ping has failed on {HOST.NAME}'
expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1'
priority: avg
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'
diff --git a/roles/os_zabbix/vars/template_host.yml b/roles/os_zabbix/vars/template_host.yml
deleted file mode 100644
index e7cc667cb..000000000
--- a/roles/os_zabbix/vars/template_host.yml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-g_template_host:
- params:
- name: Template Host
- host: Template Host
- groups:
- - groupid: 1 # FIXME (not real)
- output: extend
- search:
- name: Template Host
- zitems:
- - name: Host Ping
- hostid:
- key_: host.ping
- type: 2
- value_type: 0
- output: extend
- search:
- key_: host.ping
- ztriggers:
- - description: 'Host ping has failed on {HOST.NAME}'
- expression: '{Template Host:host.ping.last()}<>0'
- priority: 3
- searchWildcardsEnabled: True
- search:
- description: 'Host ping has failed on*'
- expandExpression: True
diff --git a/roles/os_zabbix/vars/template_master.yml b/roles/os_zabbix/vars/template_master.yml
deleted file mode 100644
index 5f9b41a4f..000000000
--- a/roles/os_zabbix/vars/template_master.yml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-g_template_master:
- params:
- name: Template Master
- host: Template Master
- groups:
- - groupid: 1 # FIXME (not real)
- output: extend
- search:
- name: Template Master
- zitems:
- - name: Master Etcd Ping
- hostid:
- key_: master.etcd.ping
- type: 2
- value_type: 0
- output: extend
- search:
- key_: master.etcd.ping
- ztriggers:
- - description: 'Master Etcd ping has failed on {HOST.NAME}'
- expression: '{Template Master:master.etcd.ping.last()}<>0'
- priority: 3
- searchWildcardsEnabled: True
- search:
- description: 'Master Etcd ping has failed on*'
- expandExpression: True
diff --git a/roles/os_zabbix/vars/template_node.yml b/roles/os_zabbix/vars/template_node.yml
deleted file mode 100644
index 98c343a24..000000000
--- a/roles/os_zabbix/vars/template_node.yml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-g_template_node:
- params:
- name: Template Node
- host: Template Node
- groups:
- - groupid: 1 # FIXME (not real)
- output: extend
- search:
- name: Template Node
- zitems:
- - name: Kubelet Ping
- hostid:
- key_: kubelet.ping
- type: 2
- value_type: 0
- output: extend
- search:
- key_: kubelet.ping
- ztriggers:
- - description: 'Kubelet ping has failed on {HOST.NAME}'
- expression: '{Template Node:kubelet.ping.last()}<>0'
- priority: 3
- searchWildcardsEnabled: True
- search:
- description: 'Kubelet ping has failed on*'
- expandExpression: True
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
new file mode 100644
index 000000000..1de4fefbb
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -0,0 +1,58 @@
+---
+g_template_openshift_master:
+ name: Template Openshift Master
+ zitems:
+ - name: create_app
+ applications:
+ - Openshift Master
+ key: create_app
+
+ - key: openshift.master.process.count
+ description: Shows number of master processes running
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.user.count
+ description: Shows number of users in a cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pod.running.count
+ description: Shows number of pods running
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.project.counter
+ description: Shows number of projects on a cluster
+ type: int
+ applications:
+ - Openshift Master
+
+ ztriggers:
+ - name: 'Application creation has failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ priority: avg
+
+ - name: 'Openshift Master process not running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Too many Openshift Master processes running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Number of users for Openshift Master on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
+
+ - name: 'There are no projects running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.project.counter.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
new file mode 100644
index 000000000..ce28b1048
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -0,0 +1,44 @@
+---
+g_template_openshift_node:
+ name: Template Openshift Node
+ zitems:
+ - key: openshift.node.process.count
+ description: Shows number of OpenShift Node processes running
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.pids.count
+ description: Shows number of ovs process ids running
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.ports.count
+ description: Shows number of OVS ports defined
+ type: int
+ applications:
+ - Openshift Node
+
+ ztriggers:
+ - name: 'Openshift Node process not running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Too many Openshift Node processes running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'OVS may not be running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Number of OVS ports is 0 on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+
diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml
new file mode 100644
index 000000000..d1b8a2514
--- /dev/null
+++ b/roles/os_zabbix/vars/template_ops_tools.yml
@@ -0,0 +1,23 @@
+---
+g_template_ops_tools:
+ name: Template Operations Tools
+ zdiscoveryrules:
+ - name: disc.ops.runner
+ key: disc.ops.runner
+ lifetime: 1
+ description: "Dynamically register operations runner items"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.ops.runner
+ name: "Exit code of ops-runner[{#OSO_COMMAND}]"
+ key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]"
+ value_type: int
+ description: "The exit code of the command run from ops-runner"
+ applications:
+ - Ops Runner
+
+ ztriggerprototypes:
+ - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}'
+ expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 1c9d10bb0..69432273f 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -10,17 +10,20 @@ g_template_os_linux:
- key: kernel.all.cpu.wait.total
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.cpu.irq.hard
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.cpu.idle
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.uname.distro
applications:
@@ -35,7 +38,8 @@ g_template_os_linux:
- key: kernel.all.cpu.irq.soft
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.15_minute
applications:
@@ -45,32 +49,19 @@ g_template_os_linux:
- key: kernel.all.cpu.sys
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.5_minute
applications:
- Kernel
value_type: float
- - key: mem.freemem
- applications:
- - Memory
- value_type: int
-
- key: kernel.all.cpu.nice
applications:
- Kernel
- value_type: int
-
- - key: mem.util.bufmem
- applications:
- - Memory
- value_type: int
-
- - key: swap.used
- applications:
- - Memory
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.1_minute
applications:
@@ -82,92 +73,188 @@ g_template_os_linux:
- Kernel
value_type: string
- - key: swap.length
+ - key: kernel.all.uptime
applications:
- - Memory
+ - Kernel
value_type: int
- - key: mem.physmem
+ - key: kernel.all.cpu.user
applications:
- - Memory
- value_type: int
+ - Kernel
+ value_type: float
+ units: '%'
- - key: kernel.all.uptime
+ - key: kernel.uname.machine
applications:
- Kernel
- value_type: int
+ value_type: string
- - key: swap.free
+ - key: hinv.ncpu
applications:
- - Memory
+ - Kernel
value_type: int
- - key: mem.util.used
+ - key: kernel.all.cpu.steal
applications:
- - Memory
- value_type: int
+ - Kernel
+ value_type: float
+ units: '%'
- - key: kernel.all.cpu.user
+ - key: kernel.all.pswitch
applications:
- Kernel
value_type: int
- - key: kernel.uname.machine
+ - key: kernel.uname.release
applications:
- Kernel
value_type: string
- - key: hinv.ncpu
+ - key: proc.nprocs
applications:
- Kernel
value_type: int
- - key: mem.util.cached
+ # Memory Items
+ - key: mem.freemem
applications:
- Memory
value_type: int
+ description: "PCP: free system memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.all.cpu.steal
+ - key: mem.util.bufmem
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.all.pswitch
+ - key: swap.used
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: swap used metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.uname.release
+ - key: swap.length
applications:
- - Kernel
- value_type: string
+ - Memory
+ value_type: int
+ description: "PCP: total swap available metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: proc.nprocs
+ - key: mem.physmem
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
+ multiplier: 1024
+ units: B
- - key: filesys.avail
+ - key: swap.free
applications:
- - Disk
+ - Memory
value_type: int
+ description: "PCP: swap free metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: filesys.capacity
+ - key: mem.util.available
applications:
- - Disk
+ - Memory
value_type: int
+ description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: filesys.free
+ - key: mem.util.used
applications:
- - Disk
+ - Memory
value_type: int
+ description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: filesys.full
+ - key: mem.util.cached
applications:
- - Disk
+ - Memory
+ value_type: int
+ description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ zdiscoveryrules:
+ - name: disc.filesys
+ key: disc.filesys
+ lifetime: 1
+ description: "Dynamically register the filesystems"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.filesys
+ name: "disc.filesys.full.{#OSO_FILESYS}"
+ key: "disc.filesys.full[{#OSO_FILESYS}]"
value_type: float
-
- - key: filesys.used
+ description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
applications:
- Disk
+
+ - discoveryrule_key: disc.filesys
+ name: "Percentage of used inodes on {#OSO_FILESYS}"
+ key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
value_type: float
+ description: "PCP derived value of percentage of used inodes on a filesystem."
+ applications:
+ - Disk
+
+ ztriggerprototypes:
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
+ ztriggers:
+ - name: 'Too many TOTAL processes on {HOST.NAME}'
+ expression: '{Template OS Linux:proc.nprocs.last()}>5000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
+ priority: warn
+
+ - name: 'Lack of available memory on {HOST.NAME}'
+ expression: '{Template OS Linux:mem.freemem.last()}<30720000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
+ priority: warn
+ description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
+
+ # CPU Utilization #
+ - name: 'CPU idle less than 5% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: high
+ description: 'CPU is less than 5% idle'
+
+ - name: 'CPU idle less than 10% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: warn
+ description: 'CPU is less than 10% idle'
+ dependencies:
+ - 'CPU idle less than 5% on {HOST.NAME}'
diff --git a/roles/os_zabbix/vars/template_router.yml b/roles/os_zabbix/vars/template_router.yml
deleted file mode 100644
index 4dae7da1e..000000000
--- a/roles/os_zabbix/vars/template_router.yml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-g_template_router:
- params:
- name: Template Router
- host: Template Router
- groups:
- - groupid: 1 # FIXME (not real)
- output: extend
- search:
- name: Template Router
- zitems:
- - name: Router Backends down
- hostid:
- key_: router.backends.down
- type: 2
- value_type: 0
- output: extend
- search:
- key_: router.backends.down
- ztriggers:
- - description: 'Number of router backends down on {HOST.NAME}'
- expression: '{Template Router:router.backends.down.last()}<>0'
- priority: 3
- searchWildcardsEnabled: True
- search:
- description: 'Number of router backends down on {HOST.NAME}'
- expandExpression: True