summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_server.yml408
-rw-r--r--roles/os_zabbix/vars/template_ops_tools.yml23
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml52
3 files changed, 473 insertions, 10 deletions
diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml
new file mode 100644
index 000000000..dace2aa29
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml
@@ -0,0 +1,408 @@
+---
+g_template_app_zabbix_server:
+ name: Template App Zabbix Server
+ zitems:
+ - key: housekeeper_creates
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition creates output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: housekeeper_drops
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition drops output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: housekeeper_errors
+ applications:
+ - Zabbix server
+ description: A simple count of the number of errors output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: housekeeper_total
+ applications:
+ - Zabbix server
+ description: A simple count of the total number of lines output by the housekeeper
+ script.
+ units: ''
+ value_type: int
+ zabbix_type: '2'
+
+ - key: zabbix[process,alerter,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,configuration syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,db watchdog,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,discoverer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,escalator,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,history syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,housekeeper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,http poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,icmp pinger,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,ipmi poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,java poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,node watcher,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,proxy poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,self-monitoring,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,snmp trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,timer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[process,unreachable poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[queue,10m]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: '5'
+
+ - key: zabbix[queue]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: '5'
+
+ - key: zabbix[rcache,buffer,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,history,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,text,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,trend,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+
+ - key: zabbix[wcache,values]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: '5'
+ ztriggers:
+ - description: "There has been unexpected output while running the housekeeping script\
+ \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\
+ \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\
+ \ for more details."
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}'
+ name: Unexpected output in Zabbix DB Housekeeping
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc
+
+ - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details.
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0'
+ name: Errors during Zabbix DB Housekeeping
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75'
+ name: Zabbix alerter processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75'
+ name: Zabbix configuration syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75'
+ name: Zabbix db watchdog processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75'
+ name: Zabbix discoverer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75'
+ name: Zabbix escalator processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75'
+ name: Zabbix history syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75'
+ name: Zabbix housekeeper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75'
+ name: Zabbix http poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75'
+ name: Zabbix icmp pinger processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75'
+ name: Zabbix ipmi poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75'
+ name: Zabbix java poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75'
+ name: Zabbix node watcher processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75'
+ name: Zabbix poller processes more than 75% busy
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75'
+ name: Zabbix proxy poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75'
+ name: Zabbix self-monitoring processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75'
+ name: Zabbix snmp trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: Timer processes usually are busy because they have to process time
+ based trigger functions
+ expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75'
+ name: Zabbix timer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75'
+ name: Zabbix trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75'
+ name: Zabbix unreachable poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: "This alert generally indicates a performance problem or a problem\
+ \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\
+ \ is Administration > Queue. Be sure to check the general view and the per-proxy\
+ \ view."
+ expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000'
+ name: More than 1000 items having missing data for more than 10 minutes
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc
+
+ - description: Consider increasing CacheSize in the zabbix_server.conf configuration
+ file
+ expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5'
+ name: Less than 5% free in the configuration cache
+ priority: info
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25'
+ name: Less than 25% free in the history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25'
+ name: Less than 25% free in the text history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25'
+ name: Less than 25% free in the trends cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml
new file mode 100644
index 000000000..d1b8a2514
--- /dev/null
+++ b/roles/os_zabbix/vars/template_ops_tools.yml
@@ -0,0 +1,23 @@
+---
+g_template_ops_tools:
+ name: Template Operations Tools
+ zdiscoveryrules:
+ - name: disc.ops.runner
+ key: disc.ops.runner
+ lifetime: 1
+ description: "Dynamically register operations runner items"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.ops.runner
+ name: "Exit code of ops-runner[{#OSO_COMMAND}]"
+ key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]"
+ value_type: int
+ description: "The exit code of the command run from ops-runner"
+ applications:
+ - Ops Runner
+
+ ztriggerprototypes:
+ - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}'
+ expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 84a7740b0..70c3809bd 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -10,17 +10,20 @@ g_template_os_linux:
- key: kernel.all.cpu.wait.total
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.cpu.irq.hard
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.cpu.idle
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.uname.distro
applications:
@@ -35,7 +38,8 @@ g_template_os_linux:
- key: kernel.all.cpu.irq.soft
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.15_minute
applications:
@@ -45,7 +49,8 @@ g_template_os_linux:
- key: kernel.all.cpu.sys
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.5_minute
applications:
@@ -55,7 +60,8 @@ g_template_os_linux:
- key: kernel.all.cpu.nice
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.load.1_minute
applications:
@@ -75,7 +81,8 @@ g_template_os_linux:
- key: kernel.all.cpu.user
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.uname.machine
applications:
@@ -90,7 +97,8 @@ g_template_os_linux:
- key: kernel.all.cpu.steal
applications:
- Kernel
- value_type: int
+ value_type: float
+ units: '%'
- key: kernel.all.pswitch
applications:
@@ -196,12 +204,10 @@ g_template_os_linux:
- name: disc.filesys
key: disc.filesys
lifetime: 1
- template_name: Template OS Linux
description: "Dynamically register the filesystems"
zitemprototypes:
- discoveryrule_key: disc.filesys
- template_name: Template OS Linux
name: "disc.filesys.full.{#OSO_FILESYS}"
key: "disc.filesys.full[{#OSO_FILESYS}]"
value_type: float
@@ -209,6 +215,17 @@ g_template_os_linux:
applications:
- Disk
+ ztriggerprototypes:
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>95'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
ztriggers:
- name: 'Filesystem: / has less than 10% free on {HOST.NAME}'
expression: '{Template OS Linux:filesys.full.xvda2.last()}>90'
@@ -240,3 +257,18 @@ g_template_os_linux:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
priority: warn
description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
+
+ # CPU Utilization #
+ - name: 'CPU idle less than 5% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: high
+ description: 'CPU is less than 5% idle'
+
+ - name: 'CPU idle less than 10% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: warn
+ description: 'CPU is less than 10% idle'
+ dependencies:
+ - 'CPU idle less than 5% on {HOST.NAME}'