diff options
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r-- | roles/os_zabbix/vars/template_app_zabbix_agent.yml | 23 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_app_zabbix_server.yml | 408 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 18 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_node.yml | 28 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_os_linux.yml | 48 |
5 files changed, 492 insertions, 33 deletions
diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml new file mode 100644 index 000000000..06c4eda8b --- /dev/null +++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml @@ -0,0 +1,23 @@ +--- +g_template_app_zabbix_agent: + name: Template App Zabbix Agent + zitems: + - key: agent.hostname + applications: + - Zabbix agent + value_type: character + zabbix_type: '0' + + - key: agent.ping + applications: + - Zabbix agent + description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check. + value_type: int + zabbix_type: '0' + + ztriggers: + - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes' + description: Zabbix agent is unreachable for 15 minutes. + expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1' + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml new file mode 100644 index 000000000..dace2aa29 --- /dev/null +++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml @@ -0,0 +1,408 @@ +--- +g_template_app_zabbix_server: + name: Template App Zabbix Server + zitems: + - key: housekeeper_creates + applications: + - Zabbix server + description: A simple count of the number of partition creates output by the housekeeper script. + units: '' + value_type: int + zabbix_type: '2' + + - key: housekeeper_drops + applications: + - Zabbix server + description: A simple count of the number of partition drops output by the housekeeper script. + units: '' + value_type: int + zabbix_type: '2' + + - key: housekeeper_errors + applications: + - Zabbix server + description: A simple count of the number of errors output by the housekeeper script. + units: '' + value_type: int + zabbix_type: '2' + + - key: housekeeper_total + applications: + - Zabbix server + description: A simple count of the total number of lines output by the housekeeper + script. + units: '' + value_type: int + zabbix_type: '2' + + - key: zabbix[process,alerter,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,configuration syncer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,db watchdog,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,discoverer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,escalator,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,history syncer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,housekeeper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,http poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,icmp pinger,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,ipmi poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,java poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,node watcher,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,proxy poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,self-monitoring,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,snmp trapper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,timer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,trapper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[process,unreachable poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: '5' + + - key: zabbix[queue,10m] + applications: + - Zabbix server + description: '' + units: '' + value_type: int + zabbix_type: '5' + + - key: zabbix[queue] + applications: + - Zabbix server + description: '' + units: '' + value_type: int + zabbix_type: '5' + + - key: zabbix[rcache,buffer,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,history,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,text,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,trend,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + + - key: zabbix[wcache,values] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: '5' + ztriggers: + - description: "There has been unexpected output while running the housekeeping script\ + \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\ + \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\ + \ for more details." + expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}' + name: Unexpected output in Zabbix DB Housekeeping + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc + + - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details. + expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0' + name: Errors during Zabbix DB Housekeeping + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75' + name: Zabbix alerter processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75' + name: Zabbix configuration syncer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75' + name: Zabbix db watchdog processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75' + name: Zabbix discoverer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75' + name: Zabbix escalator processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75' + name: Zabbix history syncer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75' + name: Zabbix housekeeper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75' + name: Zabbix http poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75' + name: Zabbix icmp pinger processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75' + name: Zabbix ipmi poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75' + name: Zabbix java poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75' + name: Zabbix node watcher processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75' + name: Zabbix poller processes more than 75% busy + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75' + name: Zabbix proxy poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75' + name: Zabbix self-monitoring processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75' + name: Zabbix snmp trapper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: Timer processes usually are busy because they have to process time + based trigger functions + expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75' + name: Zabbix timer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75' + name: Zabbix trapper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75' + name: Zabbix unreachable poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: "This alert generally indicates a performance problem or a problem\ + \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\ + \ is Administration > Queue. Be sure to check the general view and the per-proxy\ + \ view." + expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000' + name: More than 1000 items having missing data for more than 10 minutes + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc + + - description: Consider increasing CacheSize in the zabbix_server.conf configuration + file + expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5' + name: Less than 5% free in the configuration cache + priority: info + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25' + name: Less than 25% free in the history cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25' + name: Less than 25% free in the text history cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25' + name: Less than 25% free in the trends cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index c71e07910..4ae918ec6 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -13,6 +13,18 @@ g_template_openshift_master: applications: - Openshift Master + - key: openshift.master.user.count + description: Shows number of users in a cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pod.running.count + description: Shows number of pods running + type: int + applications: + - Openshift Master + ztriggers: - name: 'Application creation has failed on {HOST.NAME}' expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' @@ -28,3 +40,9 @@ g_template_openshift_master: expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' priority: high + + - name: 'Number of users for Openshift Master on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.user.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info + diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml index 36f9cc4a3..ce28b1048 100644 --- a/roles/os_zabbix/vars/template_openshift_node.yml +++ b/roles/os_zabbix/vars/template_openshift_node.yml @@ -8,13 +8,37 @@ g_template_openshift_node: applications: - Openshift Node + - key: openshift.node.ovs.pids.count + description: Shows number of ovs process ids running + type: int + applications: + - Openshift Node + + - key: openshift.node.ovs.ports.count + description: Shows number of OVS ports defined + type: int + applications: + - Openshift Node + ztriggers: - name: 'Openshift Node process not running on {HOST.NAME}' expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' priority: high - name: 'Too many Openshift Node processes running on {HOST.NAME}' expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + - name: 'OVS may not be running on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' priority: high + + - name: 'Number of OVS ports is 0 on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml index 70c3809bd..69432273f 100644 --- a/roles/os_zabbix/vars/template_os_linux.yml +++ b/roles/os_zabbix/vars/template_os_linux.yml @@ -188,18 +188,6 @@ g_template_os_linux: multiplier: 1024 units: B - # Disk items - - key: filesys.full.xvda2 - applications: - - Disk - value_type: float - - - key: filesys.full.xvda3 - applications: - - Disk - value_type: float - - zdiscoveryrules: - name: disc.filesys key: disc.filesys @@ -215,38 +203,36 @@ g_template_os_linux: applications: - Disk - ztriggerprototypes: - - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: warn - - - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>95' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: high + - discoveryrule_key: disc.filesys + name: "Percentage of used inodes on {#OSO_FILESYS}" + key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]" + value_type: float + description: "PCP derived value of percentage of used inodes on a filesystem." + applications: + - Disk - ztriggers: - - name: 'Filesystem: / has less than 10% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda2.last()}>90' + ztriggerprototypes: + - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn - - name: 'Filesystem: / has less than 5% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda2.last()}>95' + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high - - name: 'Filesystem: /var has less than 10% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda3.last()}>90' + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn - - name: 'Filesystem: /var has less than 5% free on {HOST.NAME}' - expression: '{Template OS Linux:filesys.full.xvda3.last()}>95' + - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high + ztriggers: - name: 'Too many TOTAL processes on {HOST.NAME}' expression: '{Template OS Linux:proc.nprocs.last()}>5000' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc' |