summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml82
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml19
-rw-r--r--roles/os_zabbix/vars/template_performance_copilot.yml14
3 files changed, 112 insertions, 3 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 1de4fefbb..6defc4989 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -31,6 +31,78 @@ g_template_openshift_master:
applications:
- Openshift Master
+ - key: openshift.master.etcd.create.success
+ description: Show number of successful create actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.create.fail
+ description: Show number of failed create actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.delete.success
+ description: Show number of successful delete actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.delete.fail
+ description: Show number of failed delete actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.get.success
+ description: Show number of successful get actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.get.fail
+ description: Show number of failed get actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.set.success
+ description: Show number of successful set actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.set.fail
+ description: Show number of failed set actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.update.success
+ description: Show number of successful update actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.update.fail
+ description: Show number of failed update actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.watchers
+ description: Show number of etcd watchers
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.ping
+ description: etcd ping
+ type: int
+ applications:
+ - Openshift Etcd
+
ztriggers:
- name: 'Application creation has failed on {HOST.NAME}'
expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
@@ -56,3 +128,13 @@ g_template_openshift_master:
expression: '{Template Openshift Master:openshift.project.counter.last()}=0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: info
+
+ - name: 'Low number of etcd watchers on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: avg
+
+ - name: 'Etcd ping failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: high
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 3ae1500bc..68de2dde5 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -194,6 +194,11 @@ g_template_os_linux:
lifetime: 1
description: "Dynamically register the filesystems"
+ - name: disc.disk
+ key: disc.disk
+ lifetime: 1
+ description: "Dynamically register disks on a node"
+
zitemprototypes:
- discoveryrule_key: disc.filesys
name: "disc.filesys.full.{#OSO_FILESYS}"
@@ -211,6 +216,14 @@ g_template_os_linux:
applications:
- Disk
+ - discoveryrule_key: disc.disk
+ name: "TPS (IOPS) for disk {#OSO_DISK}"
+ key: "disc.disk.tps[{#OSO_FILESYS}]"
+ value_type: int
+ description: "PCP disk.dev.totals metric measured over a period of time. This shows how many disk transactions per second the disk is using"
+ applications:
+ - Disk
+
ztriggerprototypes:
- name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
@@ -246,15 +259,15 @@ g_template_os_linux:
# CPU Utilization #
- name: 'CPU idle less than 5% on {HOST.NAME}'
- expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
priority: average
description: 'CPU is less than 5% idle'
- name: 'CPU idle less than 10% on {HOST.NAME}'
- expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
- priority: warn
+ priority: average
description: 'CPU is less than 10% idle'
dependencies:
- 'CPU idle less than 5% on {HOST.NAME}'
diff --git a/roles/os_zabbix/vars/template_performance_copilot.yml b/roles/os_zabbix/vars/template_performance_copilot.yml
new file mode 100644
index 000000000..b62fa0228
--- /dev/null
+++ b/roles/os_zabbix/vars/template_performance_copilot.yml
@@ -0,0 +1,14 @@
+---
+g_template_performance_copilot:
+ name: Template Performance Copilot
+ zitems:
+ - key: pcp.ping
+ applications:
+ - Performance Copilot
+ value_type: int
+
+ ztriggers:
+ - name: 'pcp.ping failed on {HOST.NAME}'
+ expression: '{Template Performance Copilot:pcp.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_pcp_ping.asciidoc'
+ priority: average