summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r--roles/os_zabbix/tasks/main.yml9
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml82
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml6
-rw-r--r--roles/os_zabbix/vars/template_performance_copilot.yml14
4 files changed, 108 insertions, 3 deletions
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index a503b24d7..82bf78b57 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -15,6 +15,7 @@
- include_vars: template_ops_tools.yml
- include_vars: template_app_zabbix_server.yml
- include_vars: template_app_zabbix_agent.yml
+- include_vars: template_performance_copilot.yml
- name: Include Template Heartbeat
include: ../../lib_zabbix/tasks/create_template.yml
@@ -79,3 +80,11 @@
server: "{{ ozb_server }}"
user: "{{ ozb_user }}"
password: "{{ ozb_password }}"
+
+- name: Include Template Performance Copilot
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_performance_copilot }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 1de4fefbb..6defc4989 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -31,6 +31,78 @@ g_template_openshift_master:
applications:
- Openshift Master
+ - key: openshift.master.etcd.create.success
+ description: Show number of successful create actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.create.fail
+ description: Show number of failed create actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.delete.success
+ description: Show number of successful delete actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.delete.fail
+ description: Show number of failed delete actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.get.success
+ description: Show number of successful get actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.get.fail
+ description: Show number of failed get actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.set.success
+ description: Show number of successful set actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.set.fail
+ description: Show number of failed set actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.update.success
+ description: Show number of successful update actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.update.fail
+ description: Show number of failed update actions
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.watchers
+ description: Show number of etcd watchers
+ type: int
+ applications:
+ - Openshift Etcd
+
+ - key: openshift.master.etcd.ping
+ description: etcd ping
+ type: int
+ applications:
+ - Openshift Etcd
+
ztriggers:
- name: 'Application creation has failed on {HOST.NAME}'
expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
@@ -56,3 +128,13 @@ g_template_openshift_master:
expression: '{Template Openshift Master:openshift.project.counter.last()}=0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: info
+
+ - name: 'Low number of etcd watchers on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: avg
+
+ - name: 'Etcd ping failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+ priority: high
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 3ae1500bc..aeeec4b8d 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -246,15 +246,15 @@ g_template_os_linux:
# CPU Utilization #
- name: 'CPU idle less than 5% on {HOST.NAME}'
- expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
priority: average
description: 'CPU is less than 5% idle'
- name: 'CPU idle less than 10% on {HOST.NAME}'
- expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
- priority: warn
+ priority: average
description: 'CPU is less than 10% idle'
dependencies:
- 'CPU idle less than 5% on {HOST.NAME}'
diff --git a/roles/os_zabbix/vars/template_performance_copilot.yml b/roles/os_zabbix/vars/template_performance_copilot.yml
new file mode 100644
index 000000000..b62fa0228
--- /dev/null
+++ b/roles/os_zabbix/vars/template_performance_copilot.yml
@@ -0,0 +1,14 @@
+---
+g_template_performance_copilot:
+ name: Template Performance Copilot
+ zitems:
+ - key: pcp.ping
+ applications:
+ - Performance Copilot
+ value_type: int
+
+ ztriggers:
+ - name: 'pcp.ping failed on {HOST.NAME}'
+ expression: '{Template Performance Copilot:pcp.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_pcp_ping.asciidoc'
+ priority: average