summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml45
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml26
2 files changed, 47 insertions, 24 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 9d20eb012..e36f23a2b 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -2,16 +2,10 @@
g_template_openshift_master:
name: Template Openshift Master
zitems:
- - name: create_app
- applications:
- - Openshift Master
- key: create_app
-
- - key: openshift.master.registry.healthz
- description: "Shows the health status of the cluster's docker registry"
- type: int
+ - name: openshift.master.app.create
applications:
- Openshift Master
+ key: openshift.master.app.create
- key: openshift.master.process.count
description: Shows number of master processes running
@@ -201,6 +195,18 @@ g_template_openshift_master:
applications:
- Openshift Master Metrics
+ - key: openshift.master.nodesnotready.count
+ description: "This check shows how many nodes in a cluster are in NotReady state."
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.nodesnotschedulable.count
+ description: "This check shows how many nodes in a cluster are not schedulable."
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
type: int
@@ -266,11 +272,6 @@ g_template_openshift_master:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: high
- - name: 'Low number of etcd watchers on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
- priority: avg
-
- name: 'Etcd ping failed on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
@@ -288,14 +289,14 @@ g_template_openshift_master:
# Put triggers that depend on other triggers here (deps must be created first)
- name: 'Application creation has failed on {HOST.NAME}'
- expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
+ expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
dependencies:
- 'Openshift Master process not running on {HOST.NAME}'
priority: avg
- name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
- expression: '{Template Openshift Master:create_app.sum(1h)}>3'
+ expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
dependencies:
- 'Openshift Master process not running on {HOST.NAME}'
@@ -333,13 +334,6 @@ g_template_openshift_master:
- 'Openshift Master process not running on {HOST.NAME}'
priority: avg
- - name: 'Docker Registry check failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
-
- name: 'SkyDNS port not listening on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
@@ -354,6 +348,13 @@ g_template_openshift_master:
- 'Openshift Master API health check is failing on {HOST.NAME}'
priority: high
+ - name: 'Hosts not ready according to {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
zgraphs:
- name: Openshift Master API Server Latency Pods LIST Quantiles
width: 900
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
index 252339366..e6daee8e4 100644
--- a/roles/os_zabbix/vars/template_openshift_node.yml
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -26,7 +26,29 @@ g_template_openshift_node:
applications:
- Openshift Node
+ - key: openshift.node.registry-pods.healthy_pct
+ description: Shows the percentage of healthy registries in the cluster
+ type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.registry.service.ping
+ description: Ping docker-registry service from node
+ type: int
+ applications:
+ - Openshift Node
+
ztriggers:
+ - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100 and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
+ priority: avg
+
+ - name: 'Docker Registry service is unhealthy according to {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1 and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
+ priority: avg
+
- name: 'Openshift Node process not running on {HOST.NAME}'
expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
@@ -37,8 +59,8 @@ g_template_openshift_node:
url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high
- - name: 'OVS may not be running on {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4'
+ - name: '[HEAL] OVS may not be running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4'
url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high