summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml19
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml4
2 files changed, 21 insertions, 2 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 5aae2496a..1824d7881 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -201,6 +201,18 @@ g_template_openshift_master:
applications:
- Openshift Master Metrics
+ - key: openshift.master.nodesnotready.count
+ description: "This check shows how many nodes in a cluster are in NotReady state."
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.nodesnotschedulable.count
+ description: "This check shows how many nodes in a cluster are not schedulable."
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
type: int
@@ -361,6 +373,13 @@ g_template_openshift_master:
- 'Openshift Master API health check is failing on {HOST.NAME}'
priority: high
+ - name: 'Hosts not ready according to {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
zgraphs:
- name: Openshift Master API Server Latency Pods LIST Quantiles
width: 900
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
index ff65ef158..b0488656d 100644
--- a/roles/os_zabbix/vars/template_openshift_node.yml
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -37,8 +37,8 @@ g_template_openshift_node:
url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high
- - name: 'OVS may not be running on {HOST.NAME}'
- expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4'
+ - name: '[HEAL] OVS may not be running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4'
url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high