diff options
Diffstat (limited to 'roles/os_zabbix/vars/template_openshift_master.yml')
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 45 |
1 files changed, 23 insertions, 22 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index 9d20eb012..e36f23a2b 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -2,16 +2,10 @@ g_template_openshift_master: name: Template Openshift Master zitems: - - name: create_app - applications: - - Openshift Master - key: create_app - - - key: openshift.master.registry.healthz - description: "Shows the health status of the cluster's docker registry" - type: int + - name: openshift.master.app.create applications: - Openshift Master + key: openshift.master.app.create - key: openshift.master.process.count description: Shows number of master processes running @@ -201,6 +195,18 @@ g_template_openshift_master: applications: - Openshift Master Metrics + - key: openshift.master.nodesnotready.count + description: "This check shows how many nodes in a cluster are in NotReady state." + type: int + applications: + - Openshift Master + + - key: openshift.master.nodesnotschedulable.count + description: "This check shows how many nodes in a cluster are not schedulable." + type: int + applications: + - Openshift Master + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5 description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." type: int @@ -266,11 +272,6 @@ g_template_openshift_master: url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' priority: high - - name: 'Low number of etcd watchers on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' - priority: avg - - name: 'Etcd ping failed on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' @@ -288,14 +289,14 @@ g_template_openshift_master: # Put triggers that depend on other triggers here (deps must be created first) - name: 'Application creation has failed on {HOST.NAME}' - expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' + expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' dependencies: - 'Openshift Master process not running on {HOST.NAME}' priority: avg - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}' - expression: '{Template Openshift Master:create_app.sum(1h)}>3' + expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' dependencies: - 'Openshift Master process not running on {HOST.NAME}' @@ -333,13 +334,6 @@ g_template_openshift_master: - 'Openshift Master process not running on {HOST.NAME}' priority: avg - - name: 'Docker Registry check failed on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: high - - name: 'SkyDNS port not listening on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' @@ -354,6 +348,13 @@ g_template_openshift_master: - 'Openshift Master API health check is failing on {HOST.NAME}' priority: high + - name: 'Hosts not ready according to {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + zgraphs: - name: Openshift Master API Server Latency Pods LIST Quantiles width: 900 |