summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml25
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml31
2 files changed, 30 insertions, 26 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 1824d7881..e36f23a2b 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -7,12 +7,6 @@ g_template_openshift_master:
- Openshift Master
key: openshift.master.app.create
- - key: openshift.master.registry.healthy_pct
- description: "Shows the percentage of healthy registries in the cluster"
- type: int
- applications:
- - Openshift Master
-
- key: openshift.master.process.count
description: Shows number of master processes running
type: int
@@ -278,11 +272,6 @@ g_template_openshift_master:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
priority: high
- - name: 'Low number of etcd watchers on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
- priority: avg
-
- name: 'Etcd ping failed on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
@@ -345,20 +334,6 @@ g_template_openshift_master:
- 'Openshift Master process not running on {HOST.NAME}'
priority: avg
- - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.registry.healthy_pct.last(#2)}<100 and {Template Openshift Master:openshift.master.registry.healthy_pct.max(#2)}>50'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: avg
-
- - name: 'Multiple Docker Registries are unhealthy according to {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.registry.healthy_pct.last(#2)}<51'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
-
- name: 'SkyDNS port not listening on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
index c36c593df..e6daee8e4 100644
--- a/roles/os_zabbix/vars/template_openshift_node.yml
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -69,4 +69,33 @@ g_template_openshift_node:
url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
priority: high
-
+ zactions:
+ - name: '[HEAL] OVS may not be running on {HOST.NAME}'
+ status: disabled
+ escalation_time: 60
+ conditions_filter:
+ calculation_type: "and/or"
+ conditions:
+ - conditiontype: maintenance status
+ operator: not in
+ - conditiontype: trigger name
+ operator: like
+ value: "[HEAL] OVS may not be running on"
+ - conditiontype: trigger value
+ operator: "="
+ value: PROBLEM
+ operations:
+ - esc_step_from: 1
+ esc_step_to: 1
+ esc_period: 0
+ operationtype: remote command
+ opcommand:
+ command: 'ssh -i /etc/openshift_tools/scriptrunner_id_rsa {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }} remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\" --trigger-val \"{TRIGGER.VALUE}\"'
+ execute_on: "zabbix server"
+ type: 'custom script'
+ target_hosts:
+ - target_type: 'zabbix server'
+ opconditions:
+ - conditiontype: 'event acknowledged'
+ operator: '='
+ value: 'not acknowledged'