blob: e6daee8e472a7e1570fd08591a578d722619b968 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
---
# Zabbix template definition for OpenShift nodes: the items that are
# collected, the triggers evaluated on those items, and one self-heal action.
# NOTE(review): the block nesting below was reconstructed from a copy of this
# file whose indentation had been lost; confirm it against the consumer of
# this variable before relying on it.
g_template_openshift_node:
  name: Template Openshift Node

  # Items. Every item is an integer and belongs to the "Openshift Node"
  # application.
  zitems:
    - key: openshift.node.process.count
      description: Shows number of OpenShift Node processes running
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.ovs.pids.count
      description: Shows number of ovs process ids running
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.ovs.ports.count
      description: Shows number of OVS ports defined
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.ovs.stray.rules
      description: Number of OVS stray rules found/removed
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.registry-pods.healthy_pct
      description: Shows the percentage of healthy registries in the cluster
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.registry.service.ping
      description: Ping docker-registry service from node
      type: int
      applications:
        - Openshift Node

  # Triggers. Each expression references an item of this template by
  # "<template name>:<item key>", so the template name inside the expressions
  # must stay in sync with `name` above.
  ztriggers:
    # Registry health: the two most recent values are both below 100.
    - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100 and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
      priority: avg

    # Registry service ping: the two most recent values are both below 1.
    - name: 'Docker Registry service is unhealthy according to {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1 and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
      priority: avg

    # NOTE(review): the four node triggers below link to the `node` branch of
    # ops-sop, while the registry triggers above link to `master` -- confirm
    # that the branch name in these URLs is intentional.

    # Node process missing: the maximum of the last three values is below 1.
    - name: 'Openshift Node process not running on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

    # Duplicate node processes: the minimum of the last three values is
    # above 1.
    - name: 'Too many Openshift Node processes running on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

    # OVS process count: the two most recent values both differ from 4.
    # The name of this trigger is what the self-heal action below matches on.
    - name: '[HEAL] OVS may not be running on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

    # OVS ports: the most recent value is exactly 0.
    - name: 'Number of OVS ports is 0 on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

  # Actions. The single action here is defined with status "disabled".
  zactions:
    - name: '[HEAL] OVS may not be running on {HOST.NAME}'
      status: disabled
      escalation_time: 60

      # Conditions: not in maintenance, trigger name contains the [HEAL] OVS
      # prefix, and the trigger value is PROBLEM.
      conditions_filter:
        calculation_type: "and/or"
        conditions:
          - conditiontype: maintenance status
            operator: not in
          - conditiontype: trigger name
            operator: like
            value: "[HEAL] OVS may not be running on"
          - conditiontype: trigger value
            operator: "="
            value: PROBLEM

      operations:
        - esc_step_from: 1
          esc_step_to: 1
          esc_period: 0
          operationtype: remote command
          opcommand:
            # Single-quoted YAML scalar: the backslashes are kept literally,
            # so the command text contains \" sequences around each Zabbix
            # macro. Presumably this lets the quotes survive the local shell
            # and reach the remote side of the ssh call -- confirm.
            # The {{ ... }} placeholders are filled in by the templating
            # layer before this value is used.
            command: 'ssh -i /etc/openshift_tools/scriptrunner_id_rsa {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }} remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\" --trigger-val \"{TRIGGER.VALUE}\"'
            execute_on: "zabbix server"
            type: 'custom script'
          # NOTE(review): `target_hosts` is placed at operation level (next
          # to `opcommand`), mirroring how the Zabbix API keeps command
          # targets on the operation object. Confirm against the action
          # module; it may instead belong inside `opcommand`.
          target_hosts:
            - target_type: 'zabbix server'
          # Run this step only while the event has not been acknowledged.
          opconditions:
            - conditiontype: 'event acknowledged'
              operator: '='
              value: 'not acknowledged'
|