summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars/template_os_linux.yml
blob: 778e4341df607658bc93a14100f4f8dddeb3e05a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
---
# Definition of the "Template OS Linux" monitoring template.
# The sub-keys of this variable list the template's items (zitems), discovery
# rules (zdiscoveryrules), item prototypes (zitemprototypes), trigger
# prototypes (ztriggerprototypes) and triggers (ztriggers).
g_template_os_linux:
  # Template name; the trigger expressions in this file refer to the template
  # by this exact name ("{Template OS Linux:<item key>...}").
  name: Template OS Linux
  zitems:
  # Items filed under the "Kernel" application: uname strings, CPU time
  # shares (units '%'), load averages, uptime, CPU count, context switches
  # and process count.  List order is kept as-is.
  - key: kernel.uname.sysname
    value_type: string
    applications: [Kernel]

  - key: kernel.all.cpu.wait.total
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.all.cpu.irq.hard
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.all.cpu.idle
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.uname.distro
    value_type: string
    applications: [Kernel]

  - key: kernel.uname.nodename
    value_type: string
    applications: [Kernel]

  - key: kernel.all.cpu.irq.soft
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.all.load.15_minute
    value_type: float
    applications: [Kernel]

  - key: kernel.all.cpu.sys
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.all.load.5_minute
    value_type: float
    applications: [Kernel]

  - key: kernel.all.cpu.nice
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.all.load.1_minute
    value_type: float
    applications: [Kernel]

  - key: kernel.uname.version
    value_type: string
    applications: [Kernel]

  - key: kernel.all.uptime
    value_type: int
    applications: [Kernel]

  - key: kernel.all.cpu.user
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.uname.machine
    value_type: string
    applications: [Kernel]

  - key: hinv.ncpu
    value_type: int
    applications: [Kernel]

  - key: kernel.all.cpu.steal
    value_type: float
    units: '%'
    applications: [Kernel]

  - key: kernel.all.pswitch
    value_type: int
    applications: [Kernel]

  - key: kernel.uname.release
    value_type: string
    applications: [Kernel]

  # Used by the "Too many TOTAL processes" trigger.
  - key: proc.nprocs
    value_type: int
    applications: [Kernel]

  # Memory and swap items (application "Memory").
  # Every item here is an integer with multiplier 1024 and units B; the
  # source values are presumably reported in KB -- confirm against the
  # collector.
  - key: mem.freemem
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: free system memory metric from /proc/meminfo"

  - key: mem.util.bufmem
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"

  - key: swap.used
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: swap used metric from /proc/meminfo"

  - key: swap.length
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: total swap available metric from /proc/meminfo"

  - key: mem.physmem
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."

  - key: swap.free
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: swap free metric from /proc/meminfo"

  - key: mem.util.available
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"

  - key: mem.util.used
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"

  - key: mem.util.cached
    value_type: int
    units: B
    multiplier: 1024
    applications: [Memory]
    description: "PCP: Memory used by the page cache, including buffered file data.  This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"

  zdiscoveryrules:
  # Discovery rules for filesystems, disks and network interfaces.  The item
  # prototypes in this file attach to a rule through discoveryrule_key, which
  # matches the `key` given here.
  # NOTE(review): the unit of `lifetime` is not stated in this file
  # (presumably days) -- confirm against the consuming module.
  - key: disc.filesys
    name: disc.filesys
    lifetime: 1
    description: 'Dynamically register the filesystems'

  - key: disc.disk
    name: disc.disk
    lifetime: 1
    description: 'Dynamically register disks on a node'

  - key: disc.network
    name: disc.network
    lifetime: 1
    description: 'Dynamically register network interfaces on a node'

  zitemprototypes:
  # Item prototypes, one item per entity reported by the discovery rule named
  # in discoveryrule_key.  The {#OSO_*} placeholders in name/key stand for the
  # discovered filesystem, disk or network interface.

  # --- Filesystems (rule: disc.filesys) ---
  # Percent-full value; used by the "free disk space" trigger prototypes.
  - discoveryrule_key: disc.filesys
    name: "disc.filesys.full.{#OSO_FILESYS}"
    key: "disc.filesys.full[{#OSO_FILESYS}]"
    value_type: float
    description: "PCP filesys.full option.  This is the percent full returned from pcp filesys.full"
    applications:
    - Disk

  # Percent of inodes in use; used by the "free inodes" trigger prototypes.
  - discoveryrule_key: disc.filesys
    name: "Percentage of used inodes on {#OSO_FILESYS}"
    key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
    value_type: float
    description: "PCP derived value of percentage of used inodes on a filesystem."
    applications:
    - Disk

  # --- Disks (rule: disc.disk) ---
  - discoveryrule_key: disc.disk
    name: "TPS (IOPS) for disk {#OSO_DISK}"
    key: "disc.disk.tps[{#OSO_DISK}]"
    value_type: int
    description: "PCP disk.dev.totals metric measured over a period of time.  This shows how many disk transactions per second the disk is using"
    applications:
    - Disk

  - discoveryrule_key: disc.disk
    name: "Percent Utilized for disk {#OSO_DISK}"
    key: "disc.disk.putil[{#OSO_DISK}]"
    value_type: float
    description: "PCP disk.dev.avactive metric measured over a period of time.  This is the '%util' in the iostat command"
    applications:
    - Disk

  # --- Network interfaces (rule: disc.network) ---
  # Both byte counters are stored with delta: 1, which the descriptions
  # explain as "speed per second".  The stored value is therefore a rate, so
  # the unit label is Bps (bytes per second) rather than plain B, matching
  # the item names.
  - discoveryrule_key: disc.network
    name: "Bytes per second IN on network interface {#OSO_NET_INTERFACE}"
    key: "disc.network.in.bytes[{#OSO_NET_INTERFACE}]"
    value_type: int
    units: Bps
    delta: 1
    description: "PCP network.interface.in.bytes metric.  This is setup as a delta in Zabbix to measure the speed per second"
    applications:
    - Network

  - discoveryrule_key: disc.network
    name: "Bytes per second OUT on network interface {#OSO_NET_INTERFACE}"
    key: "disc.network.out.bytes[{#OSO_NET_INTERFACE}]"
    value_type: int
    units: Bps
    delta: 1
    description: "PCP network.interface.out.bytes metric.  This is setup as a delta in Zabbix to measure the speed per second"
    applications:
    - Network

  ztriggerprototypes:
  # Per-filesystem trigger prototypes.  Each metric has two levels: a `high`
  # trigger at the tighter threshold and a `warn` trigger at the looser one.
  # The `warn` trigger lists the `high` one under `dependencies`, referenced
  # by its exact name, so the two names must be kept in sync.

  # Disk space: disc.filesys.full above 90 (high) / above 85 (warn).
  - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
    priority: high
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'

  - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
    priority: warn
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    dependencies:
    - 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'

  # Inodes: disc.filesys.inodes.pused above 95 (high) / above 90 (warn).
  - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
    priority: high
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'

  - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
    priority: warn
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    dependencies:
    - 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'

  ztriggers:
  # Host-level triggers on the plain items of this template.  Expressions
  # refer to items as "{Template OS Linux:<item key>.<function>}".

  # Process count
  - name: 'Too many TOTAL processes on {HOST.NAME}'
    expression: '{Template OS Linux:proc.nprocs.last()}>5000'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
    priority: warn

  # Memory
  # mem.freemem is stored in bytes (the item applies multiplier 1024), so the
  # threshold is expressed in bytes: 30000 x 1024 = 30720000.
  # NOTE(review): the trigger name says "available memory" but the expression
  # tests mem.freemem, not mem.util.available -- confirm this is intended.
  - name: 'Lack of available memory on {HOST.NAME}'
    expression: '{Template OS Linux:mem.freemem.last()}<30720000'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
    priority: warn
    description: 'Alert when free memory (mem.freemem) is below 30,720,000 bytes, i.e. 30000 KB x 1024 (about 29.3 MiB)'

  # CPU utilization
  # Both triggers evaluate max(#5) of kernel.all.cpu.idle; per Zabbix trigger
  # function syntax that is the maximum of the last 5 values, so idle has to
  # stay under the threshold for 5 consecutive values.
  # NOTE(review): both levels use priority `average`, unlike the high/warn
  # split used by the filesystem trigger prototypes -- confirm this is
  # intended.
  - name: 'CPU idle less than 5% on {HOST.NAME}'
    expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
    priority: average
    description: 'CPU is less than 5% idle'

  # Depends on the 5% trigger above, referenced by its exact name.
  - name: 'CPU idle less than 10% on {HOST.NAME}'
    expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
    priority: average
    description: 'CPU is less than 10% idle'
    dependencies:
    - 'CPU idle less than 5% on {HOST.NAME}'