summary refs log tree commit diff stats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r-- roles/os_zabbix/tasks/main.yml 27
-rw-r--r-- roles/os_zabbix/vars/template_config_loop.yml 14
-rw-r--r-- roles/os_zabbix/vars/template_docker.yml 22
-rw-r--r-- roles/os_zabbix/vars/template_openshift_master.yml 61
-rw-r--r-- roles/os_zabbix/vars/template_openshift_node.yml 6
-rw-r--r-- roles/os_zabbix/vars/template_zagg_server.yml 46
6 files changed, 174 insertions, 2 deletions
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index d0b307a3d..1c8d88854 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -37,6 +37,13 @@
- include_vars: template_aws.yml
tags:
- aws
+- include_vars: template_zagg_server.yml
+ tags:
+ - zagg_server
+
+- include_vars: template_config_loop.yml
+ tags:
+ - config_loop
- name: Include Template Heartbeat
include: ../../lib_zabbix/tasks/create_template.yml
@@ -137,3 +144,23 @@
password: "{{ ozb_password }}"
tags:
- aws
+
+- name: Include Template Zagg Server
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_zagg_server }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - zagg_server
+
+- name: Include Template Config Loop
+ include: ../../lib_zabbix/tasks/create_template.yml
+ vars:
+ template: "{{ g_template_config_loop }}"
+ server: "{{ ozb_server }}"
+ user: "{{ ozb_user }}"
+ password: "{{ ozb_password }}"
+ tags:
+ - config_loop
diff --git a/roles/os_zabbix/vars/template_config_loop.yml b/roles/os_zabbix/vars/template_config_loop.yml
new file mode 100644
index 000000000..823da1868
--- /dev/null
+++ b/roles/os_zabbix/vars/template_config_loop.yml
@@ -0,0 +1,14 @@
+---
+g_template_config_loop:
+ name: Template Config Loop
+ zitems:
+ - key: config_loop.run.exit_code
+ applications:
+ - Config Loop
+ value_type: int
+
+ ztriggers:
+ - name: 'config_loop.run.exit_code not zero on {HOST.NAME}'
+ expression: '{Template Config Loop:config_loop.run.exit_code.min(#2)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_config_loop.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
index bfabf50c5..dd13e76f7 100644
--- a/roles/os_zabbix/vars/template_docker.yml
+++ b/roles/os_zabbix/vars/template_docker.yml
@@ -12,6 +12,16 @@ g_template_docker:
- Docker Daemon
value_type: int
+ - key: docker.container.dns.resolution
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.container.existing.dns.resolution.failed
+ applications:
+ - Docker Daemon
+ value_type: int
+
- key: docker.storage.is_loopback
applications:
- Docker Storage
@@ -62,6 +72,18 @@ g_template_docker:
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
priority: high
+ # Re-enable for OpenShift 3.1.1 (https://bugzilla.redhat.com/show_bug.cgi?id=1292971#c6)
+ - name: 'docker.container.dns.resolution failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.container.dns.resolution.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc'
+ priority: average
+ status: disabled
+
+ - name: 'docker.container.existing.dns.resolution.failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.container.existing.dns.resolution.failed.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc'
+ priority: average
+
- name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 514d6fd24..9d20eb012 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -20,13 +20,26 @@ g_template_openshift_master:
- Openshift Master
- key: openshift.master.api.ping
- description: "Verify that the Openshift API is up"
+ description: "Verify that the Openshift API is up (uses the cluster API URL)"
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.local.api.ping
+ description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)"
type: int
applications:
- Openshift Master
- key: openshift.master.api.healthz
- description: "Checks the healthz check of the master's api: https://master_host/healthz"
+ description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz"
+ type: int
+ data_type: bool
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.local.api.healthz
+ description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz"
type: int
data_type: bool
applications:
@@ -98,6 +111,18 @@ g_template_openshift_master:
applications:
- Openshift Master
+ - key: openshift.master.skydns.port.open
+ description: State of the SkyDNS port open and listening
+ type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.skydns.query
+ description: SkyDNS can be queried or not
+ type: int
+ applications:
+ - Openshift Master
+
- key: openshift.master.etcd.create.success
description: Show number of successful create actions
type: int
@@ -269,9 +294,22 @@ g_template_openshift_master:
- 'Openshift Master process not running on {HOST.NAME}'
priority: avg
+ - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
+ expression: '{Template Openshift Master:create_app.sum(1h)}>3'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ description: The application create loop has failed 4 or more times in the last hour
+ priority: avg
+
- name: 'Openshift Master API health check is failing on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Openshift Master Local API health check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
dependencies:
- 'Openshift Master process not running on {HOST.NAME}'
priority: high
@@ -279,6 +317,11 @@ g_template_openshift_master:
- name: 'Openshift Master API PING check is failing on {HOST.NAME}'
expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
dependencies:
- 'Openshift Master process not running on {HOST.NAME}'
priority: high
@@ -297,6 +340,20 @@ g_template_openshift_master:
- 'Openshift Master process not running on {HOST.NAME}'
priority: high
+ - name: 'SkyDNS port not listening on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master process not running on {HOST.NAME}'
+ priority: high
+
+ - name: 'SkyDNS query failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ dependencies:
+ - 'Openshift Master API health check is failing on {HOST.NAME}'
+ priority: high
+
zgraphs:
- name: Openshift Master API Server Latency Pods LIST Quantiles
width: 900
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
index ce28b1048..ff65ef158 100644
--- a/roles/os_zabbix/vars/template_openshift_node.yml
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -20,6 +20,12 @@ g_template_openshift_node:
applications:
- Openshift Node
+ - key: openshift.node.ovs.stray.rules
+ description: Number of OVS stray rules found/removed
+ type: int
+ applications:
+ - Openshift Node
+
ztriggers:
- name: 'Openshift Node process not running on {HOST.NAME}'
expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
diff --git a/roles/os_zabbix/vars/template_zagg_server.yml b/roles/os_zabbix/vars/template_zagg_server.yml
new file mode 100644
index 000000000..db5665993
--- /dev/null
+++ b/roles/os_zabbix/vars/template_zagg_server.yml
@@ -0,0 +1,46 @@
+---
+g_template_zagg_server:
+ name: Template Zagg Server
+ zitems:
+ - key: zagg.server.metrics.count
+ applications:
+ - Zagg Server
+ value_type: int
+
+ - key: zagg.server.metrics.errors
+ applications:
+ - Zagg Server
+ value_type: int
+
+ - key: zagg.server.heartbeat.errors
+ applications:
+ - Zagg Server
+ value_type: int
+
+ - key: zagg.server.heartbeat.count
+ applications:
+ - Zagg Server
+ value_type: int
+
+ ztriggers:
+ - name: 'Error processing metrics on {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.metrics.errors.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ priority: average
+
+ - name: 'Error processing heartbeats on {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.heartbeat.errors.min(#3)}>0'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ priority: average
+
+ - name: 'Critically High number of metrics in Zagg queue {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>10000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ priority: high
+
+ - name: 'High number of metrics in Zagg queue {HOST.NAME}'
+ expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>5000'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc'
+ dependencies:
+ - 'Critically High number of metrics in Zagg queue {HOST.NAME}'
+ priority: average