diff options
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r-- | roles/os_zabbix/vars/template_config_loop.yml | 14 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_docker.yml | 22 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 61 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_node.yml | 6 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_zagg_server.yml | 46 |
5 files changed, 147 insertions, 2 deletions
diff --git a/roles/os_zabbix/vars/template_config_loop.yml b/roles/os_zabbix/vars/template_config_loop.yml new file mode 100644 index 000000000..823da1868 --- /dev/null +++ b/roles/os_zabbix/vars/template_config_loop.yml @@ -0,0 +1,14 @@ +--- +g_template_config_loop: + name: Template Config Loop + zitems: + - key: config_loop.run.exit_code + applications: + - Config Loop + value_type: int + + ztriggers: + - name: 'config_loop.run.exit_code not zero on {HOST.NAME}' + expression: '{Template Config Loop:config_loop.run.exit_code.min(#2)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_config_loop.asciidoc' + priority: average diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml index bfabf50c5..dd13e76f7 100644 --- a/roles/os_zabbix/vars/template_docker.yml +++ b/roles/os_zabbix/vars/template_docker.yml @@ -12,6 +12,16 @@ g_template_docker: - Docker Daemon value_type: int + - key: docker.container.dns.resolution + applications: + - Docker Daemon + value_type: int + + - key: docker.container.existing.dns.resolution.failed + applications: + - Docker Daemon + value_type: int + - key: docker.storage.is_loopback applications: - Docker Storage @@ -62,6 +72,18 @@ g_template_docker: url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc' priority: high + # Re-enable for OpenShift 3.1.1 (https://bugzilla.redhat.com/show_bug.cgi?id=1292971#c6) + - name: 'docker.container.dns.resolution failed on {HOST.NAME}' + expression: '{Template Docker:docker.container.dns.resolution.min(#3)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc' + priority: average + status: disabled + + - name: 'docker.container.existing.dns.resolution.failed on {HOST.NAME}' + expression: '{Template Docker:docker.container.existing.dns.resolution.failed.min(#3)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc' + priority: average + - name: 'Docker storage is using LOOPBACK on {HOST.NAME}' expression: '{Template Docker:docker.storage.is_loopback.last()}<>0' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc' diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index 514d6fd24..9d20eb012 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -20,13 +20,26 @@ g_template_openshift_master: - Openshift Master - key: openshift.master.api.ping - description: "Verify that the Openshift API is up" + description: "Verify that the Openshift API is up (uses the cluster API URL)" + type: int + applications: + - Openshift Master + + - key: openshift.master.local.api.ping + description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)" type: int applications: - Openshift Master - key: openshift.master.api.healthz - description: "Checks the healthz check of the master's api: https://master_host/healthz" + description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz" + type: int + data_type: bool + applications: + - Openshift Master + + - key: openshift.master.local.api.healthz + description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz" type: int data_type: bool applications: @@ -98,6 +111,18 @@ g_template_openshift_master: applications: - Openshift Master + - key: openshift.master.skydns.port.open + description: State of the SkyDNS port open and listening + type: int + applications: + - Openshift Master + + - key: openshift.master.skydns.query + description: SkyDNS can be queried or not + type: int + applications: + - Openshift Master + - key: openshift.master.etcd.create.success description: Show number of successful create actions type: int @@ -269,9 +294,22 @@ g_template_openshift_master: - 'Openshift Master process not running on {HOST.NAME}' priority: avg + - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}' + expression: '{Template Openshift Master:create_app.sum(1h)}>3' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + description: The application create loop has failed 4 or more times in the last hour + priority: avg + - name: 'Openshift Master API health check is failing on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: high + + - name: 'Openshift Master Local API health check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' dependencies: - 'Openshift Master process not running on {HOST.NAME}' priority: high @@ -279,6 +317,11 @@ g_template_openshift_master: - name: 'Openshift Master API PING check is failing on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: high + + - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' dependencies: - 'Openshift Master process not running on {HOST.NAME}' priority: high @@ -297,6 +340,20 @@ g_template_openshift_master: - 'Openshift Master process not running on {HOST.NAME}' priority: high + - name: 'SkyDNS port not listening on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + - name: 'SkyDNS query failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master API health check is failing on {HOST.NAME}' + priority: high + zgraphs: - name: Openshift Master API Server Latency Pods LIST Quantiles width: 900 diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml index ce28b1048..ff65ef158 100644 --- a/roles/os_zabbix/vars/template_openshift_node.yml +++ b/roles/os_zabbix/vars/template_openshift_node.yml @@ -20,6 +20,12 @@ g_template_openshift_node: applications: - Openshift Node + - key: openshift.node.ovs.stray.rules + description: Number of OVS stray rules found/removed + type: int + applications: + - Openshift Node + ztriggers: - name: 'Openshift Node process not running on {HOST.NAME}' expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1' diff --git a/roles/os_zabbix/vars/template_zagg_server.yml b/roles/os_zabbix/vars/template_zagg_server.yml new file mode 100644 index 000000000..db5665993 --- /dev/null +++ b/roles/os_zabbix/vars/template_zagg_server.yml @@ -0,0 +1,46 @@ +--- +g_template_zagg_server: + name: Template Zagg Server + zitems: + - key: zagg.server.metrics.count + applications: + - Zagg Server + value_type: int + + - key: zagg.server.metrics.errors + applications: + - Zagg Server + value_type: int + + - key: zagg.server.heartbeat.errors + applications: + - Zagg Server + value_type: int + + - key: zagg.server.heartbeat.count + applications: + - Zagg Server + value_type: int + + ztriggers: + - name: 'Error processing metrics on {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.metrics.errors.min(#3)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + priority: average + + - name: 'Error processing heartbeats on {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.heartbeat.errors.min(#3)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + priority: average + + - name: 'Critically High number of metrics in Zagg queue {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>10000' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + priority: high + + - name: 'High number of metrics in Zagg queue {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>5000' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + dependencies: + - 'Critically High number of metrics in Zagg queue {HOST.NAME}' + priority: average |