diff options
Diffstat (limited to 'roles/os_zabbix/vars/template_openshift_master.yml')
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 374 |
1 files changed, 367 insertions, 7 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index c71e07910..e36f23a2b 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -2,10 +2,10 @@ g_template_openshift_master: name: Template Openshift Master zitems: - - name: create_app + - name: openshift.master.app.create applications: - Openshift Master - key: create_app + key: openshift.master.app.create - key: openshift.master.process.count description: Shows number of master processes running @@ -13,12 +13,255 @@ g_template_openshift_master: applications: - Openshift Master - ztriggers: - - name: 'Application creation has failed on {HOST.NAME}' - expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' - priority: avg + - key: openshift.master.api.ping + description: "Verify that the Openshift API is up (uses the cluster API URL)" + type: int + applications: + - Openshift Master + + - key: openshift.master.local.api.ping + description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)" + type: int + applications: + - Openshift Master + + - key: openshift.master.api.healthz + description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz" + type: int + data_type: bool + applications: + - Openshift Master + + - key: openshift.master.local.api.healthz + description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz" + type: int + data_type: bool + applications: + - Openshift Master + + - key: openshift.master.user.count + description: Shows number of users in a cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pod.running.count + description: Shows number of pods running + type: int + applications: + - Openshift Master + + - key: openshift.master.pod.user.running.count + description: Shows number of user pods running (non infrastructure pods) + type: int + applications: + - Openshift Master + + - key: openshift.master.pod.total.count + description: Shows total number of pods (running and non running) + type: int + applications: + - Openshift Master + + - key: openshift.master.node.count + description: Shows the total number of nodes found in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.project.count + description: Shows number of projects on a cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.total.count + description: Total number of Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.available.count + description: Total number of Available Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.released.count + description: Total number of Released Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.bound.count + description: Total number of Bound Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.failed.count + description: Total number of Failed Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.skydns.port.open + description: State of the SkyDNS port open and listening + type: int + applications: + - Openshift Master + + - key: openshift.master.skydns.query + description: SkyDNS can be queried or not + type: int + applications: + - Openshift Master + + - key: openshift.master.etcd.create.success + description: Show number of successful create actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.create.fail + description: Show number of failed create actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.delete.success + description: Show number of successful delete actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.delete.fail + description: Show number of failed delete actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.get.success + description: Show number of successful get actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.get.fail + description: Show number of failed get actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.set.success + description: Show number of successful set actions + type: int + applications: + - Openshift Etcd + - key: openshift.master.etcd.set.fail + description: Show number of failed set actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.update.success + description: Show number of successful update actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.update.fail + description: Show number of failed update actions + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.watchers + description: Show number of etcd watchers + type: int + applications: + - Openshift Etcd + + - key: openshift.master.etcd.ping + description: etcd ping + type: int + applications: + - Openshift Etcd + + - key: openshift.master.metric.ping + description: "This check verifies that the https://master/metrics check is alive and communicating properly." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.nodesnotready.count + description: "This check shows how many nodes in a cluster are in NotReady state." + type: int + applications: + - Openshift Master + + - key: openshift.master.nodesnotschedulable.count + description: "This check shows how many nodes in a cluster are not schedulable." + type: int + applications: + - Openshift Master + + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + ztriggers: - name: 'Openshift Master process not running on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' @@ -28,3 +271,120 @@ g_template_openshift_master: expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' priority: high + + - name: 'Etcd ping failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' + priority: high + + - name: 'Number of users for Openshift Master on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.user.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info + + - name: 'There are no projects running on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.project.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info + + # Put triggers that depend on other triggers here (deps must be created first) + - name: 'Application creation has failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: avg + + - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + description: The application create loop has failed 4 or more times in the last hour + priority: avg + + - name: 'Openshift Master API health check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: high + + - name: 'Openshift Master Local API health check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + - name: 'Openshift Master API PING check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: high + + - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + - name: 'Openshift Master metric PING check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: avg + + - name: 'SkyDNS port not listening on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + - name: 'SkyDNS query failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master API health check is failing on {HOST.NAME}' + priority: high + + - name: 'Hosts not ready according to {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + zgraphs: + - name: Openshift Master API Server Latency Pods LIST Quantiles + width: 900 + height: 200 + graph_items: + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5 + color: red + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9 + color: blue + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99 + color: orange + + - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles + width: 900 + height: 200 + graph_items: + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 + color: red + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 + color: blue + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 + color: orange + + - name: Openshift Master Scheduler End to End Latency Quantiles + width: 900 + height: 200 + graph_items: + - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 + color: red + - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 + color: blue + - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 + color: orange |