---
# Monitoring template definition for OpenShift master hosts.
#
# Layout of the single top-level mapping:
#   name             - template name; trigger expressions below refer to items
#                      as '{<template name>:<item key>.<function>(<arg>)}'
#   zitems           - items (metrics), identified by `key`
#   zdiscoveryrules  - discovery rules
#   zitemprototypes  - item prototypes bound to a discovery rule through
#                      `discoveryrule_key`
#   ztriggers        - alerting rules evaluated over the items
#   zgraphs          - graphs built from the items
#
# NOTE(review): the key names and expression syntax look like Zabbix template
# data; confirm against the tooling that loads this file.
g_template_openshift_master:
  name: Template Openshift Master

  zitems:
    # --- Application create checks -----------------------------------------
    # The triggers below treat a value of 1 on the *.create items as a failure.
    - name: openshift.master.app.create
      applications:
        - Openshift Master
      key: openshift.master.app.create
    - key: openshift.master.app.build.create
      description: "check the app create with a build process"
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.app.create.time
      description: "check the time it takes app create with a build process"
      value_type: float
      applications:
        - Openshift Master
    - key: openshift.master.app.build.time
      description: "check the time it takes app build"
      value_type: float
      applications:
        - Openshift Master

    # --- Master process and API availability -------------------------------
    - key: openshift.master.process.count
      description: Shows number of master processes running
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.api.ping
      description: "Verify that the Openshift API is up (uses the cluster API URL)"
      value_type: int
      applications:
        - Openshift Master
    # NOTE(review): this description ends mid-sentence ("...as the"); the tail
    # looks lost. Left unchanged - confirm the intended wording.
    - key: openshift.master.local.api.ping
      description: "Verify that the Openshift API is up on the host (uses the API URL as the"
      value_type: int
      applications:
        - Openshift Master
    # NOTE(review): the URL below has an empty host ("https:///healthz"); a
    # placeholder was probably stripped. Left unchanged - confirm.
    - key: openshift.master.api.healthz
      description: "Checks the healthz check of the master's api: https:///healthz"
      value_type: int
      data_type: bool
      applications:
        - Openshift Master
    # NOTE(review): this description ends after "api:" with no URL. Left
    # unchanged - confirm the intended wording.
    - key: openshift.master.local.api.healthz
      description: "Checks the healthz check of the master's api:"
      value_type: int
      data_type: bool
      applications:
        - Openshift Master

    # --- Cluster object counts ---------------------------------------------
    - key: openshift.master.user.count
      description: Shows number of users in a cluster
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pod.running.count
      description: Shows number of pods running
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pod.user.running.count
      description: Shows number of user pods running (non infrastructure pods)
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pod.total.count
      description: Shows total number of pods (running and non running)
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.node.count
      description: Shows the total number of nodes found in the Openshift Cluster
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.project.count
      description: Shows number of projects on a cluster
      value_type: int
      applications:
        - Openshift Master

    # --- Persistent volumes -------------------------------------------------
    - key: openshift.master.pv.space.total
      description: Shows the total space of pv
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pv.space.available
      description: Shows the available space of pv
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pv.total.count
      description: Total number of Persistent Volumes in the Openshift Cluster
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pv.available.count
      description: Total number of Available Persistent Volumes in the Openshift Cluster
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pv.released.count
      description: Total number of Released Persistent Volumes in the Openshift Cluster
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pv.bound.count
      description: Total number of Bound Persistent Volumes in the Openshift Cluster
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.pv.failed.count
      description: Total number of Failed Persistent Volumes in the Openshift Cluster
      value_type: int
      applications:
        - Openshift Master

    # --- SkyDNS -------------------------------------------------------------
    - key: openshift.master.skydns.port.open
      description: State of the SkyDNS port open and listening
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.skydns.query
      description: SkyDNS can be queried or not
      value_type: int
      applications:
        - Openshift Master

    # --- etcd (grouped under the "Openshift Etcd" application) --------------
    - key: openshift.master.etcd.create.success
      description: Show number of successful create actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.create.fail
      description: Show number of failed create actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.delete.success
      description: Show number of successful delete actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.delete.fail
      description: Show number of failed delete actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.get.success
      description: Show number of successful get actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.get.fail
      description: Show number of failed get actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.set.success
      description: Show number of successful set actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.set.fail
      description: Show number of failed set actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.update.success
      description: Show number of successful update actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.update.fail
      description: Show number of failed update actions
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.watchers
      description: Show number of etcd watchers
      value_type: int
      applications:
        - Openshift Etcd
    - key: openshift.master.etcd.ping
      description: etcd ping
      value_type: int
      applications:
        - Openshift Etcd

    # --- Metrics endpoint and node state ------------------------------------
    - key: openshift.master.metric.ping
      description: "This check verifies that the https://master/metrics check is alive and communicating properly."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.nodesnotready.count
      description: "This check shows how many nodes in a cluster are in NotReady state."
      value_type: int
      applications:
        - Openshift Master
    - key: openshift.master.nodesnotschedulable.count
      description: "This check shows how many nodes in a cluster are not schedulable."
      value_type: int
      applications:
        - Openshift Master

    # --- API server pod latency quantiles: LIST ------------------------------
    # Key suffixes .5 / .9 / .99 correspond to the 50% / 90% / 99% figures
    # quoted in the descriptions.
    - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics

    # --- API server pod latency quantiles: WATCHLIST -------------------------
    # NOTE(review): these descriptions are word-for-word the same as the LIST
    # ones above; confirm whether they should mention WATCHLIST operations.
    - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics

    # --- Scheduler end-to-end latency quantiles ------------------------------
    - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics
    - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
      description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed."
      value_type: int
      applications:
        - Openshift Master Metrics

  # Discovery rule used by the item prototypes below (matched on `key`).
  zdiscoveryrules:
    - name: disc.pv
      key: disc.pv
      lifetime: 1
      description: "Dynamically register the Persistent Volumes"

  # One item per discovered {#OSO_PV} value; both prototypes hang off disc.pv.
  zitemprototypes:
    - discoveryrule_key: disc.pv
      name: "disc.pv.count.{#OSO_PV}"
      key: "disc.pv.count[{#OSO_PV}]"
      value_type: int
      description: "Number of PV's of this size"
      applications:
        - Openshift Master
    - discoveryrule_key: disc.pv
      name: "disc.pv.count.available.{#OSO_PV}"
      key: "disc.pv.count.available[{#OSO_PV}]"
      value_type: int
      description: "Number of PV's of this size that are available"
      applications:
        - Openshift Master

  # Triggers. `dependencies` entries refer to other triggers by their exact
  # `name`, so trigger names must not be edited without updating dependents.
  ztriggers:
    - name: 'Openshift Master process not running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high
    - name: 'Too many Openshift Master processes running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high
    - name: 'Etcd ping failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
      priority: high
    - name: 'Number of users for Openshift Master on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: info
    - name: 'There are no projects running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.project.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: info

    # Put triggers that depend on other triggers here (deps must be created first)
    - name: 'Application creation has failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: avg
    - name: 'Application creation with build has failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.app.build.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.build.create.last(#2)}=1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: avg
    - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      description: The application create loop has failed 4 or more times in the last hour
      priority: avg
    # Description matches the expression: build-create item, 2h window, sum > 3.
    - name: 'Application with build creation has failed multiple times in the last 2 hour on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.app.build.create.sum(2h)}>3'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      description: The application create with build loop has failed 4 or more times in the last 2 hours
      priority: avg
    # No dependencies of its own; 'SkyDNS query failed' below depends on it.
    - name: 'Openshift Master API health check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high
    - name: 'Openshift Master Local API health check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high
    - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high
    - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high
    - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: avg
    - name: 'SkyDNS port not listening on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high
    - name: 'SkyDNS query failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master API health check is failing on {HOST.NAME}'
      priority: high
    # NOTE(review): last(#2) reads only the second most recent value; confirm
    # that ignoring the latest sample is intended.
    - name: 'Hosts not ready according to {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high

  # Graphs: each plots the .5 / .9 / .99 quantile items of one latency family.
  zgraphs:
    - name: Openshift Master API Server Latency Pods LIST Quantiles
      width: 900
      height: 200
      graph_items:
        - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5
          color: red
        - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9
          color: blue
        - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99
          color: orange
    - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles
      width: 900
      height: 200
      graph_items:
        - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
          color: red
        - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
          color: blue
        - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
          color: orange
    - name: Openshift Master Scheduler End to End Latency Quantiles
      width: 900
      height: 200
      graph_items:
        - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
          color: red
        - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
          color: blue
        - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
          color: orange