From 3de29f6d5a3017b57c553c5e2fb63a50994df840 Mon Sep 17 00:00:00 2001 From: Mangirdas Date: Sat, 27 Jan 2018 08:05:31 +0000 Subject: Rebase Prometheus example for new scrape endpoints and expose alert manager --- .../templates/prometheus.yml.j2 | 175 ++++++++++++++------- 1 file changed, 121 insertions(+), 54 deletions(-) (limited to 'roles/openshift_prometheus/templates/prometheus.yml.j2') diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2 index 63430f834..005c2c564 100644 --- a/roles/openshift_prometheus/templates/prometheus.yml.j2 +++ b/roles/openshift_prometheus/templates/prometheus.yml.j2 @@ -1,10 +1,5 @@ rule_files: - - 'prometheus.rules' -{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %} - - 'prometheus.additional.rules' -{% endif %} - - + - '*.rules' # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) @@ -39,31 +34,11 @@ scrape_configs: action: keep regex: default;kubernetes;https -# Scrape config for nodes. -# -# Each node exposes a /metrics endpoint that contains operational metrics for -# the Kubelet and other components. -- job_name: 'kubernetes-nodes' - - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - # Scrape config for controllers. # # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for # the controllers. # -# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via -# endpoints. - job_name: 'kubernetes-controllers' scheme: https @@ -87,6 +62,27 @@ scrape_configs: regex: (.+)(?::\d+) replacement: $1:8444 +# Scrape config for nodes. +# +# Each node exposes a /metrics endpoint that contains operational metrics for +# the Kubelet and other components. +- job_name: 'kubernetes-nodes' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + # Drop a very high cardinality metric that is incorrect in 3.7. It will be + # fixed in 3.9. + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)' + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + # Scrape config for cAdvisor. # # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that @@ -107,6 +103,14 @@ scrape_configs: kubernetes_sd_configs: - role: node + # Exclude a set of high cardinality metrics that can contribute to significant + # memory use in large clusters. These can be selectively enabled as necessary + # for medium or small clusters. + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))' + relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) @@ -133,38 +137,101 @@ scrape_configs: - role: endpoints relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + # only scrape infrastructure components + - source_labels: [__meta_kubernetes_namespace] + action: keep + regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+' + # drop infrastructure components managed by other scrape targets + - source_labels: [__meta_kubernetes_service_name] + action: drop + regex: 'prometheus-node-exporter' + # only those that have requested scraping + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +# Scrape config for node-exporter, which is expected to be running on port 9100. +- job_name: 'kubernetes-nodes-exporter' + + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + + kubernetes_sd_configs: + - role: node + + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: 'node_cpu|node_(disk|scrape_collector)_.+' + # preserve a subset of the network, netstat, vmstat, and filesystem series + - source_labels: [__name__] action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))' + target_label: __name__ + replacement: renamed_$1 + - source_labels: [__name__] + action: drop + regex: 'node_(netstat|vmstat|filesystem|network)_.+' + - source_labels: [__name__] action: replace + regex: 'renamed_(.+)' + target_label: __name__ + replacement: $1 + # drop any partial expensive series + - source_labels: [__name__, device] + action: drop + regex: 'node_network_.+;veth.+' + - source_labels: [__name__, mountpoint] + action: drop + regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)' + + relabel_configs: + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:9100' target_label: __address__ - regex: (.+)(?::\d+);(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] - action: replace - target_label: __basic_auth_username__ - regex: (.+) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] - action: replace - target_label: __basic_auth_password__ - regex: (.+) + - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname] + target_label: __instance__ - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for the template service broker +- job_name: 'openshift-template-service-broker' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt + server_name: apiserver.openshift-template-service-broker.svc + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: openshift-template-service-broker;apiserver;https + alerting: alertmanagers: -- cgit v1.2.3