summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--playbooks/gcp/openshift-cluster/build_base_image.yml3
-rw-r--r--playbooks/openshift-prometheus/private/uninstall.yml8
-rw-r--r--playbooks/openshift-prometheus/uninstall.yml2
-rw-r--r--roles/openshift_hosted/tasks/registry.yml2
-rw-r--r--roles/openshift_prometheus/defaults/main.yaml15
-rw-r--r--roles/openshift_prometheus/tasks/facts.yaml10
-rw-r--r--roles/openshift_prometheus/tasks/install_prometheus.yaml119
-rw-r--r--roles/openshift_prometheus/tasks/main.yaml4
-rw-r--r--roles/openshift_prometheus/tasks/uninstall.yaml (renamed from roles/openshift_prometheus/tasks/uninstall_prometheus.yaml)0
-rw-r--r--roles/openshift_prometheus/templates/prometheus.j292
-rw-r--r--roles/openshift_prometheus/templates/prometheus.yml.j2175
11 files changed, 286 insertions, 144 deletions
diff --git a/playbooks/gcp/openshift-cluster/build_base_image.yml b/playbooks/gcp/openshift-cluster/build_base_image.yml
index 75d0ddf9d..8e9b0024a 100644
--- a/playbooks/gcp/openshift-cluster/build_base_image.yml
+++ b/playbooks/gcp/openshift-cluster/build_base_image.yml
@@ -90,6 +90,8 @@
repo_gpgcheck: no
state: present
when: ansible_os_family == "RedHat"
+ - name: Accept GPG keys for the repos
+ command: yum -q makecache -y --disablerepo='*' --enablerepo='google-cloud,jdetiber-qemu-user-static'
- name: Install qemu-user-static
package:
name: qemu-user-static
@@ -121,7 +123,6 @@
with_items:
# required by Ansible
- PyYAML
- - docker
- google-compute-engine
- google-compute-engine-init
- google-config
diff --git a/playbooks/openshift-prometheus/private/uninstall.yml b/playbooks/openshift-prometheus/private/uninstall.yml
new file mode 100644
index 000000000..2df39c2a8
--- /dev/null
+++ b/playbooks/openshift-prometheus/private/uninstall.yml
@@ -0,0 +1,8 @@
+---
+- name: Uninstall Prometheus
+ hosts: masters[0]
+ tasks:
+ - name: Run the Prometheus Uninstall Role Tasks
+ include_role:
+ name: openshift_prometheus
+ tasks_from: uninstall
diff --git a/playbooks/openshift-prometheus/uninstall.yml b/playbooks/openshift-prometheus/uninstall.yml
new file mode 100644
index 000000000..c92ade786
--- /dev/null
+++ b/playbooks/openshift-prometheus/uninstall.yml
@@ -0,0 +1,2 @@
+---
+- import_playbook: private/uninstall.yml
diff --git a/roles/openshift_hosted/tasks/registry.yml b/roles/openshift_hosted/tasks/registry.yml
index bc4d81eb7..22294e3d4 100644
--- a/roles/openshift_hosted/tasks/registry.yml
+++ b/roles/openshift_hosted/tasks/registry.yml
@@ -43,7 +43,7 @@
- name: Update registry environment variables when pushing via dns
set_fact:
- openshift_hosted_registry_env_vars: "{{ openshift_hosted_registry_env_vars | combine({'REGISTRY_OPENSHIFT_SERVER_ADDR':'docker-registry.default.svc:5000'}) }}"
+ openshift_hosted_registry_env_vars: "{{ openshift_hosted_registry_env_vars | combine({'OPENSHIFT_DEFAULT_REGISTRY':'docker-registry.default.svc:5000'}) }}"
when: openshift_push_via_dns | bool
- name: Update registry proxy settings for dc/docker-registry
diff --git a/roles/openshift_prometheus/defaults/main.yaml b/roles/openshift_prometheus/defaults/main.yaml
index 1b21c4739..37a05f3f0 100644
--- a/roles/openshift_prometheus/defaults/main.yaml
+++ b/roles/openshift_prometheus/defaults/main.yaml
@@ -7,9 +7,24 @@ openshift_prometheus_namespace: openshift-metrics
# defaults hosts for routes
openshift_prometheus_hostname: prometheus-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
openshift_prometheus_alerts_hostname: alerts-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
+openshift_prometheus_alertmanager_hostname: alertmanager-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
+
openshift_prometheus_node_selector: {"region":"infra"}
+openshift_prometheus_service_port: 443
+openshift_prometheus_service_targetport: 8443
+openshift_prometheus_service_name: prometheus
+openshift_prometheus_alerts_service_targetport: 9443
+openshift_prometheus_alerts_service_name: alerts
+openshift_prometheus_alertmanager_service_targetport: 10443
+openshift_prometheus_alertmanager_service_name: alertmanager
+openshift_prometheus_serviceaccount_annotations: []
+l_openshift_prometheus_serviceaccount_annotations:
+ - serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+ - serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+ - serviceaccounts.openshift.io/oauth-redirectreference.alertmanager='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alertmanager"}}'
+
# additional prometheus rules file
openshift_prometheus_additional_rules_file: null
diff --git a/roles/openshift_prometheus/tasks/facts.yaml b/roles/openshift_prometheus/tasks/facts.yaml
new file mode 100644
index 000000000..214089732
--- /dev/null
+++ b/roles/openshift_prometheus/tasks/facts.yaml
@@ -0,0 +1,10 @@
+---
+# The kubernetes version impacts the prometheus scraping endpoint
+# so gathering it before constructing the configmap
+- name: get oc version
+ oc_version:
+ register: oc_version
+
+- set_fact:
+ kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
+ openshift_prometheus_serviceaccount_annotations: "{{ l_openshift_prometheus_serviceaccount_annotations + openshift_prometheus_serviceaccount_annotations|list }}"
diff --git a/roles/openshift_prometheus/tasks/install_prometheus.yaml b/roles/openshift_prometheus/tasks/install_prometheus.yaml
index 749df5152..0b565502f 100644
--- a/roles/openshift_prometheus/tasks/install_prometheus.yaml
+++ b/roles/openshift_prometheus/tasks/install_prometheus.yaml
@@ -1,4 +1,6 @@
---
+# set facts
+- include_tasks: facts.yaml
# namespace
- name: Add prometheus project
@@ -9,7 +11,7 @@
description: Prometheus
# secrets
-- name: Set alert and prometheus secrets
+- name: Set alert, alertmanager and prometheus secrets
oc_secret:
state: present
name: "{{ item }}-proxy"
@@ -20,30 +22,24 @@
with_items:
- prometheus
- alerts
+ - alertmanager
# serviceaccount
- name: create prometheus serviceaccount
oc_serviceaccount:
state: present
- name: prometheus
+ name: "{{ openshift_prometheus_service_name }}"
namespace: "{{ openshift_prometheus_namespace }}"
- # TODO add annotations when supproted
- # annotations:
- # serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
- # serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
-
- secrets:
- - prometheus-secrets
changed_when: no
+
# TODO remove this when annotations are supported by oc_serviceaccount
- name: annotate serviceaccount
command: >
{{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
- serviceaccount prometheus
- serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
- serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
-
+ serviceaccount {{ openshift_prometheus_service_name }} {{ item }}
+ with_items:
+ "{{ openshift_prometheus_serviceaccount_annotations }}"
# create clusterrolebinding for prometheus serviceaccount
- name: Set cluster-reader permissions for prometheus
@@ -52,63 +48,61 @@
namespace: "{{ openshift_prometheus_namespace }}"
resource_kind: cluster-role
resource_name: cluster-reader
- user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus"
+ user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:{{ openshift_prometheus_service_name }}"
+
-# create prometheus and alerts services
-# TODO join into 1 task with loop
-- name: Create prometheus service
+- name: create services for prometheus
oc_service:
- state: present
- name: "{{ item.name }}"
+ name: "{{ openshift_prometheus_service_name }}"
namespace: "{{ openshift_prometheus_namespace }}"
- selector:
- app: prometheus
labels:
- name: "{{ item.name }}"
- # TODO add annotations when supported
- # annotations:
- # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+ name: prometheus
+ annotations:
+ oprometheus.io/scrape: 'true'
+ oprometheus.io/scheme: https
+ service.alpha.openshift.io/serving-cert-secret-name: prometheus-tls
ports:
- - port: 443
- targetPort: 8443
- with_items:
- - name: prometheus
+ - name: prometheus
+ port: "{{ openshift_prometheus_service_port }}"
+ targetPort: "{{ openshift_prometheus_service_targetport }}"
+ protocol: TCP
+ selector:
+ app: prometheus
-- name: Create alerts service
+- name: create services for alert buffer
oc_service:
- state: present
- name: "{{ item.name }}"
+ name: "{{ openshift_prometheus_alerts_service_name }}"
namespace: "{{ openshift_prometheus_namespace }}"
+ labels:
+ name: prometheus
+ annotations:
+ service.alpha.openshift.io/serving-cert-secret-name: alerts-tls
+ ports:
+ - name: prometheus
+ port: "{{ openshift_prometheus_service_port }}"
+ targetPort: "{{ openshift_prometheus_alerts_service_targetport }}"
+ protocol: TCP
selector:
app: prometheus
+
+- name: create services for alertmanager
+ oc_service:
+ name: "{{ openshift_prometheus_alertmanager_service_name }}"
+ namespace: "{{ openshift_prometheus_namespace }}"
labels:
- name: "{{ item.name }}"
- # TODO add annotations when supported
- # annotations:
- # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+ name: prometheus
+ annotations:
+ service.alpha.openshift.io/serving-cert-secret-name: alertmanager-tls
ports:
- - port: 443
- targetPort: 9443
- with_items:
- - name: alerts
-
-
-# Annotate services with secret name
-# TODO remove this when annotations are supported by oc_service
-- name: annotate prometheus service
- command: >
- {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
- service prometheus
- prometheus.io/scrape='true'
- prometheus.io/scheme=https
- service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls
-
-- name: annotate alerts service
- command: >
- {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
- service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls'
+ - name: prometheus
+ port: "{{ openshift_prometheus_service_port }}"
+ targetPort: "{{ openshift_prometheus_alertmanager_service_targetport }}"
+ protocol: TCP
+ selector:
+ app: prometheus
# create prometheus and alerts routes
+# TODO: oc_route module should support insecureEdgeTerminationPolicy: Redirect
- name: create prometheus and alerts routes
oc_route:
state: present
@@ -122,6 +116,8 @@
host: "{{ openshift_prometheus_hostname }}"
- name: alerts
host: "{{ openshift_prometheus_alerts_hostname }}"
+ - name: alertmanager
+ host: "{{ openshift_prometheus_alertmanager_hostname }}"
# Storage
- name: create prometheus pvc
@@ -169,15 +165,6 @@
path: "{{ tempdir }}/prometheus.additional.rules"
register: additional_rules_stat
-# The kubernetes version impacts the prometheus scraping endpoint
-# so gathering it before constructing the configmap
-- name: get oc version
- oc_version:
- register: oc_version
-
-- set_fact:
- kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
-
- template:
src: prometheus.yml.j2
dest: "{{ tempdir }}/prometheus.yml"
@@ -219,7 +206,7 @@
- name: Set alertmanager configmap
oc_configmap:
state: present
- name: "prometheus-alerts"
+ name: "alertmanager"
namespace: "{{ openshift_prometheus_namespace }}"
from_file:
alertmanager.yml: "{{ tempdir }}/alertmanager.yml"
diff --git a/roles/openshift_prometheus/tasks/main.yaml b/roles/openshift_prometheus/tasks/main.yaml
index b859eb111..66d65a3f2 100644
--- a/roles/openshift_prometheus/tasks/main.yaml
+++ b/roles/openshift_prometheus/tasks/main.yaml
@@ -16,9 +16,11 @@
- name: Create templates subdirectory
file:
state: directory
- path: "{{ tempdir }}/templates"
+ path: "{{ tempdir }}/{{ item }}"
mode: 0755
changed_when: False
+ with_items:
+ - templates
- include_tasks: install_prometheus.yaml
when: openshift_prometheus_state == 'present'
diff --git a/roles/openshift_prometheus/tasks/uninstall_prometheus.yaml b/roles/openshift_prometheus/tasks/uninstall.yaml
index d746402db..d746402db 100644
--- a/roles/openshift_prometheus/tasks/uninstall_prometheus.yaml
+++ b/roles/openshift_prometheus/tasks/uninstall.yaml
diff --git a/roles/openshift_prometheus/templates/prometheus.j2 b/roles/openshift_prometheus/templates/prometheus.j2
index d780550b8..c0abd483b 100644
--- a/roles/openshift_prometheus/templates/prometheus.j2
+++ b/roles/openshift_prometheus/templates/prometheus.j2
@@ -19,7 +19,7 @@ spec:
labels:
app: prometheus
spec:
- serviceAccountName: prometheus
+ serviceAccountName: "{{ openshift_prometheus_service_name }}"
{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
nodeSelector:
{% for key, value in openshift_prometheus_node_selector.items() %}
@@ -47,15 +47,15 @@ spec:
cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
{% endif %}
ports:
- - containerPort: 8443
+ - containerPort: {{ openshift_prometheus_service_targetport }}
name: web
args:
- -provider=openshift
- - -https-address=:8443
+ - -https-address=:{{ openshift_prometheus_service_targetport }}
- -http-address=
- -email-domain=*
- -upstream=http://localhost:9090
- - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
- '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
- '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
- -tls-cert=/etc/tls/private/tls.crt
@@ -67,9 +67,9 @@ spec:
- -skip-auth-regex=^/metrics
volumeMounts:
- mountPath: /etc/tls/private
- name: prometheus-tls
+ name: prometheus-tls-secret
- mountPath: /etc/proxy/secrets
- name: prometheus-secrets
+ name: prometheus-proxy-secret
- mountPath: /prometheus
name: prometheus-data
@@ -104,7 +104,7 @@ spec:
- mountPath: /prometheus
name: prometheus-data
- # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+ # Deploy alert-buffer behind oauth alerts-proxy
- name: alerts-proxy
image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
imagePullPolicy: IfNotPresent
@@ -124,15 +124,15 @@ spec:
cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
{% endif %}
ports:
- - containerPort: 9443
+ - containerPort: {{ openshift_prometheus_alerts_service_targetport }}
name: web
args:
- -provider=openshift
- - -https-address=:9443
+ - -https-address=:{{ openshift_prometheus_alerts_service_targetport }}
- -http-address=
- -email-domain=*
- -upstream=http://localhost:9099
- - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
- '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
- '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
- -tls-cert=/etc/tls/private/tls.crt
@@ -143,9 +143,9 @@ spec:
- -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
volumeMounts:
- mountPath: /etc/tls/private
- name: alerts-tls
+ name: alerts-tls-secret
- mountPath: /etc/proxy/secrets
- name: alerts-secrets
+ name: alerts-proxy-secret
- name: alert-buffer
args:
@@ -169,11 +169,54 @@ spec:
{% endif %}
volumeMounts:
- mountPath: /alert-buffer
- name: alert-buffer-data
+ name: alerts-data
ports:
- containerPort: 9099
name: alert-buf
+ # Deploy alertmanager behind oauth alertmanager-proxy
+ - name: alertmanager-proxy
+ image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
+ imagePullPolicy: IfNotPresent
+ requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+ memory: "{{ openshift_prometheus_oauth_proxy_memory_requests }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+ cpu: "{{ openshift_prometheus_oauth_proxy_cpu_requests }}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+ memory: "{{ openshift_prometheus_oauth_proxy_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+ cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
+{% endif %}
+ ports:
+ - containerPort: {{ openshift_prometheus_alertmanager_service_targetport }}
+ name: web
+ args:
+ - -provider=openshift
+ - -https-address=:{{ openshift_prometheus_alertmanager_service_targetport }}
+ - -http-address=
+ - -email-domain=*
+ - -upstream=http://localhost:9093
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
+ - -openshift-ca=/etc/pki/tls/cert.pem
+ - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+ - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+ - -tls-cert=/etc/tls/private/tls.crt
+ - -tls-key=/etc/tls/private/tls.key
+ - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+ - -cookie-secret-file=/etc/proxy/secrets/session_secret
+ - -skip-auth-regex=^/metrics
+ volumeMounts:
+ - mountPath: /etc/tls/private
+ name: alertmanager-tls-secret
+ - mountPath: /etc/proxy/secrets
+ name: alertmanager-proxy-secret
+
- name: alertmanager
args:
- -config.file=/etc/alertmanager/alertmanager.yml
@@ -205,14 +248,15 @@ spec:
restartPolicy: Always
volumes:
+
- name: prometheus-config
configMap:
defaultMode: 420
name: prometheus
- - name: prometheus-secrets
+ - name: prometheus-proxy-secret
secret:
secretName: prometheus-proxy
- - name: prometheus-tls
+ - name: prometheus-tls-secret
secret:
secretName: prometheus-tls
- name: prometheus-data
@@ -225,13 +269,19 @@ spec:
- name: alertmanager-config
configMap:
defaultMode: 420
- name: prometheus-alerts
- - name: alerts-secrets
+ name: alertmanager
+ - name: alertmanager-proxy-secret
secret:
- secretName: alerts-proxy
- - name: alerts-tls
+ secretName: alertmanager-proxy
+ - name: alertmanager-tls-secret
+ secret:
+ secretName: alertmanager-tls
+ - name: alerts-tls-secret
secret:
- secretName: prometheus-alerts-tls
+ secretName: alerts-tls
+ - name: alerts-proxy-secret
+ secret:
+ secretName: alerts-proxy
- name: alertmanager-data
{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
persistentVolumeClaim:
@@ -239,7 +289,7 @@ spec:
{% else %}
emptydir: {}
{% endif %}
- - name: alert-buffer-data
+ - name: alerts-data
{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
persistentVolumeClaim:
claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2
index 63430f834..005c2c564 100644
--- a/roles/openshift_prometheus/templates/prometheus.yml.j2
+++ b/roles/openshift_prometheus/templates/prometheus.yml.j2
@@ -1,10 +1,5 @@
rule_files:
- - 'prometheus.rules'
-{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
- - 'prometheus.additional.rules'
-{% endif %}
-
-
+ - '*.rules'
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
@@ -39,31 +34,11 @@ scrape_configs:
action: keep
regex: default;kubernetes;https
-# Scrape config for nodes.
-#
-# Each node exposes a /metrics endpoint that contains operational metrics for
-# the Kubelet and other components.
-- job_name: 'kubernetes-nodes'
-
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
- kubernetes_sd_configs:
- - role: node
-
- relabel_configs:
- - action: labelmap
- regex: __meta_kubernetes_node_label_(.+)
-
# Scrape config for controllers.
#
# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
# the controllers.
#
-# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
-# endpoints.
- job_name: 'kubernetes-controllers'
scheme: https
@@ -87,6 +62,27 @@ scrape_configs:
regex: (.+)(?::\d+)
replacement: $1:8444
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ kubernetes_sd_configs:
+ - role: node
+ # Drop a very high cardinality metric that is incorrect in 3.7. It will be
+ # fixed in 3.9.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+
# Scrape config for cAdvisor.
#
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -107,6 +103,14 @@ scrape_configs:
kubernetes_sd_configs:
- role: node
+ # Exclude a set of high cardinality metrics that can contribute to significant
+ # memory use in large clusters. These can be selectively enabled as necessary
+ # for medium or small clusters.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
+
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
@@ -133,38 +137,101 @@ scrape_configs:
- role: endpoints
relabel_configs:
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
- action: keep
- regex: true
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
- action: replace
- target_label: __scheme__
- regex: (https?)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ # only scrape infrastructure components
+ - source_labels: [__meta_kubernetes_namespace]
+ action: keep
+ regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
+ # drop infrastructure components managed by other scrape targets
+ - source_labels: [__meta_kubernetes_service_name]
+ action: drop
+ regex: 'prometheus-node-exporter'
+ # only those that have requested scraping
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ action: replace
+ target_label: __address__
+ regex: (.+)(?::\d+);(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_service_name]
+ action: replace
+ target_label: kubernetes_name
+
+# Scrape config for node-exporter, which is expected to be running on port 9100.
+- job_name: 'kubernetes-nodes-exporter'
+
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+ kubernetes_sd_configs:
+ - role: node
+
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_cpu|node_(disk|scrape_collector)_.+'
+ # preserve a subset of the network, netstat, vmstat, and filesystem series
+ - source_labels: [__name__]
action: replace
- target_label: __metrics_path__
- regex: (.+)
- - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
+ target_label: __name__
+ replacement: renamed_$1
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_(netstat|vmstat|filesystem|network)_.+'
+ - source_labels: [__name__]
action: replace
+ regex: 'renamed_(.+)'
+ target_label: __name__
+ replacement: $1
+ # drop any partial expensive series
+ - source_labels: [__name__, device]
+ action: drop
+ regex: 'node_network_.+;veth.+'
+ - source_labels: [__name__, mountpoint]
+ action: drop
+ regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
+
+ relabel_configs:
+ - source_labels: [__address__]
+ regex: '(.*):10250'
+ replacement: '${1}:9100'
target_label: __address__
- regex: (.+)(?::\d+);(\d+)
- replacement: $1:$2
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
- action: replace
- target_label: __basic_auth_username__
- regex: (.+)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
- action: replace
- target_label: __basic_auth_password__
- regex: (.+)
+ - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
+ target_label: __instance__
- action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - source_labels: [__meta_kubernetes_namespace]
- action: replace
- target_label: kubernetes_namespace
- - source_labels: [__meta_kubernetes_service_name]
- action: replace
- target_label: kubernetes_name
+ regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for the template service broker
+- job_name: 'openshift-template-service-broker'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+ server_name: apiserver.openshift-template-service-broker.svc
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: openshift-template-service-broker;apiserver;https
+
alerting:
alertmanagers: