diff options
Diffstat (limited to 'roles')
-rw-r--r-- | roles/docker/handlers/main.yml | 2 | ||||
-rw-r--r-- | roles/docker/tasks/main.yml | 10 | ||||
-rw-r--r-- | roles/etcd/tasks/main.yml | 6 | ||||
-rw-r--r-- | roles/openshift_docker_facts/tasks/main.yml | 17 | ||||
-rwxr-xr-x | roles/openshift_facts/library/openshift_facts.py | 5 | ||||
-rw-r--r-- | roles/openshift_facts/tasks/main.yml | 3 | ||||
-rw-r--r-- | roles/openshift_master/handlers/main.yml | 3 | ||||
-rw-r--r-- | roles/openshift_master/tasks/main.yml | 3 | ||||
-rw-r--r-- | roles/openshift_master/templates/docker-cluster/atomic-openshift-master-api.service.j2 | 2 | ||||
-rw-r--r-- | roles/openshift_master/templates/docker-cluster/atomic-openshift-master-controllers.service.j2 | 2 | ||||
-rw-r--r-- | roles/openshift_master/templates/docker/master.docker.service.j2 | 2 | ||||
-rw-r--r-- | roles/openshift_node/tasks/main.yml | 4 | ||||
-rw-r--r-- | roles/openshift_node/templates/openshift.docker.node.service | 2 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 117 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_node.yml | 12 |
15 files changed, 121 insertions, 69 deletions
diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml index 9f827417f..aff905bc8 100644 --- a/roles/docker/handlers/main.yml +++ b/roles/docker/handlers/main.yml @@ -4,7 +4,7 @@ service: name: docker state: restarted - when: not docker_service_status_changed | default(false) + when: not docker_service_status_changed | default(false) | bool - name: restart udev service: diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml index 506cecfea..9709c5014 100644 --- a/roles/docker/tasks/main.yml +++ b/roles/docker/tasks/main.yml @@ -49,16 +49,16 @@ dest: /etc/sysconfig/docker regexp: '^{{ item.reg_conf_var }}=.*$' line: "{{ item.reg_conf_var }}='{{ item.reg_fact_val | oo_prepend_strings_in_list(item.reg_flag ~ ' ') | join(' ') }}'" - when: item.reg_fact_val is defined and docker_check.stat.isreg + when: item.reg_fact_val != '' and docker_check.stat.isreg with_items: - reg_conf_var: ADD_REGISTRY - reg_fact_val: "{{ docker_additional_registries }}" + reg_fact_val: "{{ docker_additional_registries | default(None, true)}}" reg_flag: --add-registry - reg_conf_var: BLOCK_REGISTRY - reg_fact_val: "{{ docker_blocked_registries }}" + reg_fact_val: "{{ docker_blocked_registries| default(None, true) }}" reg_flag: --block-registry - reg_conf_var: INSECURE_REGISTRY - reg_fact_val: "{{ docker_insecure_registries }}" + reg_fact_val: "{{ docker_insecure_registries| default(None, true) }}" reg_flag: --insecure-registry notify: - restart docker @@ -76,3 +76,5 @@ when: docker_check.stat.isreg notify: - restart docker + +- meta: flush_handlers diff --git a/roles/etcd/tasks/main.yml b/roles/etcd/tasks/main.yml index d6956de71..064544b03 100644 --- a/roles/etcd/tasks/main.yml +++ b/roles/etcd/tasks/main.yml @@ -36,8 +36,12 @@ state: stopped enabled: no +- name: Check for etcd service presence + command: systemctl show etcd.service + register: etcd_show + - name: Mask system etcd when containerized - when: openshift.common.is_containerized | bool + when: openshift.common.is_containerized | bool and 'LoadState=not-found' not in etcd_show.stdout command: systemctl mask etcd - name: Reload systemd units diff --git a/roles/openshift_docker_facts/tasks/main.yml b/roles/openshift_docker_facts/tasks/main.yml index ad7ad3748..26b46aa94 100644 --- a/roles/openshift_docker_facts/tasks/main.yml +++ b/roles/openshift_docker_facts/tasks/main.yml @@ -13,11 +13,9 @@ log_options: "{{ openshift_docker_log_options | default(None) }}" options: "{{ openshift_docker_options | default(None) }}" disable_push_dockerhub: "{{ openshift_disable_push_dockerhub | default(None) }}" - - role: node - local_facts: - portal_net: "{{ openshift_master_portal_net | default(None) }}" + hosted_registry_insecure: "{{ openshift_docker_hosted_registry_insecure | default(None) }}" + hosted_registry_network: "{{ openshift_docker_hosted_registry_network | default(None) }}" -# TODO: append openshift.node.portal_net to docker_insecure_registries - set_fact: docker_additional_registries: "{{ openshift.docker.additional_registries | default(omit) }}" @@ -27,6 +25,15 @@ | default(omit) }}" docker_log_driver: "{{ openshift.docker.log_driver | default(omit) }}" docker_log_options: "{{ openshift.docker.log_options | default(omit) }}" - docker_options: "{{ openshift.docker.options | default(omit) }}" docker_push_dockerhub: "{{ openshift.docker.disable_push_dockerhub | default(omit) }}" + +- set_fact: + docker_options: > + --insecure-registry={{ openshift.docker.hosted_registry_network }} + {{ openshift.docker.options | default ('') }} + when: openshift.docker.hosted_registry_insecure | default(False) | bool + +- set_fact: + docker_options: "{{ openshift.docker.options | default(omit) }}" + when: not openshift.docker.hosted_registry_insecure | default(False) | bool diff --git a/roles/openshift_facts/library/openshift_facts.py b/roles/openshift_facts/library/openshift_facts.py index b06900681..263daf210 100755 --- a/roles/openshift_facts/library/openshift_facts.py +++ b/roles/openshift_facts/library/openshift_facts.py @@ -50,6 +50,10 @@ def migrate_docker_facts(facts): old_param = 'docker_' + param if old_param in facts[role]: facts['docker'][param] = facts[role].pop(old_param) + + if 'node' in facts and 'portal_net' in facts['node']: + facts['docker']['hosted_registry_insecure'] = True + facts['docker']['hosted_registry_network'] = facts['node'].pop('portal_net') return facts def migrate_local_facts(facts): @@ -1402,7 +1406,6 @@ class OpenShiftFacts(object): if 'node' in roles: defaults['node'] = dict(labels={}, annotations={}, - portal_net='172.30.0.0/16', iptables_sync_period='5s', set_node_ip=False) diff --git a/roles/openshift_facts/tasks/main.yml b/roles/openshift_facts/tasks/main.yml index 89ddb0d98..50e7e5747 100644 --- a/roles/openshift_facts/tasks/main.yml +++ b/roles/openshift_facts/tasks/main.yml @@ -25,7 +25,8 @@ openshift_facts: role: common local_facts: - deployment_type: "{{ openshift_deployment_type }}" + # TODO: Deprecate deployment_type in favor of openshift_deployment_type + deployment_type: "{{ openshift_deployment_type | default(deployment_type) }}" cluster_id: "{{ openshift_cluster_id | default('default') }}" hostname: "{{ openshift_hostname | default(None) }}" ip: "{{ openshift_ip | default(None) }}" diff --git a/roles/openshift_master/handlers/main.yml b/roles/openshift_master/handlers/main.yml index 4d1216aae..e5b9e4977 100644 --- a/roles/openshift_master/handlers/main.yml +++ b/roles/openshift_master/handlers/main.yml @@ -17,7 +17,8 @@ # Using curl here since the uri module requires python-httplib2 and # wait_for port doesn't provide health information. command: > - curl -k --silent {{ openshift.master.api_url }}/healthz/ready + curl --silent --cacert {{ openshift.common.config_base }}/master/ca.crt + {{ openshift.master.api_url }}/healthz/ready register: api_available_output until: api_available_output.stdout == 'ok' retries: 120 diff --git a/roles/openshift_master/tasks/main.yml b/roles/openshift_master/tasks/main.yml index 1f499dc93..9c3d09d09 100644 --- a/roles/openshift_master/tasks/main.yml +++ b/roles/openshift_master/tasks/main.yml @@ -282,7 +282,8 @@ # Using curl here since the uri module requires python-httplib2 and # wait_for port doesn't provide health information. command: > - curl -k --silent {{ openshift.master.api_url }}/healthz/ready + curl --silent --cacert {{ openshift.common.config_base }}/master/ca.crt + {{ openshift.master.api_url }}/healthz/ready register: api_available_output until: api_available_output.stdout == 'ok' retries: 120 diff --git a/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-api.service.j2 b/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-api.service.j2 index f777f7657..6a21a04ab 100644 --- a/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-api.service.j2 +++ b/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-api.service.j2 @@ -12,7 +12,7 @@ Requires=docker.service EnvironmentFile=/etc/sysconfig/{{ openshift.common.service_type }}-master-api Environment=GOTRACEBACK=crash ExecStartPre=-/usr/bin/docker rm -f {{ openshift.common.service_type}}-master-api -ExecStart=/usr/bin/docker run --rm --privileged --net=host --name {{ openshift.common.service_type }}-master-api -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v /var/run/docker.sock:/var/run/docker.sock -v {{ openshift.common.config_base }}:{{ openshift.common.config_base }} {{ openshift.master.master_image }}:${IMAGE_VERSION} start master api --config=${CONFIG_FILE} $OPTIONS +ExecStart=/usr/bin/docker run --rm --privileged --net=host --name {{ openshift.common.service_type }}-master-api --env-file=/etc/sysconfig/{{ openshift.common.service_type }}-master-api -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v /var/run/docker.sock:/var/run/docker.sock -v {{ openshift.common.config_base }}:{{ openshift.common.config_base }} {{ openshift.master.master_image }}:${IMAGE_VERSION} start master api --config=${CONFIG_FILE} $OPTIONS ExecStartPost=/usr/bin/sleep 10 ExecStop=/usr/bin/docker stop {{ openshift.common.service_type }}-master-api LimitNOFILE=131072 diff --git a/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-controllers.service.j2 b/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-controllers.service.j2 index 4fc48ba15..69f68d843 100644 --- a/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-controllers.service.j2 +++ b/roles/openshift_master/templates/docker-cluster/atomic-openshift-master-controllers.service.j2 @@ -11,7 +11,7 @@ PartOf=docker.service EnvironmentFile=/etc/sysconfig/{{ openshift.common.service_type }}-master-controllers Environment=GOTRACEBACK=crash ExecStartPre=-/usr/bin/docker rm -f {{ openshift.common.service_type}}-master-controllers -ExecStart=/usr/bin/docker run --rm --privileged --net=host --name {{ openshift.common.service_type }}-master-controllers -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v /var/run/docker.sock:/var/run/docker.sock -v {{ openshift.common.config_base }}:{{ openshift.common.config_base }} {{ openshift.master.master_image }}:${IMAGE_VERSION} start master controllers --config=${CONFIG_FILE} $OPTIONS +ExecStart=/usr/bin/docker run --rm --privileged --net=host --name {{ openshift.common.service_type }}-master-controllers --env-file=/etc/sysconfig/{{ openshift.common.service_type }}-master-controllers -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v /var/run/docker.sock:/var/run/docker.sock -v {{ openshift.common.config_base }}:{{ openshift.common.config_base }} {{ openshift.master.master_image }}:${IMAGE_VERSION} start master controllers --config=${CONFIG_FILE} $OPTIONS ExecStartPost=/usr/bin/sleep 10 ExecStop=/usr/bin/docker stop {{ openshift.common.service_type }}-master-controllers LimitNOFILE=131072 diff --git a/roles/openshift_master/templates/docker/master.docker.service.j2 b/roles/openshift_master/templates/docker/master.docker.service.j2 index e9f4a4d21..b714fdeb7 100644 --- a/roles/openshift_master/templates/docker/master.docker.service.j2 +++ b/roles/openshift_master/templates/docker/master.docker.service.j2 @@ -8,7 +8,7 @@ Wants=etcd_container.service [Service] EnvironmentFile=/etc/sysconfig/{{ openshift.common.service_type }}-master ExecStartPre=-/usr/bin/docker rm -f {{ openshift.common.service_type }}-master -ExecStart=/usr/bin/docker run --rm --privileged --net=host --name {{ openshift.common.service_type }}-master -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v /var/run/docker.sock:/var/run/docker.sock -v {{ openshift.common.config_base }}:{{ openshift.common.config_base }} {{ openshift.master.master_image }}:${IMAGE_VERSION} start master --config=${CONFIG_FILE} $OPTIONS +ExecStart=/usr/bin/docker run --rm --privileged --net=host --name {{ openshift.common.service_type }}-master --env-file=/etc/sysconfig/{{ openshift.common.service_type }}-master -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v /var/run/docker.sock:/var/run/docker.sock -v {{ openshift.common.config_base }}:{{ openshift.common.config_base }} {{ openshift.master.master_image }}:${IMAGE_VERSION} start master --config=${CONFIG_FILE} $OPTIONS ExecStartPost=/usr/bin/sleep 10 ExecStop=/usr/bin/docker stop {{ openshift.common.service_type }}-master Restart=always diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index 8768d426b..4b5832ab7 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -23,7 +23,6 @@ iptables_sync_period: "{{ openshift_node_iptables_sync_period | default(None) }}" kubelet_args: "{{ openshift_node_kubelet_args | default(None) }}" labels: "{{ lookup('oo_option', 'openshift_node_labels') | default( openshift_node_labels | default(none), true) }}" - portal_net: "{{ openshift_master_portal_net | default(None) }}" registry_url: "{{ oreg_url | default(none) }}" schedulable: "{{ openshift_schedulable | default(openshift_scheduleable) | default(None) }}" sdn_mtu: "{{ openshift_node_sdn_mtu | default(None) }}" @@ -126,7 +125,8 @@ # Using curl here since the uri module requires python-httplib2 and # wait_for port doesn't provide health information. command: > - curl -k --silent {{ openshift_node_master_api_url }}/healthz/ready + curl --silent --cacert {{ openshift.common.config_base }}/node/ca.crt + {{ openshift_node_master_api_url }}/healthz/ready register: api_available_output until: api_available_output.stdout == 'ok' retries: 120 diff --git a/roles/openshift_node/templates/openshift.docker.node.service b/roles/openshift_node/templates/openshift.docker.node.service index fa14cd770..53b1d6230 100644 --- a/roles/openshift_node/templates/openshift.docker.node.service +++ b/roles/openshift_node/templates/openshift.docker.node.service @@ -12,7 +12,7 @@ Wants={{ openshift.common.service_type }}-master.service [Service] EnvironmentFile=/etc/sysconfig/{{ openshift.common.service_type }}-node ExecStartPre=-/usr/bin/docker rm -f {{ openshift.common.service_type }}-node -ExecStart=/usr/bin/docker run --name {{ openshift.common.service_type }}-node --rm --privileged --net=host --pid=host -v /:/rootfs:ro -e CONFIG_FILE=${CONFIG_FILE} -e OPTIONS=${OPTIONS} -e HOST=/rootfs -e HOST_ETC=/host-etc -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v {{ openshift.common.config_base }}/node:{{ openshift.common.config_base }}/node -v /etc/localtime:/etc/localtime:ro -v /etc/machine-id:/etc/machine-id:ro -v /run:/run -v /sys:/sys:ro -v /usr/bin/docker:/usr/bin/docker:ro -v /var/lib/docker:/var/lib/docker -v /lib/modules:/lib/modules -v /etc/origin/openvswitch:/etc/openvswitch -v /etc/origin/sdn:/etc/openshift-sdn -v /etc/systemd/system:/host-etc/systemd/system -v /var/log:/var/log {{ openshift.node.node_image }}:${IMAGE_VERSION} +ExecStart=/usr/bin/docker run --name {{ openshift.common.service_type }}-node --rm --privileged --net=host --pid=host --env-file=/etc/sysconfig/{{ openshift.common.service_type }}-node -v /:/rootfs:ro -e CONFIG_FILE=${CONFIG_FILE} -e OPTIONS=${OPTIONS} -e HOST=/rootfs -e HOST_ETC=/host-etc -v {{ openshift.common.data_dir }}:{{ openshift.common.data_dir }} -v {{ openshift.common.config_base }}/node:{{ openshift.common.config_base }}/node -v /etc/localtime:/etc/localtime:ro -v /etc/machine-id:/etc/machine-id:ro -v /run:/run -v /sys:/sys:ro -v /usr/bin/docker:/usr/bin/docker:ro -v /var/lib/docker:/var/lib/docker -v /lib/modules:/lib/modules -v /etc/origin/openvswitch:/etc/openvswitch -v /etc/origin/sdn:/etc/openshift-sdn -v /etc/systemd/system:/host-etc/systemd/system -v /var/log:/var/log {{ openshift.node.node_image }}:${IMAGE_VERSION} ExecStartPost=/usr/bin/sleep 10 ExecStop=/usr/bin/docker stop {{ openshift.common.service_type }}-node SyslogIdentifier={{ openshift.common.service_type }}-node diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index e36f23a2b..705066b35 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -6,258 +6,276 @@ g_template_openshift_master: applications: - Openshift Master key: openshift.master.app.create + + - key: openshift.master.app.build.create + description: "check the app create with a build process" + value_type: int + applications: + - Openshift Master + + - key: openshift.master.app.create.time + description: "check the time it takes app create with a build process" + value_type: float + applications: + - Openshift Master + + - key: openshift.master.app.build.time + description: "check the time it takes app build" + value_type: float + applications: + - Openshift Master - key: openshift.master.process.count description: Shows number of master processes running - type: int + value_type: int applications: - Openshift Master - key: openshift.master.api.ping description: "Verify that the Openshift API is up (uses the cluster API URL)" - type: int + value_type: int applications: - Openshift Master - key: openshift.master.local.api.ping description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)" - type: int + value_type: int applications: - Openshift Master - key: openshift.master.api.healthz description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz" - type: int + value_type: int data_type: bool applications: - Openshift Master - key: openshift.master.local.api.healthz description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz" - type: int + value_type: int data_type: bool applications: - Openshift Master - key: openshift.master.user.count description: Shows number of users in a cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pod.running.count description: Shows number of pods running - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pod.user.running.count description: Shows number of user pods running (non infrastructure pods) - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pod.total.count description: Shows total number of pods (running and non running) - type: int + value_type: int applications: - Openshift Master - key: openshift.master.node.count description: Shows the total number of nodes found in the Openshift Cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.project.count description: Shows number of projects on a cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pv.total.count description: Total number of Persistent Volumes in the Openshift Cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pv.available.count description: Total number of Available Persistent Volumes in the Openshift Cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pv.released.count description: Total number of Released Persistent Volumes in the Openshift Cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pv.bound.count description: Total number of Bound Persistent Volumes in the Openshift Cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.pv.failed.count description: Total number of Failed Persistent Volumes in the Openshift Cluster - type: int + value_type: int applications: - Openshift Master - key: openshift.master.skydns.port.open description: State of the SkyDNS port open and listening - type: int + value_type: int applications: - Openshift Master - key: openshift.master.skydns.query description: SkyDNS can be queried or not - type: int + value_type: int applications: - Openshift Master - key: openshift.master.etcd.create.success description: Show number of successful create actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.create.fail description: Show number of failed create actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.delete.success description: Show number of successful delete actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.delete.fail description: Show number of failed delete actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.get.success description: Show number of successful get actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.get.fail description: Show number of failed get actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.set.success description: Show number of successful set actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.set.fail description: Show number of failed set actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.update.success description: Show number of successful update actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.update.fail description: Show number of failed update actions - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.watchers description: Show number of etcd watchers - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.etcd.ping description: etcd ping - type: int + value_type: int applications: - Openshift Etcd - key: openshift.master.metric.ping description: "This check verifies that the https://master/metrics check is alive and communicating properly." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.nodesnotready.count description: "This check shows how many nodes in a cluster are in NotReady state." - type: int + value_type: int applications: - Openshift Master - key: openshift.master.nodesnotschedulable.count description: "This check shows how many nodes in a cluster are not schedulable." - type: int + value_type: int applications: - Openshift Master - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5 description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9 description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99 description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed." - type: int + value_type: int applications: - Openshift Master Metrics @@ -295,6 +313,13 @@ g_template_openshift_master: - 'Openshift Master process not running on {HOST.NAME}' priority: avg + - name: 'Application creation with build has failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.app.build.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.build.create.last(#2)}=1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: avg + - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' @@ -303,6 +328,14 @@ g_template_openshift_master: description: The application create loop has failed 4 or more times in the last hour priority: avg + - name: 'Application with build creation has failed multiple times in the last 2 hour on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.app.build.create.sum(2h)}>3' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + description: The application create loop has failed 4 or more times in the last hour + priority: avg + - name: 'Openshift Master API health check is failing on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml index 66bd3a147..9f84a2cdf 100644 --- a/roles/os_zabbix/vars/template_openshift_node.yml +++ b/roles/os_zabbix/vars/template_openshift_node.yml @@ -4,37 +4,37 @@ g_template_openshift_node: zitems: - key: openshift.node.process.count description: Shows number of OpenShift Node processes running - type: int + value_type: int applications: - Openshift Node - key: openshift.node.ovs.pids.count description: Shows number of ovs process ids running - type: int + value_type: int applications: - Openshift Node - key: openshift.node.ovs.ports.count description: Shows number of OVS ports defined - type: int + value_type: int applications: - Openshift Node - key: openshift.node.ovs.stray.rules description: Number of OVS stray rules found/removed - type: int + value_type: int applications: - Openshift Node - key: openshift.node.registry-pods.healthy_pct description: Shows the percentage of healthy registries in the cluster - type: int + value_type: int applications: - Openshift Node - key: openshift.node.registry.service.ping description: Ping docker-registry service from node - type: int + value_type: int applications: - Openshift Node |