summary refs log tree commit diff stats
path: root/roles/os_zabbix
diff options
context:
space:
mode:
author Chengcheng Mu <chengcheng.mu@amadeus.com> 2015-08-18 10:46:23 +0200
committer Chengcheng Mu <chengcheng.mu@amadeus.com> 2015-10-01 13:30:03 +0200
commit d0b167bd075eda5ffa104229103dfad772e9f403 (patch)
tree 71d353ec2e001ae9b854d230099f0815fbeeb05a /roles/os_zabbix
parent 318ac6b9b65f42f032382114f35d3c9fa7f5610b (diff)
download openshift-d0b167bd075eda5ffa104229103dfad772e9f403.tar.gz
openshift-d0b167bd075eda5ffa104229103dfad772e9f403.tar.bz2
openshift-d0b167bd075eda5ffa104229103dfad772e9f403.tar.xz
openshift-d0b167bd075eda5ffa104229103dfad772e9f403.zip
fixed some issues to boot VM on GCE
corrected openshift master config; commented infra correct list and terminate, it was bugged in the case where no instances were terminated; Using openshift-sdn for gce; new join_node playbook for gce; openstack/hosts/nova.py is now taking the nova.ini of its directory and not the directory of execution of bin/cluster; add fix of ICMP reject rules; Avoid a recursive loop; Jenkins image was renamed; Default masters to t2.medium instead of t2.small; Fix a minor bug involving AWS ENV Keys * If a user forgot to set their AWS keys, we'd get a non-descriptive error about a variable not being set * This patch uses the correct variable so the error message is more informative; delete some fixes that are not needed anymore (selinux, iptables rules for sdn); GCE : all variables needed are in gce.ini, it will be used by bin/cluster (now check better the presence of gce.ini in the default place or use GCE_INI_PATH to locate it), also by gce.ini; openshift_node_labels : get from oo_option; fix syntax error in bin/cluster; fix lookup for openshift_node_labels; Adding desc, multiplier, and units to zabbix item; Adding capability to have descriptions on triggers; updated triggers and items to have better descriptions and multipliers; Move openshift_data_dir to a fact based on deployment_type: Previously this was being set to /var/lib/origin regardless of deployment_type, which isn't correct given that existing 'enterprise' and 'online' deployments would have been deployed with /var/lib/openshift; Verify again that ansible version is different than 1.9.0 and 1.9.0.1; bin/cluster does not take -a and -s anymore; fix master_public_api_url : using by default a correct url; Really fixed master public api url this time; Really fixed master public api url this time; uncommented infra deployment like before; fixed again masterpublicurl in a template; README_GCE.md : use GCE_INI_PATH in order to locate gce.ini, update description of gce.ini
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r-- roles/os_zabbix/vars/template_docker.yml 12
-rw-r--r-- roles/os_zabbix/vars/template_heartbeat.yml 2
-rw-r--r-- roles/os_zabbix/vars/template_openshift_master.yml 2
-rw-r--r-- roles/os_zabbix/vars/template_os_linux.yml 124
4 files changed, 85 insertions, 55 deletions
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
index a1cd3519e..395e054de 100644
--- a/roles/os_zabbix/vars/template_docker.yml
+++ b/roles/os_zabbix/vars/template_docker.yml
@@ -52,35 +52,35 @@ g_template_docker:
- Docker Storage
value_type: float
ztriggers:
- - description: 'docker.ping failed on {HOST.NAME}'
+ - name: 'docker.ping failed on {HOST.NAME}'
expression: '{Template Docker:docker.ping.max(#3)}<1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
priority: high
- - description: 'Docker storage is using LOOPBACK on {HOST.NAME}'
+ - name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
priority: high
- - description: 'Critically low docker storage data space on {HOST.NAME}'
+ - name: 'Critically low docker storage data space on {HOST.NAME}'
expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
priority: high
- - description: 'Critically low docker storage metadata space on {HOST.NAME}'
+ - name: 'Critically low docker storage metadata space on {HOST.NAME}'
expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
priority: high
# Put triggers that depend on other triggers here (deps must be created first)
- - description: 'Low docker storage data space on {HOST.NAME}'
+ - name: 'Low docker storage data space on {HOST.NAME}'
expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
dependencies:
- 'Critically low docker storage data space on {HOST.NAME}'
priority: average
- - description: 'Low docker storage metadata space on {HOST.NAME}'
+ - name: 'Low docker storage metadata space on {HOST.NAME}'
expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
dependencies:
diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml
index 798377cd9..8dbe0d0d6 100644
--- a/roles/os_zabbix/vars/template_heartbeat.yml
+++ b/roles/os_zabbix/vars/template_heartbeat.yml
@@ -7,7 +7,7 @@ g_template_heartbeat:
- Heartbeat
key: heartbeat.ping
ztriggers:
- - description: 'Heartbeat.ping has failed on {HOST.NAME}'
+ - name: 'Heartbeat.ping has failed on {HOST.NAME}'
expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1'
priority: avg
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index d2c1365b0..728423ac1 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -7,7 +7,7 @@ g_template_openshift_master:
- Openshift Master
key: create_app
ztriggers:
- - description: 'Application creation has failed on {HOST.NAME}'
+ - name: 'Application creation has failed on {HOST.NAME}'
expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
priority: avg
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
index 7c446cd85..3173c79b2 100644
--- a/roles/os_zabbix/vars/template_os_linux.yml
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -52,106 +52,135 @@ g_template_os_linux:
- Kernel
value_type: float
- - key: mem.freemem
+ - key: kernel.all.cpu.nice
applications:
- - Memory
+ - Kernel
value_type: int
- - key: kernel.all.cpu.nice
+ - key: kernel.all.load.1_minute
applications:
- Kernel
- value_type: int
+ value_type: float
- - key: mem.util.bufmem
+ - key: kernel.uname.version
applications:
- - Memory
- value_type: int
+ - Kernel
+ value_type: string
- - key: swap.used
+ - key: kernel.all.uptime
applications:
- - Memory
+ - Kernel
value_type: int
- - key: kernel.all.load.1_minute
+ - key: kernel.all.cpu.user
applications:
- Kernel
- value_type: float
+ value_type: int
- - key: kernel.uname.version
+ - key: kernel.uname.machine
applications:
- Kernel
value_type: string
- - key: swap.length
+ - key: hinv.ncpu
applications:
- - Memory
+ - Kernel
value_type: int
- - key: mem.physmem
+ - key: kernel.all.cpu.steal
applications:
- - Memory
+ - Kernel
value_type: int
- - key: kernel.all.uptime
+ - key: kernel.all.pswitch
applications:
- Kernel
value_type: int
- - key: swap.free
+ - key: kernel.uname.release
applications:
- - Memory
- value_type: int
+ - Kernel
+ value_type: string
- - key: mem.util.available
+ - key: proc.nprocs
applications:
- - Memory
+ - Kernel
value_type: int
- - key: mem.util.used
+ # Memory Items
+ - key: mem.freemem
applications:
- Memory
value_type: int
+ description: "PCP: free system memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.all.cpu.user
+ - key: mem.util.bufmem
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.uname.machine
+ - key: swap.used
applications:
- - Kernel
- value_type: string
+ - Memory
+ value_type: int
+ description: "PCP: swap used metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: hinv.ncpu
+ - key: swap.length
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: total swap available metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: mem.util.cached
+ - key: mem.physmem
applications:
- Memory
value_type: int
+ description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
+ multiplier: 1024
+ units: B
- - key: kernel.all.cpu.steal
+ - key: swap.free
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: swap free metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.all.pswitch
+ - key: mem.util.available
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: kernel.uname.release
+ - key: mem.util.used
applications:
- - Kernel
- value_type: string
+ - Memory
+ value_type: int
+ description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
- - key: proc.nprocs
+ - key: mem.util.cached
applications:
- - Kernel
+ - Memory
value_type: int
+ description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+ # Disk items
- key: filesys.full.xvda2
applications:
- Disk
@@ -163,32 +192,33 @@ g_template_os_linux:
value_type: float
ztriggers:
- - description: 'Filesystem: / has less than 10% free on {HOST.NAME}'
+ - name: 'Filesystem: / has less than 10% free on {HOST.NAME}'
expression: '{Template OS Linux:filesys.full.xvda2.last()}>90'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: warn
- - description: 'Filesystem: / has less than 5% free on {HOST.NAME}'
+ - name: 'Filesystem: / has less than 5% free on {HOST.NAME}'
expression: '{Template OS Linux:filesys.full.xvda2.last()}>95'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: high
- - description: 'Filesystem: /var has less than 10% free on {HOST.NAME}'
+ - name: 'Filesystem: /var has less than 10% free on {HOST.NAME}'
expression: '{Template OS Linux:filesys.full.xvda3.last()}>90'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: warn
- - description: 'Filesystem: /var has less than 5% free on {HOST.NAME}'
+ - name: 'Filesystem: /var has less than 5% free on {HOST.NAME}'
expression: '{Template OS Linux:filesys.full.xvda3.last()}>95'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
priority: high
- - description: 'Too many TOTAL processes on {HOST.NAME}'
+ - name: 'Too many TOTAL processes on {HOST.NAME}'
expression: '{Template OS Linux:proc.nprocs.last()}>5000'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
priority: warn
- - description: 'Lack of available memory on {HOST.NAME}'
- expression: '{Template OS Linux:mem.freemem.last()}<3000'
+ - name: 'Lack of available memory on {HOST.NAME}'
+ expression: '{Template OS Linux:mem.freemem.last()}<30720000'
url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
priority: warn
+ description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'