From ba144fab071258a97cf3c42a0defeb0aae41a353 Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan" <csa@suren.me>
Date: Sun, 6 Oct 2019 05:00:55 +0200
Subject: Document latest problems with docker images and resource
 reclamation, add docker performance checks in the monitoring scripts,
 helpers to filter the logs

---
 .../templates/scripts/check_server_status.sh.j2    | 29 +++++++++++++++++++++-
 .../templates/scripts/check_uptime_status.sh.j2    |  2 +-
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'roles/ands_monitor')

diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
index 0bef13c..c2849f4 100755
--- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
@@ -3,7 +3,7 @@
 fs=`df -lm / | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
 datafs=`df -lm /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
 mem=`free -g | grep "Mem" | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 7`
-cpu=`uptime | sed -e "s/[[:space:]]/\n/g" | tail -n 1`
+cpu=`uptime | sed -e "s/[[:space:]]/\n/g" -e s/,/./g | tail -n 1`
 
 if [ $fs -le 8192 ]; then
     echo "Only $(($fs / 1024)) GB left in the root file system"
@@ -53,3 +53,30 @@ ping -c 1 -W 2 8.8.8.8 &> /dev/null
 if [ $? -ne 0 ]; then
     echo "Networkign problems, can't ping Google's public DNS server"
 fi
+
+info=$(LC_ALL=C docker info)
+if [ -n "$info" ]; then
+    images=$(echo "$info" | grep -i images | grep -Po "\d+")
+    [ -n "$images" ] && images=$(docker images -a | wc -l)
+    c=$(echo "$info" | grep -i containers | grep -Po "\d+")
+    c_running=$(echo "$info" | grep -i containers -A 5 | grep -i running | grep -Po "\d+")
+    c_paused=$(echo "$info" | grep -i containers -A 5 | grep -i paused | grep -Po "\d+")
+    c_stopped=$(echo "$info" | grep -i containers -A 5 | grep -i stopped | grep -Po "\d+")
+
+    data_space=$(echo "$info" | grep -i "\bData Space Available" | grep -Po "[\d.]+\s+\w+")
+    data_size=$(echo "$data_space" | grep -Po "[\d.]+")
+    [ -n "$(echo $data_space | grep -P 'TB')" ] && data_size=$(echo "$data_size * 1024" | bc)
+    [ -z "$(echo $data_space | grep '[TG]B')" ] && data_size=0
+
+    metadata_space=$(echo "$info" | grep -i "\bMetadata Space Available" | grep -Po "[\d.]+\s+\w+")
+    metadata_size=$(echo "$metadata_space" | grep -Po "[\d.]+")
+    [ -n "$(echo $metadata_space | grep -P 'TB')" ] && metadata_size=$(echo "$metadata_size * 1024" | bc)
+    [ -z "$(echo $metadata_space | grep '[TG]B')" ] && metadata_size=0
+
+    [ $(echo "$data_size > 300" | bc) -eq 0 ] && echo "Docker Data Space is critically low ($data_space)"
+    [ $(echo "$metadata_size > 5" | bc) -eq 0 ] && echo "Docker Metadata Space is critically low ($metadata_space)"
+else
+    images=$(docker images -a | wc -l)
+    echo "docker info has timed out"
+fi
+[ "$images" -gt 1000 ] && echo "Too many docker images ($images) will cause severe scheduling penalties"
diff --git a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
index 7acac5f..df65c50 100755
--- a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
@@ -12,4 +12,4 @@ disks=$(/opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep "O
 data=`df -lh /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
 
 #echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load, pods: $pods"
-echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c - $(printf %4.1f ${load}), $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s"
+echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c, $(printf %4.1f ${load})%, $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s"
-- 
cgit v1.2.3