summaryrefslogtreecommitdiffstats
path: root/roles/ands_monitor/templates
diff options
context:
space:
mode:
author Suren A. Chilingaryan <csa@suren.me> 2020-01-22 03:16:06 +0100
committer Suren A. Chilingaryan <csa@suren.me> 2020-01-22 03:16:06 +0100
commit 1e8153c2af051ce48d5aa08d3dbdc0d0970ea532 (patch)
tree 7bb1441a87521aa8c3c5524f95fa645850a6826e /roles/ands_monitor/templates
parent e0b1b53f21095707af87a095934e971d788a90c7 (diff)
download ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.tar.gz
ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.tar.bz2
ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.tar.xz
ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.zip
Document another problem with lost IPs and exhaustion of the SDN IP range
Diffstat (limited to 'roles/ands_monitor/templates')
-rwxr-xr-x roles/ands_monitor/templates/scripts/check_server_status.sh.j2 6
-rwxr-xr-x roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 17
2 files changed, 21 insertions, 2 deletions
diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
index c2849f4..e49ec97 100755
--- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
@@ -4,6 +4,8 @@ fs=`df -lm / | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -
datafs=`df -lm /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
mem=`free -g | grep "Mem" | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 7`
cpu=`uptime | sed -e "s/[[:space:]]/\n/g" -e s/,/./g | tail -n 1`
+max_cpu=$(cat /proc/cpuinfo | grep processor | tail -n 1 | cut -d ':' -f 2)
+cpu_usage=$(echo "100 * $cpu / ( $max_cpu + 1)" | bc) #"
if [ $fs -le 8192 ]; then
echo "Only $(($fs / 1024)) GB left in the root file system"
@@ -17,8 +19,8 @@ if [ $mem -le 16 ]; then
echo "The system is starving on memory, $mem GB left free"
fi
-if [ `echo "$cpu < 20" | bc` -eq 0 ]; then
- echo "The system is starving on cpu, $cpu is load average for the last 15 min"
+if [ `echo "$cpu_usage < 80" | bc` -eq 0 ]; then
+ echo "The system is starving on cpu, $cpu ($cpu_usage%) is load average for the last 15 min"
fi
vol=$(/opt/MegaRAID/storcli/storcli64 /c0/v0 show | grep -P "^0/0" | grep "Optl" | wc -l)
diff --git a/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 b/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2
new file mode 100755
index 0000000..c938121
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2
@@ -0,0 +1,17 @@
+#! /bin/bash
+
+host=$(uname -n)
+
+# Check node is in the cluster and we have permissions to access OpenShift
+oc get node "$host" &> /dev/null
+[ $? -ne 0 ] && { echo "Can't query node $host, check cluster configuration and permissions"; exit; }
+
+oc adm manage-node "$host" --schedulable=false &> /dev/null
+[ $? -ne 0 ] && { echo "Failed to disable scheduling on the node $host"; exit; }
+
+for hash in $(find /var/lib/cni/networks/openshift-sdn/* -mmin +120 -print0 | xargs -0 tail -n +1 | grep '^[A-Za-z0-9]*$' | cut -c 1-8); do if [ -z $(docker ps -a | grep $hash | awk '{print $1}') ]; then grep -ilr $hash ./; fi; done | xargs rm
+
+systemctl restart origin-node
+
+oc adm manage-node "$host" --schedulable=true &> /dev/null
+[ $? -ne 0 ] && echo "Failed to re-enable scheduling on the node $host"