diff options
author | Suren A. Chilingaryan <csa@suren.me> | 2020-01-22 03:16:06 +0100 |
---|---|---|
committer | Suren A. Chilingaryan <csa@suren.me> | 2020-01-22 03:16:06 +0100 |
commit | 1e8153c2af051ce48d5aa08d3dbdc0d0970ea532 (patch) | |
tree | 7bb1441a87521aa8c3c5524f95fa645850a6826e /roles/ands_monitor/templates | |
parent | e0b1b53f21095707af87a095934e971d788a90c7 (diff) | |
download | ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.tar.gz ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.tar.bz2 ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.tar.xz ands-1e8153c2af051ce48d5aa08d3dbdc0d0970ea532.zip |
Document another problem with lost IPs and exhaustion of the SDN IP range
Diffstat (limited to 'roles/ands_monitor/templates')
-rwxr-xr-x | roles/ands_monitor/templates/scripts/check_server_status.sh.j2 | 6 | ||||
-rwxr-xr-x | roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 | 17 |
2 files changed, 21 insertions, 2 deletions
diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
index c2849f4..e49ec97 100755
--- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
@@ -4,6 +4,8 @@ fs=`df -lm / | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -
 datafs=`df -lm /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
 mem=`free -g | grep "Mem" | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 7`
 cpu=`uptime | sed -e "s/[[:space:]]/\n/g" -e s/,/./g | tail -n 1`
+max_cpu=$(cat /proc/cpuinfo | grep processor | tail -n 1 | cut -d ':' -f 2)
+cpu_usage=$(echo "100 * $cpu / ( $max_cpu + 1)" | bc) #"
 
 if [ $fs -le 8192 ]; then
  echo "Only $(($fs / 1024)) GB left in the root file system"
@@ -17,8 +19,8 @@ if [ $mem -le 16 ]; then
  echo "The system is starving on memory, $mem GB left free"
 fi
 
-if [ `echo "$cpu < 20" | bc` -eq 0 ]; then
- echo "The system is starving on cpu, $cpu is load average for the last 15 min"
+if [ `echo "$cpu_usage < 80" | bc` -eq 0 ]; then
+ echo "The system is starving on cpu, $cpu ($cpu_usage%) is load average for the last 15 min"
 fi
 
 vol=$(/opt/MegaRAID/storcli/storcli64 /c0/v0 show | grep -P "^0/0" | grep "Optl" | wc -l)
diff --git a/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 b/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2
new file mode 100755
index 0000000..c938121
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2
@@ -0,0 +1,17 @@
+#! /bin/bash
+
+host=$(uname -n)
+
+# Check node is in the cluster and we have permissions to access OpenShift
+oc get node "$host" &> /dev/null
+[ $? -ne 0 ] && { echo "Can't query node $host, check cluster configuration and permissions"; exit; }
+
+oc adm manage-node "$host" --schedulable=false &> /dev/null
+[ $? -ne 0 ] && { echo "Failed to disable scheduling on the node $host"; exit; }
+
+for hash in $(find /var/lib/cni/networks/openshift-sdn/* -mmin +120 -print0 | xargs -0 tail -n +1 | grep '^[A-Za-z0-9]*$' | cut -c 1-8); do if [ -z $(docker ps -a | grep $hash | awk '{print $1}') ]; then grep -ilr $hash ./; fi; done | xargs rm
+
+systemctl restart origin-node
+
+oc adm manage-node "$host" --schedulable=true &> /dev/null
+[ $? -ne 0 ] && echo "Failed to re-nablee scheduling on the node $host" |