3.7 KiB
3.7 KiB
2021-08-30
Researching recent sites outages
Researching recent Control Panel outages due to OOM-killed pods.
Checking the log configuration, we don't appear to be capturing either request durations or request sizes.
- Retrieve server status
- http://control-panel.service.production.consul/server-status
- http://control-panel.service.production.consul/server-status?auto
- Retrieve per-pod with a job
- Also grab memory usage by PID from /proc
# Runtime configuration.
# ENVIRONMENT and SERVICE may be overridden from the caller's environment;
# both end up in the metric path built by capture().
ENVIRONMENT=${ENVIRONMENT:-development}
SERVICE=${SERVICE:-aweber-classic}
# statsd endpoint; currently referenced only by the commented-out nc line in
# capture().
STATSD_HOST=statsd
STATSD_PORT=8125
# Scratch file that receives each pod's server-status response. Abort early
# if it cannot be created, since every later step reads or writes it.
TMPFILE=$(mktemp) || {
  printf 'mktemp failed; cannot create scratch file\n' >&2
  exit 1
}
# Remove the scratch file on every exit path (including interruption), not
# only when the script reaches its final line. rm -f keeps this harmless if
# the file has already been removed.
trap 'rm -f -- "${TMPFILE}"' EXIT
#######################################
# Look up one statistic in the status file and report it as a
# statsd-formatted metric line.
# Globals:
#   SERVICE, ENVIRONMENT (read) - components of the metric path prefix
#   TMPFILE (read)              - file containing "Key: value" lines
# Arguments:
#   $1 - statistic key as it appears before ": " in the status file
#   $2 - metric path appended after applications.<SERVICE>.<ENVIRONMENT>.
#   $3 - metric type suffix placed after "|" (e.g. "c")
# Outputs:
#   "Capturing metric <path>:<value>|<type>" on stdout; a warning on stderr
#   when the key has no value in the status file.
# Returns:
#   0 when a metric line was produced, 1 when the key was not found.
#######################################
capture() {
  local stat="$1"
  local path="$2"
  local type="$3"
  local fullpath="applications.${SERVICE}.${ENVIRONMENT}.${path}"
  local prefix="${stat}: "
  local value=""
  local line metric

  # Match the "<stat>: " prefix literally so characters in the key are never
  # interpreted as a regular expression; the first matching line wins.
  # The "|| [[ -n ... ]]" clause keeps a final line without a newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == "$prefix"* ]]; then
      value=${line#"$prefix"}
      break
    fi
  done < "$TMPFILE"

  # Without a value the metric would be the malformed "<path>:|<type>".
  if [[ -z "$value" ]]; then
    printf 'No value for "%s" in status output; skipping %s\n' \
      "$stat" "$fullpath" >&2
    return 1
  fi

  metric="${fullpath}:${value}|${type}"
  printf '%s\n' "Capturing metric ${metric}"
  # Sending to statsd is currently disabled:
  # echo "${metric}" | nc -w 1 -c ${STATSD_HOST} ${STATSD_PORT}
}
#######################################
# Report a statistic under the counters.apache.* metric namespace with the
# "c" type suffix by delegating to capture().
# Arguments:
#   $1 - statistic key to look up
#   $2 - metric name appended after "counters.apache."
#######################################
counter() {
  local -r stat_key=$1 metric_name=$2
  capture "${stat_key}" "counters.apache.${metric_name}" "c"
}
# Collect the Apache server-status figures from each control-panel pod in the
# "cp" namespace and hand them to counter().
#
# --no-headers keeps the "NAME ..." column-header row out of the pod list;
# without it the header is treated as a pod name.
CONTROL_PANEL_PODS=$(
  kubectl get pods -n cp -l app=control-panel --no-headers \
    | grep -v 'feature-branch\|sensu' \
    | awk '{ print $1 }'
)
# Word splitting on the list is intentional: one pod name per word.
for pod in $CONTROL_PANEL_PODS; do
  # Query the same namespace the pods were listed from, and read the IP field
  # directly instead of grepping rendered YAML.
  POD_IP=$(kubectl get pod "$pod" -n cp -o jsonpath='{.status.podIP}')
  if [[ -z "$POD_IP" ]]; then
    printf 'Skipping %s: no pod IP available\n' "$pod" >&2
    continue
  fi
  echo "Fetching status from ${pod} (${POD_IP})"
  # The URL is quoted because "?" is a glob character. --fail turns HTTP
  # errors into a non-zero exit so an error page is never parsed as metrics;
  # --max-time bounds the wait on an unresponsive pod.
  if ! curl --fail --silent --show-error --max-time 10 \
    "http://${POD_IP}/server-status?auto" > "$TMPFILE"; then
    printf 'Skipping %s: could not fetch server-status\n' "$pod" >&2
    continue
  fi
  counter "Load1" "load_1"
  counter "Load5" "load_5"
  counter "Load15" "load_15"
  counter "CPUUser" "cpu_user"
  counter "CPUSystem" "cpu_system"
  counter "CPUChildrenUser" "cpu_children_user"
  counter "CPUChildrenSystem" "cpu_children_system"
  counter "CPULoad" "cpu_load"
  counter "ReqPerSec" "requests_per_second"
  counter "BytesPerSec" "bytes_per_second"
  counter "BytesPerReq" "bytes_per_request"
  counter "DurationPerReq" "duration_per_request"
  counter "BusyWorkers" "busy_workers"
  counter "IdleWorkers" "idle_workers"
done
# -f: do not complain if the scratch file is already gone.
rm -f -- "$TMPFILE"
Pod: {} 10.51.12.43
Pod: {} 10.51.27.62
Pod: {} 10.51.20.19
Pod: {} 10.51.23.32
Pod: {} 10.51.13.57
Pod: {} 10.51.19.22
Pod: {} 10.51.21.18
Pod: {} 10.51.15.47
Capturing metric applications.aweber-classic.development.counters.apache.load_1:1.96|c
Capturing metric applications.aweber-classic.development.counters.apache.load_5:2.02|c
Capturing metric applications.aweber-classic.development.counters.apache.load_15:1.91|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_user:42.35|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_system:55.37|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_children_user:42393.1|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_children_system:9040.76|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_load:27.1413|c
Capturing metric applications.aweber-classic.development.counters.apache.requests_per_second:4.08449|c
Capturing metric applications.aweber-classic.development.counters.apache.bytes_per_second:18589.1|c
Capturing metric applications.aweber-classic.development.counters.apache.bytes_per_request:4551.15|c
Capturing metric applications.aweber-classic.development.counters.apache.duration_per_request:367.236|c
Capturing metric applications.aweber-classic.development.counters.apache.busy_workers:13|c
Capturing metric applications.aweber-classic.development.counters.apache.idle_workers:3|c