:PROPERTIES:
:ID:       6109f700-5c45-4d09-9c58-6f57f6002a3d
:END:
#+title: 2021-08-30
* Researching recent sites outages
Researching recent [[id:57ee2f00-9bcd-4e0f-8a77-ae1f2d4cda89][Control Panel]] outages due to OOM-killed pods.

Checking the log configuration, we don't appear to be capturing request durations nor request size.

- Retrieve server status
  + http://control-panel.service.production.consul/server-status
  + [[http://control-panel.service.production.consul/server-status?auto]]
  + Retrieve per-pod with a job
  + Also grab memory usage by PID from =/proc=

#+begin_src bash :results output
# Scrape the Apache server-status "?auto" output from every control-panel pod
# and format each field as a statsd metric line.
#
# This is still a dry run: the nc line in capture() is commented out, so the
# metrics are only echoed, never sent to ${STATSD_HOST}:${STATSD_PORT}.
ENVIRONMENT=${ENVIRONMENT:-development}
SERVICE=${SERVICE:-aweber-classic}
NAMESPACE=cp
STATSD_HOST=statsd
STATSD_PORT=8125

TMPFILE=$(mktemp) || exit 1
# Remove the scratch file on every exit path, not only after a clean run.
trap 'rm -f "$TMPFILE"' EXIT

# capture STAT PATH TYPE
#   STAT  field name as it appears at the start of a line in $TMPFILE
#   PATH  metric path appended to applications.$SERVICE.$ENVIRONMENT
#   TYPE  statsd type suffix placed after the "|"
# Echoes the formatted metric. When STAT is not present in $TMPFILE the metric
# is skipped instead of emitting a malformed "path:|type" line.
capture() {
    local stat="$1"
    local path="$2"
    local type="$3"
    local fullpath="applications.${SERVICE}.${ENVIRONMENT}.${path}"
    # Declared separately from the assignment so that `local` does not mask
    # the exit status of the command substitution.
    local value
    value=$(sed -n "s/^${stat}: //p" "$TMPFILE")
    if [ -z "$value" ]; then
        echo "Skipping metric ${fullpath}: no '${stat}' field in status output"
        return 0
    fi
    local metric="${fullpath}:${value}|${type}"
    echo "Capturing metric ${metric}"
    # echo "${metric}" | nc -w 1 -c ${STATSD_HOST} ${STATSD_PORT}
}

# counter STAT PATH
#   Captures STAT under counters.apache.PATH with statsd type "c".
# NOTE(review): every field captured below looks like a point-in-time value
# (load averages, worker counts, rates); statsd gauges ("g") may be a better
# fit than counters. Left as-is so metric names and types do not change;
# confirm before enabling the nc line.
counter() {
    local stat="$1"
    local path="$2"
    capture "$stat" "counters.apache.$path" "c"
}

# --no-headers keeps the "NAME" column header from being treated as a pod.
CONTROL_PANEL_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=control-panel --no-headers \
    | grep -v 'feature-branch\|sensu' \
    | awk '{ print $1 }')

for pod in $CONTROL_PANEL_PODS
do
    # Look the pod up in the same namespace it was listed from, and read the
    # IP straight from the API object instead of grepping the YAML dump.
    POD_IP=$(kubectl get pod "$pod" -n "$NAMESPACE" -o jsonpath='{.status.podIP}')
    if [ -z "$POD_IP" ]; then
        echo "Skipping ${pod}: no pod IP assigned"
        continue
    fi

    echo "Fetching status from ${pod} (${POD_IP})"
    # The URL is quoted because "?" is a glob character. -f makes HTTP errors
    # fail instead of saving the error page, -sS drops the progress meter but
    # keeps error messages, and --max-time stops an unresponsive pod from
    # hanging the whole run.
    if ! curl -sS -f --max-time 10 "http://${POD_IP}/server-status?auto" > "$TMPFILE"; then
        echo "Skipping ${pod}: could not fetch server-status"
        continue
    fi

    counter "Load1" "load_1"
    counter "Load5" "load_5"
    counter "Load15" "load_15"
    counter "CPUUser" "cpu_user"
    counter "CPUSystem" "cpu_system"
    counter "CPUChildrenUser" "cpu_children_user"
    counter "CPUChildrenSystem" "cpu_children_system"
    counter "CPULoad" "cpu_load"
    counter "ReqPerSec" "requests_per_second"
    counter "BytesPerSec" "bytes_per_second"
    counter "BytesPerReq" "bytes_per_request"
    counter "DurationPerReq" "duration_per_request"
    counter "BusyWorkers" "busy_workers"
    counter "IdleWorkers" "idle_workers"
done
#+end_src
#+RESULTS:
#+begin_example
Pod: {} 10.51.12.43
Pod: {} 10.51.27.62
Pod: {} 10.51.20.19
Pod: {} 10.51.23.32
Pod: {} 10.51.13.57
Pod: {} 10.51.19.22
Pod: {} 10.51.21.18
Pod: {} 10.51.15.47
Capturing metric applications.aweber-classic.development.counters.apache.load_1:1.96|c
Capturing metric applications.aweber-classic.development.counters.apache.load_5:2.02|c
Capturing metric applications.aweber-classic.development.counters.apache.load_15:1.91|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_user:42.35|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_system:55.37|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_children_user:42393.1|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_children_system:9040.76|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_load:27.1413|c
Capturing metric applications.aweber-classic.development.counters.apache.requests_per_second:4.08449|c
Capturing metric applications.aweber-classic.development.counters.apache.bytes_per_second:18589.1|c
Capturing metric applications.aweber-classic.development.counters.apache.bytes_per_request:4551.15|c
Capturing metric applications.aweber-classic.development.counters.apache.duration_per_request:367.236|c
Capturing metric applications.aweber-classic.development.counters.apache.busy_workers:13|c
Capturing metric applications.aweber-classic.development.counters.apache.idle_workers:3|c
#+end_example