roam/daily/2021-08-30.org

104 lines
3.7 KiB
Org Mode
Raw Normal View History

2021-09-01 20:57:39 +00:00
:PROPERTIES:
:ID: 6109f700-5c45-4d09-9c58-6f57f6002a3d
:END:
#+title: 2021-08-30
* Researching recent site outages
Researching recent [[id:57ee2f00-9bcd-4e0f-8a77-ae1f2d4cda89][Control Panel]] outages due to OOM-killed pods.
Checking the log configuration, we don't appear to be capturing request
durations or request sizes.
- Retrieve server status
+ http://control-panel.service.production.consul/server-status
+ [[http://control-panel.service.production.consul/server-status?auto]]
+ Retrieve per-pod with a job
+ Also grab memory usage by PID from =/proc=
#+begin_src bash :results output
# Settings for the metric path and the statsd endpoint. Each one can be
# overridden from the caller's environment; the values after ":-" are the
# defaults used when the variable is unset or empty.
ENVIRONMENT=${ENVIRONMENT:-development}
SERVICE=${SERVICE:-aweber-classic}
STATSD_HOST=${STATSD_HOST:-statsd}
STATSD_PORT=${STATSD_PORT:-8125}

# Scratch file that holds the most recently fetched status document.
# Abort early if it cannot be created rather than continuing with an empty
# path, and remove it on every exit path (including interruption).
TMPFILE=$(mktemp) || {
  echo "Unable to create a temporary file" >&2
  exit 1
}
trap 'rm -f -- "$TMPFILE"' EXIT
#######################################
# Look up one "STAT: value" line in the status file and report it as a
# metric line of the form
#   applications.<SERVICE>.<ENVIRONMENT>.<path>:<value>|<type>
# Globals:   SERVICE, ENVIRONMENT, TMPFILE (read)
# Arguments: $1 - key to look for at the start of a line in $TMPFILE
#            $2 - metric path appended after the service/environment prefix
#            $3 - metric type suffix placed after the "|"
# Outputs:   "Capturing metric ..." on stdout when a value is found;
#            a warning on stderr when the key is absent or has no value
# Returns:   0 in both cases, so one missing stat never stops the caller
#######################################
capture() {
  local stat="$1"
  local path="$2"
  local type="$3"
  local fullpath="applications.${SERVICE}.${ENVIRONMENT}.${path}"
  local value

  # Match the key as a literal line prefix (not a regex) and stop at the
  # first hit so the value is always a single line.
  value=$(awk -v prefix="${stat}: " \
    'index($0, prefix) == 1 { print substr($0, length(prefix) + 1); exit }' \
    "$TMPFILE")

  # Without a value the metric line would be malformed ("path:|type").
  if [[ -z "$value" ]]; then
    echo "No value for ${stat} in ${TMPFILE}; skipping ${fullpath}" >&2
    return 0
  fi

  local metric="${fullpath}:${value}|${type}"
  echo "Capturing metric ${metric}"
  # echo "${metric}" | nc -w 1 -c ${STATSD_HOST} ${STATSD_PORT}
}
# counter STAT LEAF
#
# Convenience wrapper around capture: reports STAT under the metric path
# "counters.apache.LEAF" with the type suffix "c".
#   $1 - key passed through to capture unchanged
#   $2 - final path segment appended to "counters.apache."
# NOTE(review): several stats reported through this helper (load averages,
# worker counts) look like point-in-time readings; confirm that type "c" is
# what the receiving side expects for them.
counter() {
  local apache_stat="${1}" metric_leaf="${2}"
  capture "${apache_stat}" "counters.apache.${metric_leaf}" c
}
# List the control-panel pods in the "cp" namespace as "<name> <ip>" lines.
# --no-headers keeps the column header row from being treated as a pod, and
# asking for the IP in the same call avoids a second, namespace-less
# "kubectl get pod" lookup (and grepping YAML) for every pod.
if ! CONTROL_PANEL_PODS=$(kubectl get pods -n cp -l app=control-panel \
  --no-headers \
  -o custom-columns='NAME:.metadata.name,IP:.status.podIP'); then
  echo "Unable to list control-panel pods" >&2
  CONTROL_PANEL_PODS=""
fi

while read -r pod POD_IP; do
  # Skip blank lines plus feature-branch and sensu pods.
  case "$pod" in
    '' | *feature-branch* | *sensu*) continue ;;
  esac

  # kubectl prints "<none>" for a pod that has not been assigned an IP yet.
  if [[ -z "$POD_IP" || "$POD_IP" == "<none>" ]]; then
    echo "Skipping ${pod}: no pod IP assigned" >&2
    continue
  fi

  echo "Fetching status from ${pod} (${POD_IP})"
  # The URL is quoted so the "?" is never treated as a glob character.
  # -f makes HTTP errors fail instead of saving the error page, -sS hides the
  # progress meter but keeps error messages, and --max-time bounds the wait
  # on an unresponsive pod.
  if ! curl -fsS --max-time 10 "http://${POD_IP}/server-status?auto" > "$TMPFILE"; then
    echo "Failed to fetch status from ${pod} (${POD_IP}); skipping" >&2
    continue
  fi

  counter "Load1" "load_1"
  counter "Load5" "load_5"
  counter "Load15" "load_15"
  counter "CPUUser" "cpu_user"
  counter "CPUSystem" "cpu_system"
  counter "CPUChildrenUser" "cpu_children_user"
  counter "CPUChildrenSystem" "cpu_children_system"
  counter "CPULoad" "cpu_load"
  counter "ReqPerSec" "requests_per_second"
  counter "BytesPerSec" "bytes_per_second"
  counter "BytesPerReq" "bytes_per_request"
  counter "DurationPerReq" "duration_per_request"
  counter "BusyWorkers" "busy_workers"
  counter "IdleWorkers" "idle_workers"
done <<< "$CONTROL_PANEL_PODS"

# -f: do not complain if the scratch file is already gone.
rm -f -- "$TMPFILE"
#+end_src
#+RESULTS:
#+begin_example
Pod: {}
10.51.12.43
Pod: {}
10.51.27.62
Pod: {}
10.51.20.19
Pod: {}
10.51.23.32
Pod: {}
10.51.13.57
Pod: {}
10.51.19.22
Pod: {}
10.51.21.18
Pod: {}
10.51.15.47
Capturing metric applications.aweber-classic.development.counters.apache.load_1:1.96|c
Capturing metric applications.aweber-classic.development.counters.apache.load_5:2.02|c
Capturing metric applications.aweber-classic.development.counters.apache.load_15:1.91|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_user:42.35|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_system:55.37|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_children_user:42393.1|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_children_system:9040.76|c
Capturing metric applications.aweber-classic.development.counters.apache.cpu_load:27.1413|c
Capturing metric applications.aweber-classic.development.counters.apache.requests_per_second:4.08449|c
Capturing metric applications.aweber-classic.development.counters.apache.bytes_per_second:18589.1|c
Capturing metric applications.aweber-classic.development.counters.apache.bytes_per_request:4551.15|c
Capturing metric applications.aweber-classic.development.counters.apache.duration_per_request:367.236|c
Capturing metric applications.aweber-classic.development.counters.apache.busy_workers:13|c
Capturing metric applications.aweber-classic.development.counters.apache.idle_workers:3|c
#+end_example