#!/usr/bin/env bash
# Watches a stable PSU-derived rail (default: in5 on it8628 chips) by
# learning each chip's baseline as the median of its first BASELINE_SAMPLES
# samples and alerting when a later sample deviates from that baseline by
# more than DEVIATION_MV millivolts.
#
# Works for any rail that shouldn't swing under normal operation. For Vcore
# (which swings 600mV+ during P-state transitions on Threadripper) this
# approach is unsuitable — use in5 (+12V divided) or in7 (3VSB) instead.
#
# hwmon numbering is boot-order-dependent, so we resolve it per-line.
#
# Optional mitigation hook (set MITIGATE_CMD) runs when a deviation fires —
# it receives the chip, value, baseline, delta, and source timestamp on its
# argv. Use it to auto-throttle GPU power or the CPU governor as an emergency
# response.

set -o pipefail

# Log that is followed for samples, and the file receiving this watchdog's
# own records (startup banner, learned baselines, deviation warnings).
LOG="${HOME}/apricot-crash.log"
ALERTS="${HOME}/apricot-rail-alerts.log"

# Tunables; each keeps a value already present in the environment.
: "${DEVIATION_MV:=30}"               # alert when |sample - baseline| exceeds this (mV)
: "${BASELINE_SAMPLES:=20}"           # samples per chip used to learn its baseline
: "${RAIL_KEY:=in5}"                  # key of the trailing key=value field to watch
: "${CHIP_REGEX:=it8628/hwmon[0-9]+}" # regex matched against each line to identify the chip
: "${MITIGATE_CMD:=}"                 # optional executable run on every deviation

# is_uint_at_least VALUE MIN
# Succeeds only when VALUE is a plain decimal integer that is >= MIN.
# The 10# prefix keeps zero-padded input (e.g. 08) from being read as octal.
is_uint_at_least() {
  [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 >= $2 ))
}

# Both thresholds feed shell arithmetic later on; refuse to start with values
# that would silently misbehave there (BASELINE_SAMPLES=0 would mean a
# baseline is never learned).
if ! is_uint_at_least "$DEVIATION_MV" 0; then
  printf 'rail-watchdog: DEVIATION_MV must be a non-negative integer (got %q)\n' \
    "$DEVIATION_MV" >&2
  exit 2
fi
if ! is_uint_at_least "$BASELINE_SAMPLES" 1; then
  printf 'rail-watchdog: BASELINE_SAMPLES must be an integer >= 1 (got %q)\n' \
    "$BASELINE_SAMPLES" >&2
  exit 2
fi

# Normalise so later arithmetic never sees a zero-padded value.
DEVIATION_MV=$(( 10#$DEVIATION_MV ))
BASELINE_SAMPLES=$(( 10#$BASELINE_SAMPLES ))

# Startup banner: records the effective configuration in the alerts file.
printf '=== rail-watchdog start %s key=%s deviation=%smV baseline_samples=%s chip=%s mitigate=%s ===\n' \
  "$(date --iso-8601=ns)" "$RAIL_KEY" "$DEVIATION_MV" "$BASELINE_SAMPLES" "$CHIP_REGEX" "${MITIGATE_CMD:-<none>}" >> "$ALERTS"

# emit MESSAGE...
# Records a warning: joins all arguments into a single message (via "$*"),
# prefixes it with a nanosecond-precision ISO-8601 timestamp and a [WARN] tag,
# appends the line to $ALERTS and also writes it to stderr.
emit() {
  local ts msg="$*"
  ts=$(date --iso-8601=ns)
  # tee appends to the alerts file; its stdout copy is redirected to stderr.
  printf '%s [WARN] %s\n' "$ts" "$msg" | tee -a "$ALERTS" >&2
}

# info MESSAGE...
# Records an informational line: joins all arguments into a single message
# (via "$*"), prefixes it with a nanosecond-precision ISO-8601 timestamp and
# an [INFO] tag, and appends it to $ALERTS. Nothing is written to the terminal.
info() {
  local ts msg="$*"
  ts=$(date --iso-8601=ns)
  printf '%s [INFO] %s\n' "$ts" "$msg" >> "$ALERTS"
}

# Per-chip state, keyed by the chip string captured from each log line.
declare -A seen_count=()  # number of samples seen so far
declare -A baseline=()    # learned baseline value, set once learning completes
declare -A buffer=()      # space-separated samples collected while learning

# Regexes applied to every log line. The chip pattern is wrapped in a group
# so the match is available as BASH_REMATCH[1]; the value pattern requires
# " <RAIL_KEY>=<digits>" to be the last thing on the line.
chip_re="($CHIP_REGEX)"
val_re=" ${RAIL_KEY}=([0-9]+)$"

# median_of "V1 V2 ... Vn"
# Prints the median of a whitespace-separated list of numbers passed as one
# argument. For an even count the lower of the two middle values is printed
# (element int((n+1)/2) of the numerically sorted list). Prints nothing for
# an empty list.
median_of() {
  local -a samples=()
  # Split into an array instead of relying on an unquoted expansion, so no
  # pathname expansion can occur. -d '' reads through to end of input so
  # newlines separate values just like spaces; read reports non-zero at EOF,
  # which is expected here.
  IFS=$' \t\n' read -r -d '' -a samples <<< "$1" || true
  (( ${#samples[@]} > 0 )) || return 0
  # awk consumes all of sort's output (no early exit), so sort can never be
  # killed by SIGPIPE and trip pipefail.
  printf '%s\n' "${samples[@]}" \
    | sort -n \
    | awk '{ v[NR] = $0 } END { print v[int((NR + 1) / 2)] }'
}

# Main loop: follow $LOG from its current end (surviving rotation via -F) and
# evaluate every new line that names a matching chip and ends with the
# watched key=value field. The loop body runs in the pipeline's subshell,
# which is where all per-chip state is kept.
tail -F -n 0 "$LOG" 2>/dev/null | while IFS= read -r line; do
  [[ "$line" =~ $chip_re ]] || continue
  chip="${BASH_REMATCH[1]}"
  [[ "$line" =~ $val_re ]] || continue
  # Force base 10: the pattern admits zero-padded digits, which shell
  # arithmetic would otherwise read as octal (or reject, e.g. 089).
  val=$(( 10#${BASH_REMATCH[1]} ))
  # Text before the first space of the line is reported as the sample time.
  src_ts="${line%% *}"

  n=$(( ${seen_count[$chip]:-0} + 1 ))
  seen_count[$chip]=$n

  # Learning phase: collect the first BASELINE_SAMPLES values per chip and
  # fix the baseline at their median once the last one arrives.
  if (( n <= BASELINE_SAMPLES )); then
    buffer[$chip]="${buffer[$chip]:+${buffer[$chip]} }$val"
    if (( n == BASELINE_SAMPLES )); then
      b=$(median_of "${buffer[$chip]}")
      baseline[$chip]=$b
      info "baseline_learned chip=${chip} key=${RAIL_KEY} baseline=${b}mV samples=${BASELINE_SAMPLES}"
      unset 'buffer[$chip]'
    fi
    continue
  fi

  b="${baseline[$chip]:-}"
  # Without a learned baseline (e.g. BASELINE_SAMPLES < 1, or the median could
  # not be computed) the subtraction below would compare against 0 and fire
  # the mitigation hook on every sample; skip instead.
  [[ -n "$b" ]] || continue

  dev=$(( val - b ))
  if (( dev < 0 )); then
    dev=$(( -dev ))
  fi

  if (( dev > DEVIATION_MV )); then
    emit "rail_deviation chip=${chip} key=${RAIL_KEY} val=${val}mV baseline=${b}mV |Δ|=${dev}mV at=${src_ts}"
    if [[ -n "$MITIGATE_CMD" ]]; then
      # Detach mitigation so a slow command can't block alert delivery.
      # Its stdout and stderr are appended to the alerts file.
      "$MITIGATE_CMD" "$chip" "$val" "$b" "$dev" "$src_ts" >> "$ALERTS" 2>&1 &
    fi
  fi
done