#!/usr/bin/env bash
# Watches a stable PSU-derived rail (default: in5 on it8628 chips) by
# learning each chip's baseline from the first BASELINE_SAMPLES and alerting
# when later samples deviate by more than DEVIATION_MV.
#
# Works for any rail that shouldn't swing under normal operation. For Vcore
# (which swings 600mV+ during P-state transitions on Threadripper) this
# approach is unsuitable — use in5 (+12V divided) or in7 (3VSB) instead.
#
# hwmon numbering is boot-order-dependent, so we resolve it per-line.
#
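# Optional mitigation hook (set MITIGATE_CMD) runs when a deviation fires —
# receives the chip, value, baseline, delta, and source timestamp on its argv.
# MITIGATE_CMD is executed as a single command word (it is not split into
# arguments). Use it to auto-throttle GPU power or CPU governor as an
# emergency response.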

set -o pipefail

# Log that is followed for new samples, and the file that receives this
# script's own start / INFO / WARN records.
LOG="${HOME}/apricot-crash.log"
ALERTS="${HOME}/apricot-rail-alerts.log"

# Tunables. Each keeps a value already present in the environment and only
# falls back to the default shown here when unset or empty.
: "${DEVIATION_MV:=30}"                 # alert when |sample - baseline| exceeds this
: "${BASELINE_SAMPLES:=20}"             # samples per chip used to learn a baseline
: "${RAIL_KEY:=in5}"                    # key of the value to watch
: "${CHIP_REGEX:=it8628/hwmon[0-9]+}"   # regex whose match identifies the chip
: "${MITIGATE_CMD:=}"                   # optional command run on each deviation

#######################################
# Validate that a variable holds a decimal integer no smaller than a floor,
# and rewrite it in canonical base-10 form (so "030" becomes 30 instead of
# being read as octal by later arithmetic).
# Arguments: $1 - NAME of the variable to check (not its value)
#            $2 - smallest accepted value
# Outputs:   a diagnostic on stderr when the value is rejected
# Returns:   0 if accepted (variable normalised), 1 otherwise
#######################################
require_uint() {
    local var_name=$1 floor=$2 raw=${!1}
    if [[ ! "$raw" =~ ^[0-9]+$ ]] || (( 10#$raw < floor )); then
        printf 'rail-watchdog: %s must be an integer >= %s (got %q)\n' \
            "$var_name" "$floor" "$raw" >&2
        return 1
    fi
    printf -v "$var_name" '%s' "$(( 10#$raw ))"
}

# Both values are used in arithmetic comparisons. A non-numeric or zero
# BASELINE_SAMPLES would mean no baseline is ever learned and every sample is
# compared against an empty one, so refuse to start instead.
require_uint DEVIATION_MV 0 || exit 2
require_uint BASELINE_SAMPLES 1 || exit 2

# Start banner: records the effective configuration in the alerts file.
printf '=== rail-watchdog start %s key=%s deviation=%smV baseline_samples=%s chip=%s mitigate=%s ===\n' \
    "$(date --iso-8601=ns)" "$RAIL_KEY" "$DEVIATION_MV" "$BASELINE_SAMPLES" "$CHIP_REGEX" "${MITIGATE_CMD:-<none>}" >> "$ALERTS"

# Record a warning. All arguments are joined via "$*" into one message, which
# is prefixed with a nanosecond ISO-8601 timestamp and the [WARN] tag, then
# appended to $ALERTS and copied to this script's stderr.
emit() {
    printf '%s [WARN] %s\n' "$(date --iso-8601=ns)" "$*" \
        | tee -a "${ALERTS}" >&2
}

# Record an informational message. All arguments are joined via "$*",
# prefixed with a nanosecond ISO-8601 timestamp and the [INFO] tag, and
# appended to $ALERTS only (nothing is written to the terminal).
info() {
    local record
    printf -v record '%s [INFO] %s' "$(date --iso-8601=ns)" "$*"
    printf '%s\n' "${record}" >> "${ALERTS}"
}

# Per-chip state, keyed by the chip string captured from each log line:
#   seen_count - number of matching samples seen so far
#   baseline   - learned baseline value
#   buffer     - space-separated samples collected while learning
declare -A seen_count baseline buffer

# Patterns applied to every log line. chip_re wraps CHIP_REGEX in a capture
# group; val_re captures the digits of " <RAIL_KEY>=<digits>" at end of line.
printf -v chip_re '(%s)' "${CHIP_REGEX}"
printf -v val_re ' %s=([0-9]+)$' "${RAIL_KEY}"

#######################################
# Print the median of a whitespace-separated list of numbers.
# For an even count the lower of the two middle values is printed.
# Arguments: $1 - the list, as a single string (e.g. "12010 12004 12016")
# Outputs:   the median on stdout; nothing when the list is empty
# Returns:   0 on empty input, otherwise the status of the sort/awk pipeline
#######################################
median_of() {
    local -a samples=()
    local count rank

    # Split into an array with read -a: words are separated on whitespace but,
    # unlike an unquoted $1, never glob-expanded against the current directory.
    # With -d '' read hits EOF and reports failure even though it filled the
    # array, hence the "|| true".
    read -r -d '' -a samples <<< "$1" || true

    count=${#samples[@]}
    (( count > 0 )) || return 0
    rank=$(( (count + 1) / 2 ))

    # awk reads to the end instead of exiting early, so sort cannot be hit by
    # SIGPIPE (which pipefail would turn into a non-zero status).
    printf '%s\n' "${samples[@]}" | sort -n | awk -v k="$rank" 'NR == k'
}

# Main loop: follow $LOG (new lines only) and run every matching sample
# through a per-chip two-phase state machine — first learn a baseline from
# BASELINE_SAMPLES readings, then alert on any later reading that differs from
# it by more than DEVIATION_MV.
tail -F -n 0 "$LOG" 2>/dev/null | while IFS= read -r line; do
    # Ignore lines that do not name a watched chip or do not carry the rail.
    [[ "$line" =~ $chip_re ]] || continue
    chip="${BASH_REMATCH[1]}"
    [[ "$line" =~ $val_re ]] || continue
    # Force base 10. A zero-padded reading such as "0980" would otherwise be
    # treated as octal by the arithmetic below: mis-valued at best, and an
    # arithmetic error when it contains the digits 8 or 9.
    val=$(( 10#${BASH_REMATCH[1]} ))
    # First space-delimited field of the line; reported as at= in alerts.
    src_ts="${line%% *}"

    n=$(( ${seen_count[$chip]:-0} + 1 ))
    seen_count[$chip]=$n

    # Phase 1 — learning: buffer the reading; once BASELINE_SAMPLES have been
    # collected for this chip, store their median as its baseline.
    if (( n <= BASELINE_SAMPLES )); then
        buffer[$chip]="${buffer[$chip]:+${buffer[$chip]} }$val"
        if (( n == BASELINE_SAMPLES )); then
            b=$(median_of "${buffer[$chip]}")
            baseline[$chip]=$b
            info "baseline_learned chip=${chip} key=${RAIL_KEY} baseline=${b}mV samples=${BASELINE_SAMPLES}"
            unset 'buffer[$chip]'
        fi
        continue
    fi

    # Phase 2 — watching: compare the absolute deviation against the limit.
    b="${baseline[$chip]}"
    dev=$(( val - b ))
    (( dev < 0 )) && dev=$(( -dev ))
    if (( dev > DEVIATION_MV )); then
        emit "rail_deviation chip=${chip} key=${RAIL_KEY} val=${val}mV baseline=${b}mV |Δ|=${dev}mV at=${src_ts}"
        if [[ -n "$MITIGATE_CMD" ]]; then
            # Detach mitigation so a slow command can't block alert delivery.
            # Its stdout and stderr are appended to the alerts file.
            "$MITIGATE_CMD" "$chip" "$val" "$b" "$dev" "$src_ts" >> "$ALERTS" 2>&1 &
        fi
    fi
done
