#!/usr/bin/env bash
# Continuously appends power/thermal/voltage state to $LOG so that the last
# fractions of a second before a hard reset survive the crash.
#
# Env overrides:
#   LOG               output path (default ~/apricot-crash.log)
#   INTERVAL          sample period in seconds (default 0.1 = 10 Hz)
#   GPU_SAMPLE_EVERY  run nvidia-smi only on every Nth sample (default 10)
#   SENSOR_CHIPS      regex of hwmon name(s) to capture
#                     (default k10temp|nvme|it8628|nct6.*|w83.*)

set -o pipefail

#######################################
# Ensure the named variable holds a positive decimal integer; otherwise warn
# on stderr and overwrite it with the fallback.
# Arguments: $1 - name of the variable to check (and possibly rewrite)
#            $2 - fallback value
#######################################
require_positive_int() {
  local var_name=$1 fallback=$2
  # Leading zeros are rejected on purpose: bash arithmetic would read "08"
  # as an invalid octal literal.
  if [[ ! "${!var_name}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'warning: %s=%q is not a positive integer; using %s\n' \
      "$var_name" "${!var_name}" "$fallback" >&2
    printf -v "$var_name" '%s' "$fallback"
  fi
}

# Settings, each overridable from the environment.
LOG="${LOG:-${HOME}/apricot-crash.log}"
INTERVAL="${INTERVAL:-0.1}"
GPU_SAMPLE_EVERY="${GPU_SAMPLE_EVERY:-10}"  # nvidia-smi is slow; only invoke every Nth iter
SENSOR_CHIPS="${SENSOR_CHIPS:-k10temp|nvme|it8628|nct6.*|w83.*}"

# GPU_SAMPLE_EVERY is used as a modulus by the sampling loop, so 0 or a
# non-number would turn every iteration into an arithmetic error.
require_positive_int GPU_SAMPLE_EVERY 10

# Mark the start of this run in the log so separate sessions can be told apart.
printf '=== session start %s (pid=%s interval=%ss gpu_every=%s chips=%s) ===\n' \
  "$(date --iso-8601=ns)" "$$" "$INTERVAL" "$GPU_SAMPLE_EVERY" "$SENSOR_CHIPS" >> "$LOG"

# Paths of the /sys/class/hwmon/hwmonN directories whose `name` matches
# SENSOR_CHIPS. Resolved ahead of time (the sampling loop re-runs
# refresh_hwmons every few seconds) so that matching is not repeated on
# every sample.
declare -a HWMONS

#######################################
# Rebuild HWMONS from the hwmon devices currently present.
# Globals:   SENSOR_CHIPS (read)  - alternation of chip-name regexes,
#                                   matched anchored against the whole name
#            HWMONS (written)     - replaced with the matching directories
# Arguments: none
#######################################
refresh_hwmons() {
  # Keep the loop variables local: the sampling loop uses globals with the
  # same names and must not have them overwritten by a refresh.
  local h name
  HWMONS=()
  for h in /sys/class/hwmon/hwmon*; do
    # Also skips the literal pattern when the glob matches nothing.
    [[ -d "$h" && -r "$h/name" ]] || continue
    name=$(<"$h/name")  # bash builtin — no fork
    [[ "$name" =~ ^(${SENSOR_CHIPS})$ ]] || continue
    HWMONS+=("$h")
  done
}

refresh_hwmons
last_refresh=$SECONDS
last_sync=$SECONDS
iter=0

# Probe for nvidia-smi once: on hosts without it there is no point paying a
# failed fork+exec on every GPU sampling tick.
have_nvidia_smi=0
if command -v nvidia-smi >/dev/null 2>&1; then
  have_nvidia_smi=1
fi

# Main sampling loop. Everything printed inside it reaches $LOG through the
# single append redirection on `done`, so the log is opened only once.
while :; do
  ts=$(date --iso-8601=ns)

  # GPU telemetry — skip most iterations because nvidia-smi startup is
  # ~300-500ms, which would cap the loop at ~2 Hz otherwise.
  if (( have_nvidia_smi && iter % GPU_SAMPLE_EVERY == 0 )); then
    while IFS= read -r gpu_line; do
      printf '%s gpu %s\n' "$ts" "$gpu_line"
    done < <(nvidia-smi \
      --query-gpu=index,temperature.gpu,power.draw,clocks.gr,clocks.mem,pstate,utilization.gpu,memory.used \
      --format=csv,noheader,nounits 2>/dev/null)
  fi
  iter=$(( iter + 1 ))

  # Platform sensors — use $(<file) bash builtin everywhere to avoid
  # fork+exec per-read. With ~60 sensor files that's the difference
  # between ~600ms per iteration and <20ms.
  for h in "${HWMONS[@]}"; do
    [[ -r "$h/name" ]] || continue  # chip may have gone away since the last refresh
    name=$(<"$h/name")
    hb=${h##*/}                     # e.g. "hwmon3"
    for inp in "$h"/temp*_input "$h"/in*_input "$h"/fan*_input "$h"/curr*_input; do
      [[ -r "$inp" ]] || continue   # also skips glob patterns that matched nothing
      n=${inp##*/}
      n=${n%_input}                 # e.g. "temp1"
      # Prefer the driver-provided label; fall back to the channel name.
      label_file="$h/${n}_label"
      if [[ -r "$label_file" ]]; then
        label=$(<"$label_file")
      else
        label=$n
      fi
      raw=$(<"$inp")
      printf '%s sensor %s/%s %s=%s\n' "$ts" "$name" "$hb" "$label" "$raw"
    done
  done

  # Refresh hwmon list every ~5s in case modules load/unload.
  if (( SECONDS - last_refresh > 5 )); then
    refresh_hwmons
    last_refresh=$SECONDS
  fi

  # Fsync once per second regardless of sample rate (amortized). Keyed on
  # SECONDS rather than on a digit of the timestamp, so it still fires when
  # iterations drift, stall on nvidia-smi, or INTERVAL is not 0.1.
  if (( SECONDS != last_sync )); then
    # Best effort: a failed sync must not stop the logger.
    sync "$LOG" 2>/dev/null || true
    last_sync=$SECONDS
  fi

  sleep "$INTERVAL"
done >> "$LOG"