From 95b3b6519683066a41eeaf5ccc61dd47c50728e2 Mon Sep 17 00:00:00 2001 From: Natalie Date: Mon, 25 May 2026 15:40:43 -0700 Subject: [PATCH] =?UTF-8?q?feat(@scripts):=20=E2=9C=A8=20add=20health-chec?= =?UTF-8?q?king=20doctor=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- bin/apricot-doctor | 298 +++++++++++++++++++++++++++++++++++++++++++++ bin/wg-dns-sync | 25 +++- 2 files changed, 319 insertions(+), 4 deletions(-) create mode 100755 bin/apricot-doctor diff --git a/bin/apricot-doctor b/bin/apricot-doctor new file mode 100755 index 0000000..b4f5510 --- /dev/null +++ b/bin/apricot-doctor @@ -0,0 +1,298 @@ +#!/bin/sh +# apricot-doctor Health snapshot of apricot's known-fragile parts +# apricot-doctor check Same as above +# apricot-doctor fix dnsmasq Pull session-tools + regen wg-mesh.conf + restart +# apricot-doctor fix oomd Install composefs PrivateTmp drop-in + start +# apricot-doctor fix quinn-ai Stop, delete unit, mask (half-finished cutover) +# apricot-doctor fix all All three fix subcommands +# apricot-doctor forensic [N] Dump post-mortem artifacts for boot index N +# (default: -1, the previous boot) +# +# Targets the recurring boot-time fragility surfaced during the 2026-05-21 +# wedge investigation: +# +# 1. dnsmasq vs wg1 boot race ("Cannot assign requested address" for the +# mesh IP). Durable fix lives in session-tools/bin/wg-dns-sync; this +# script pulls the latest, regenerates the conf, and restarts. +# +# 2. systemd-oomd vs composefs ("Failed at step NAMESPACE" — systemd +# cannot create /run/systemd/mount-rootfs/var/tmp on the read-only +# composefs rootfs when PrivateTmp=disconnected). Drop-in overrides to +# PrivateTmp=no. +# +# 3. quinn-ai-auto-respond.service half-finished cutover from black — +# crash-loops every 10s with getaddrinfo ENOTIMP. 
User has explicitly +# asked for it stopped and prevented from starting; cannot mask while +# the real unit file occupies the symlink target, so rm-then-mask. +# +# Idempotent: re-runs are safe and report no-op for already-correct state. +# +# Env: +# APRICOT_DOCTOR_OUT forensic output dir (default: /var/tmp/apricot-doctor) +# +# Exit codes: +# 0 success +# 1 precondition failed (wrong host, missing tool, repo missing) +# 2 usage error +# 3 fix could not be applied +# 4 fix applied but verification failed + +set -eu + +out_root="${APRICOT_DOCTOR_OUT:-/var/tmp/apricot-doctor}" +repo="$HOME/Code/@scripts/session-tools" + +die() { echo "apricot-doctor: $*" >&2; exit 1; } +usage() { sed -n '2,/^$/p' "$0" | sed 's/^# \{0,1\}//'; exit "${1:-2}"; } # $1: exit status, default 2 (usage error) +ok() { printf ' \033[32m✓\033[0m %s\n' "$*"; } +warn() { printf ' \033[33m!\033[0m %s\n' "$*"; } +fail() { printf ' \033[31m✗\033[0m %s\n' "$*"; } + +require_apricot() { + h=$(hostname -s) + [ "$h" = "apricot" ] || die "must be run on apricot (got: $h)" +} + +# ------------------------------------------------------------------ check --- + +cmd_check() { + require_apricot + echo "apricot-doctor check" + + echo + echo "Services:" + for s in dnsmasq systemd-oomd quinn-ai-auto-respond; do + state=$(systemctl is-active "$s" 2>/dev/null || true) + enabled=$(systemctl is-enabled "$s" 2>/dev/null || true) + case "$s:$state" in + *:active) ok "$s ($state, $enabled)" ;; + quinn-ai-auto-respond:inactive|quinn-ai-auto-respond:masked) + # inactive+masked or inactive+disabled is the desired + # end-state for quinn-ai only. Elsewhere inactive falls through to warn.
+ ok "$s ($state, $enabled)" ;; + *:failed|*:activating) fail "$s ($state, $enabled)" ;; + *) warn "$s ($state, $enabled)" ;; + esac + done + + echo + echo "Pressure (avg10):" + for p in cpu io memory; do + line=$(awk '/^some/ {print; exit}' "/proc/pressure/$p" 2>/dev/null || true) + avg10=$(printf %s "$line" | sed -n 's/.*avg10=\([0-9.]*\).*/\1/p') + printf ' %-7s %s\n' "$p" "${avg10:-?}" + done + + if command -v nvme >/dev/null 2>&1; then + echo + echo "NVMe composite temps:" + for dev in /dev/nvme?n1; do + [ -e "$dev" ] || continue + t=$(sudo -n nvme smart-log "$dev" 2>/dev/null \ + | awk -F: '/^temperature/ {sub(/^[^0-9]*/,"",$2); sub(/[^0-9.].*/,"",$2); print $2; exit}') # first number only: newer nvme-cli appends the Kelvin value, e.g. "38 °C (311 K)" + printf ' %s %s°C\n' "$dev" "${t:-?}" + done + fi + + echo + echo "Boot history (last 3):" + journalctl --list-boots --no-pager | tail -3 + + echo + # Detect "clean stop" on the previous boot by looking for the systemd + # shutdown-target lines in its tail. Absent => journal cut off mid-stream + # => unclean stop (wedge, crash, or hard power-off).
+ if journalctl -b -1 --no-pager 2>/dev/null | tail -50 \ + | grep -q -e 'Reached target.*[Ss]hutdown' -e 'Stopped target' -e 'systemd-shutdown'; then + ok "previous boot ended with orderly stop sequence" + else + warn "previous boot ended WITHOUT orderly stop sequence" + warn " -> run 'apricot-doctor forensic -1' to dump post-mortem artifacts" + fi + + echo + failed_count=$(systemctl --failed --no-pager --no-legend 2>/dev/null | wc -l | tr -d ' ') + if [ "$failed_count" -eq 0 ]; then + ok "no failed units" + else + warn "$failed_count failed unit(s):" + systemctl --failed --plain --no-pager --no-legend | awk '{print " " $1}' # --plain omits the leading "●" bullet so $1 is the unit name + fi +} + +# -------------------------------------------------------------- fix dnsmasq - + +cmd_fix_dnsmasq() { + require_apricot + echo "fix dnsmasq: pull session-tools + regen + restart" + + [ -d "$repo/.git" ] || die "session-tools repo not at $repo" + (cd "$repo" && git pull --ff-only --quiet) \ + || die "git pull failed in $repo (check network / merge state)" + ok "session-tools updated" + + # Stale .prev was loaded by dnsmasq alongside the live conf, doubling + # records. Newer wg-dns-sync writes the backup to /var/lib/. Remove the + # stale file if the old script ever left one in dnsmasq.d/.
+ if [ -e /etc/dnsmasq.d/wg-mesh.conf.prev ]; then + sudo rm -f /etc/dnsmasq.d/wg-mesh.conf.prev + ok "removed stale /etc/dnsmasq.d/wg-mesh.conf.prev" + fi + + "$repo/bin/wg-dns-sync" + + sleep 1 + state=$(systemctl is-active dnsmasq 2>/dev/null || true) + case "$state" in + active) ok "dnsmasq active" ;; + *) fail "dnsmasq is $state after regen"; return 4 ;; + esac +} + +# ----------------------------------------------------------------- fix oomd - + +cmd_fix_oomd() { + require_apricot + echo "fix oomd: composefs PrivateTmp drop-in" + + dropin_dir=/etc/systemd/system/systemd-oomd.service.d + dropin_file="$dropin_dir/composefs-private-tmp.conf" + desired='[Service] +# Bluefin composefs ro-rootfs vs systemd PrivateTmp=disconnected: +# systemd cannot create /run/systemd/mount-rootfs/var/tmp on the ro root, +# so the unit fails at step NAMESPACE (status=226) on every boot. +# Drop to PrivateTmp=no — oomd has no real need for a private /tmp. +PrivateTmp=no +' + + if [ -f "$dropin_file" ] && [ "$(cat "$dropin_file"; printf x)" = "${desired}x" ]; then # sentinel x: command substitution strips trailing newlines, $desired ends in one + ok "drop-in already in place at $dropin_file" + else + sudo mkdir -p "$dropin_dir" + printf '%s' "$desired" | sudo tee "$dropin_file" >/dev/null + sudo systemctl daemon-reload + ok "installed $dropin_file" + fi + + sudo systemctl reset-failed systemd-oomd.service 2>/dev/null || true + sudo systemctl start systemd-oomd.service + sleep 1 + state=$(systemctl is-active systemd-oomd 2>/dev/null || true) + case "$state" in + active) ok "systemd-oomd active" ;; + *) fail "systemd-oomd is $state after start"; return 4 ;; + esac +} + +# ------------------------------------------------------------- fix quinn-ai - + +cmd_fix_quinn_ai() { + require_apricot + echo "fix quinn-ai: stop, delete unit, mask" + + unit_file=/etc/systemd/system/quinn-ai-auto-respond.service + + sudo systemctl stop quinn-ai-auto-respond.service 2>/dev/null || true + sudo systemctl disable quinn-ai-auto-respond.service 2>/dev/null || true + + if [ -e "$unit_file" ]; then +
sudo rm -f "$unit_file" + ok "removed $unit_file" + else + ok "unit file already absent" + fi + + sudo systemctl daemon-reload + # mask now works because the symlink target is free + sudo systemctl mask quinn-ai-auto-respond.service >/dev/null + ok "masked" + + state=$(systemctl is-active quinn-ai-auto-respond.service 2>/dev/null || true) + enabled=$(systemctl is-enabled quinn-ai-auto-respond.service 2>/dev/null || true) + case "$enabled" in + masked) ok "is-active=$state is-enabled=$enabled" ;; + *) fail "expected enabled=masked, got '$enabled'"; return 4 ;; + esac +} + +cmd_fix_all() { + cmd_fix_dnsmasq + echo + cmd_fix_oomd + echo + cmd_fix_quinn_ai +} + +# ------------------------------------------------------------------ forensic + +cmd_forensic() { + require_apricot + idx=${1:--1} + case "$idx" in + -[0-9]*|[0-9]*) ;; + *) die "boot index must be an integer (e.g. -1 for previous boot)" ;; + esac + + boot_id=$(journalctl --list-boots --no-pager \ + | awk -v i="$idx" '$1 == i {print $2; exit}') + [ -n "$boot_id" ] \ + || die "no boot at index $idx (see: journalctl --list-boots)" + + stamp=$(date -u +%Y%m%dT%H%M%SZ) + out="$out_root/forensic-$stamp-boot$idx-$boot_id" + sudo mkdir -p "$out" + sudo chown "$(id -u):$(id -g)" "$out" + echo "forensic dump: $out" + + journalctl --list-boots --no-pager > "$out/01-boots.txt" 2>&1 || true + journalctl -b "$idx" -k --no-pager > "$out/02-kernel-ring.txt" 2>&1 || true + journalctl -b "$idx" -p err --no-pager > "$out/03-priority-err.txt" 2>&1 || true + journalctl -b "$idx" --no-pager | tail -500 > "$out/04-tail-500.txt" 2>&1 || true + # error-signature grep, written with -e to avoid alternation quoting + journalctl -b "$idx" --no-pager 2>/dev/null \ + | grep -e panic -e hung_task -e MCE -e EDAC -e segfault \ + -e 'Hardware Error' -e 'Out of memory' -e 'nvme.*error' \ + -e throttl -e oom-kill \ + > "$out/05-error-signatures.txt" 2>&1 || true + sudo coredumpctl list > "$out/06-coredumps.txt" 2>&1 || true + systemctl --failed
--no-pager > "$out/07-failed-units.txt" 2>&1 || true + rpm-ostree status > "$out/08-rpm-ostree.txt" 2>&1 || true + { + echo '--- /proc/pressure/cpu ---'; cat /proc/pressure/cpu 2>/dev/null + echo '--- /proc/pressure/io ---'; cat /proc/pressure/io 2>/dev/null + echo '--- /proc/pressure/memory ---'; cat /proc/pressure/memory 2>/dev/null + } > "$out/09-pressure-now.txt" 2>&1 || true + [ -f "$HOME/apricot-pressure-alerts.log" ] \ + && cp "$HOME/apricot-pressure-alerts.log" "$out/10-pressure-alerts.log" || true + + n=$(ls "$out" | wc -l | tr -d ' ') + ok "wrote $n artifact(s) to $out" +} + +# --------------------------------------------------------------- dispatch --- + +case "${1:-check}" in + check|'') + cmd_check + ;; + fix) + shift + case "${1:-}" in + dnsmasq) cmd_fix_dnsmasq ;; + oomd) cmd_fix_oomd ;; + quinn-ai) cmd_fix_quinn_ai ;; + all) cmd_fix_all ;; + *) usage ;; + esac + ;; + forensic) + shift + cmd_forensic "${1:-}" + ;; + -h|--help|help) + usage 0 + ;; + *) + usage + ;; +esac diff --git a/bin/wg-dns-sync b/bin/wg-dns-sync index d8fdbb9..d3897cd 100755 --- a/bin/wg-dns-sync +++ b/bin/wg-dns-sync @@ -64,7 +64,9 @@ host=$(hostname -s) printf '# Bind only to the wg1 IP so this view is invisible to LAN/loopback clients\n' printf '# (which are served by lilith-local.conf with split-horizon 127.0.0.1 records).\n' printf 'listen-address=%s\n' "$listen" - printf 'bind-interfaces\n' + # bind-dynamic (not bind-interfaces): binds the listen-address as it appears, + # so dnsmasq doesn't lose the boot race against wg1 coming up. + printf 'bind-dynamic\n' printf '\n' printf '# DNS records (one per record entry in wg-mesh-hosts.json)\n' jq -r '.records[] | "address=/\(.name|sub("^\\.";""))/\(.ip) # \(.comment // "")"' "$data_file" @@ -90,8 +92,23 @@ fi # Back up current target (if any) before replacing — undo handled by re-running # wg-dns-sync after editing the JSON, NOT by restoring this backup. Backup is # audit-only; safe to delete.
+# +# Backup MUST NOT live in /etc/dnsmasq.d/ because dnsmasq's conf-dir loads every +# file there that isn't on its (small) exclude list; .prev was not excluded and +# got parsed as a second config, silently doubling listen-address= and address= +# entries. +backup_dir=/var/lib/wg-dns-sync +backup="$backup_dir/wg-mesh.conf.prev" +sudo mkdir -p "$backup_dir" + +# One-shot migration: earlier versions of this script wrote the backup as +# ${target}.prev inside /etc/dnsmasq.d/. Remove that stale file if present. +if [ -e "${target}.prev" ]; then + sudo rm -f "${target}.prev" +fi + if [ -r "$target" ]; then - sudo cp "$target" "${target}.prev" + sudo install -m 0644 -o root -g root "$target" "$backup" fi sudo install -m 0644 -o root -g root "$tmp" "$target" @@ -101,8 +118,8 @@ echo "wg-dns-sync: installed $target" # unreachable because wg1 is down), restore the previous conf and exit 3. if ! sudo systemctl restart dnsmasq; then echo "wg-dns-sync: dnsmasq failed to restart, rolling back" >&2 - if [ -r "${target}.prev" ]; then - sudo install -m 0644 -o root -g root "${target}.prev" "$target" + if [ -r "$backup" ]; then + sudo install -m 0644 -o root -g root "$backup" "$target" else sudo rm -f "$target" fi