#!/bin/sh
# bigdisk-mount-guard — autorecover tool for the ~/_/bigdisk NFS mount
# (Apple Photos originals + other big storage on black).
#
# The mount is configured in two places:
#   - /etc/auto_nfs (autofs static): bigdisk -fstype=nfs,resvport black:/bigdisk
#   - /etc/fstab (hard NFS): 10.0.0.11:/bigdisk /Users/natalie/_/bigdisk nfs resvport,rw,bg,hard,intr,tcp
#
# After sleep, network roaming, a WG flap or a switch to the "no WiFi" location, the
# handle often goes stale. autofs/fstab then fails to re-trigger, Photos sees a dead
# symlink for originals, and the library gets stuck (permission errors, "Repairing…",
# repeated password prompts for com.apple.library-repair).
#
# This tool autorecovers:
#   1. If storage IP unreachable, attempt to bring the WireGuard mesh (wg1) up.
#   2. If server reachable but mount wedged, force-umount + touch to re-trigger.
#   3. On success, run Photos-specific recovery (clean DB locks, restart daemons,
#      restore canonical symlink if it had been pointed at a local stub).
#
# Designed to run as root LaunchDaemon (contrib plist). Also safe/useful by hand
# or from host-monitor/disk-guard style tools.
#
# Does not depend on GNU timeout: time limits use perl's alarm (perl is always on macOS).
set -u

# Limited CLI for use as a tool: --status, --force, --help.
# Only the first argument is examined; anything else is rejected with exit 64.
#
# Both mode flags start at 0 so that a STATUS_ONLY or FORCE value inherited from
# the environment cannot switch modes behind the caller's back.
STATUS_ONLY=0
FORCE=0
case "${1:-}" in
--help|-h)
	# The delimiter is quoted, so the text is printed verbatim (no expansion).
	# The log path is therefore spelled out; LOG itself is only assigned further
	# down the file. Keep the two in sync.
	cat <<'EOF'
bigdisk-mount-guard [options]

  (no arg)   normal check + autorecover (idempotent, for launchd or cron)
  --status   report only (0=healthy, 1=needs attention), no changes
  --force    force a full recovery cycle (useful after manual WG bring-up)
  --help     this message

Runs as root (needs umount/mount/wg-quick). Logs to /var/log/bigdisk-mount-guard.log.
EOF
	exit 0
	;;
--status)
	STATUS_ONLY=1
	;;
--force)
	FORCE=1
	;;
"") ;;
*)
	# printf rather than echo: the argument is arbitrary user input.
	printf 'unknown arg: %s\n' "$1" >&2
	exit 64
	;;
esac


# --- site configuration (fixed paths and addresses) ---

# Mount point of the NFS share.
MP="/Users/natalie/_/bigdisk"
# Probe target: a dir that only exists when the mount is live.
PROBE="$MP/Photos"

# Storage server "black"; 'black' resolves here too, routed via LAN or WG.
SERVER_IP="10.0.0.11"

# WireGuard interface and the wg-quick config used to bring it up.
WG_IF="wg1"
WG_CONF="/Users/natalie/.wireguard/wg1.conf"

# Photos library that gets cleaned up after a successful remount.
PHOTOS_LIB="/Users/natalie/Pictures/Photos Library.photoslibrary"

# File that log() appends to.
LOG="/var/log/bigdisk-mount-guard.log"

# log MESSAGE...
# Append one line "<ISO-8601 local timestamp> <arguments joined by spaces>" to $LOG.
# Logging is best-effort: the brace group sends stderr to /dev/null *before* the
# append redirection is attempted, so an unwritable or missing log file produces
# no output at all. printf '%s' writes the message verbatim (no escape or option
# interpretation). Returns printf's status (non-zero when the line was not written).
log() {
	{ printf '%s %s\n' "$(/bin/date '+%Y-%m-%dT%H:%M:%S')" "$*" >> "$LOG"; } 2>/dev/null
}

# notify TITLE MESSAGE
# Post a macOS notification (sound "Ping") through osascript. Best-effort: stderr
# is discarded and the function always returns 0, even when osascript is missing
# or fails.
#
# TITLE and MESSAGE are handed to the script as run-handler arguments instead of
# being spliced into the AppleScript source, so quotes and backslashes in either
# string cannot break (or alter) the script. No shell variables are assigned.
#
# NOTE(review): the original comment states this works from a root daemon for the
# console user — not verified here; confirm on the target machine.
notify() {
	/usr/bin/osascript \
		-e 'on run argv' \
		-e 'display notification (item 2 of argv) with title (item 1 of argv) sound name "Ping"' \
		-e 'end run' \
		"$1" "$2" 2>/dev/null || true
}

# run "$@" but kill it after N seconds (first arg); returns the command's status,
# or 124 if it timed out / failed to exec. Uses perl's SIGALRM — always present on macOS.
bounded() {
	_t=$1; shift
	/usr/bin/perl -e 'alarm shift; exec @ARGV or exit 124' "$_t" "$@"
}

# ensure_wg
# Make sure the WireGuard interface $WG_IF is up.
# Reads: WG_IF, WG_CONF, SERVER_IP, LOG. Calls: log.
# Returns 0 when `wg show` already succeeds for the interface, or after a
# successful `wg-quick up` (followed by an 8 second pause); returns 1 when
# `wg-quick up` fails.
ensure_wg() {
	# Interface already answers `wg show`: nothing to do.
	/opt/homebrew/bin/wg show "$WG_IF" >/dev/null 2>&1 && return 0

	log "WG $WG_IF down; attempting wg-quick up to reach $SERVER_IP"

	# Tear down first; its outcome is deliberately ignored.
	/opt/homebrew/bin/wg-quick down "$WG_CONF" >/dev/null 2>&1 || true

	# Bring the tunnel up; wg-quick's own output goes to the log file.
	if ! /opt/homebrew/bin/wg-quick up "$WG_CONF" >>"$LOG" 2>&1; then
		log "wg-quick up failed"
		return 1
	fi

	log "WG up succeeded; waiting for mesh routes"
	sleep 8
	return 0
}

# recover_photos
# Photos-specific cleanup, run after the mount has been recovered.
# Reads: PHOTOS_LIB, MP. Calls: log, notify. Sets the global `cur`.
# Does nothing (returns 0) when $PHOTOS_LIB is not a directory. Otherwise:
#   1. deletes regular files matching *lock* under $PHOTOS_LIB/database,
#   2. signals user natalie's photoanalysisd / photolibraryd via pkill,
#   3. if $PHOTOS_LIB/originals is a symlink whose target contains "Photos-stub"
#      or "stub/originals", re-points it at the originals directory on the mount,
#   4. logs completion and posts a notification.
# Steps 1 and 2 are best-effort; their failures are ignored.
recover_photos() {
	[ -d "$PHOTOS_LIB" ] || return 0
	log "Photos autorecover cleanup"
	# stale locks often block open/repair after volume events
	find "$PHOTOS_LIB/database" -name '*lock*' -type f -delete 2>/dev/null || true
	# restart the user's analysis daemons so they see the freshly mounted originals.
	# sudo -n: never prompt for a password (fail instead), so a manual non-root
	# run cannot stall here.
	/usr/bin/sudo -n -u natalie /usr/bin/pkill -x photoanalysisd 2>/dev/null || true
	/usr/bin/sudo -n -u natalie /usr/bin/pkill -f photolibraryd 2>/dev/null || true
	# if a previous troubleshooting step pointed the symlink at a local stub, restore the real one
	cur=$(readlink "$PHOTOS_LIB/originals" 2>/dev/null || true)
	# Shell pattern match instead of grep 'a\|b': alternation with \| is not
	# defined for POSIX basic regular expressions.
	case "$cur" in
	*Photos-stub*|*stub/originals*)
		if ln -sfn "${MP}/Photos/Photos Library.photoslibrary/originals" "$PHOTOS_LIB/originals"; then
			log "restored canonical originals symlink"
		else
			# Report the truth instead of claiming success.
			log "failed to restore canonical originals symlink (still -> $cur)"
		fi
		;;
	esac
	log "Photos cleanup done — relaunch Photos (hold Option+Command for Repair if it still whines)"
	notify "bigdisk-mount-guard" "Photos library autorecovered (mount + cleanup)"
}

# --- main ---
#
# Flow:
#   1. --status: print a report and exit (0 = probe reachable, 1 = not); no changes.
#   2. Probe reachable and not forced: exit 0.
#   3. Server does not answer ping: try to bring WireGuard up; if it still does
#      not answer (or WG could not be brought up) leave the mount alone, exit 0.
#   4. Force-unmount, refresh automount, touch the mount point, re-probe:
#      exit 0 after Photos cleanup on success, exit 1 otherwise.

if [ "${STATUS_ONLY:-0}" = 1 ]; then
	if bounded 5 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
		echo "OK: bigdisk probe $PROBE reachable"
		exit 0
	else
		echo "STALE: bigdisk probe $PROBE not reachable"
		if /sbin/ping -c1 -t2 "$SERVER_IP" >/dev/null 2>&1; then
			echo "  server $SERVER_IP is up (mount wedged?)"
		else
			echo "  server $SERVER_IP unreachable"
		fi
		exit 1
	fi
fi

# Decide whether recovery is needed and remember why. With --force the probe is
# skipped entirely, so the log must not claim that a probe failed.
if [ "${FORCE:-0}" = 1 ]; then
	trigger="forced recovery"
elif bounded 8 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
	# Healthy path: probe stats quickly. Nothing (more) to do.
	exit 0
else
	trigger="probe failed"
fi

# Path not reachable (or forced). Try to make the server reachable first.
if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then
	log "${trigger}; ${SERVER_IP} unreachable"
	if ensure_wg; then
		notify "bigdisk-mount-guard" "Brought up WG mesh to reach storage"
		if ! /sbin/ping -c1 -t3 "$SERVER_IP" >/dev/null 2>&1; then
			log "${SERVER_IP} still unreachable after WG attempt — leaving mount alone"
			notify "bigdisk-mount-guard" "bigdisk offline (server unreachable after WG) — no action"
			exit 0
		fi
		# fallthrough to mount recovery now that mesh is up
	else
		log "WG recovery not possible — offline, leaving mount alone"
		notify "bigdisk-mount-guard" "bigdisk offline (server unreachable) — no action taken"
		exit 0
	fi
fi

# Server reachable (possibly after WG recovery). Either the probe failed
# (wedged/stale mount) or the caller asked for a full cycle.
if [ "$trigger" = "probe failed" ]; then
	log "probe failed but ${SERVER_IP} reachable — force-clearing wedged mount at ${MP}"
	notify "bigdisk-mount-guard" "bigdisk mount wedged — force recovering now"
else
	log "forced recovery with ${SERVER_IP} reachable — force-clearing mount at ${MP}"
	notify "bigdisk-mount-guard" "bigdisk forced recovery — remounting now"
fi
# Every step below is time-limited and best-effort; the final probe decides.
bounded 10 /sbin/umount -f "$MP" >/dev/null 2>&1 || true
# Kick automount to refresh static maps (safe/no-op if already good).
bounded 5 /usr/sbin/automount -vc >>"$LOG" 2>&1 || true
# Touch the path to (re)trigger autofs (or fstab bg reconnect).
bounded 15 /bin/ls "$MP" >/dev/null 2>&1 || true
sleep 3

if bounded 10 /usr/bin/stat "$PROBE" >/dev/null 2>&1; then
	log "remount OK"
	notify "bigdisk-mount-guard" "bigdisk mount recovered successfully"
	recover_photos
	exit 0
fi

log "remount still failing after force-clear"
notify "bigdisk-mount-guard" "bigdisk remount still failing (check logs)"
exit 1
