The tray's Quit handler already boots out com.wireguard.vpn-tray, but install-tray.sh had retired that launchd job and relied on the fleet agent to nohup it — which never ran the tray reliably at boot (no GUI session yet). Restore the LaunchAgent (same pattern as com.lilith.mac-sync): RunAtLoad starts it at login in the GUI session, KeepAlive relaunches on crash. ensure_tray() now defers to launchd when the agent is installed (Popen path kept as fallback). Removes the dead standalone plist.
909 lines
37 KiB
Python
Executable file
909 lines
37 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""smart-lan-router — the ONE net-tools agent, same code on every fleet node.

Every node (osx / ubuntu / bluefin) runs this identical service; what a node
actually *does* each cycle is derived from its own entry in data/mesh-hosts.json
— not from which variant got installed:

  pull      every node     git pull the repo (declared truth + this agent's own
                           code propagate; on a code change the agent exits so
                           launchd/systemd restart it fresh)
  discover  LAN-visible    map each declared MAC -> current DHCP IP via the
            nodes          neighbour table (ARP / ip neigh); write
                           data/lan-state.json on change
  route     class=laptop   HOME (gateway MAC == lan.gateway_mac) -> route the
            (darwin)       LAN /24 out the physical interface (~3ms); AWAY ->
                           via the wg mesh (Iceland hub) so home stays reachable
  render    every node     on any change, regenerate /etc/hosts
                           (mesh-hosts-render --install) and the render user's
                           ~/.ssh/config (host-apply --ssh-apply)

Locked-down platforms (ios / android) cannot run agents at all — they consume
names via apricot's mesh dnsmasq (wg-dns-sync) with DNS=10.9.0.2 in their
WireGuard config. Windows is a documented non-goal until a Windows node exists.

Safety properties (unchanged from the single-node daemon):
  * subnet route, never per-host /32 pins (macOS self-MAC ARP bug)
  * home is fingerprinted by the GATEWAY MAC, so a café 10.0.0.0/24 is AWAY
  * bad config read -> keep last-good, never tear down routing
  * git pull runs as the REPO OWNER, never root (preserves autocommit ownership)

Requires root for route changes + /etc/hosts writes. `--status` needs none.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import ipaddress
|
||
import json
|
||
import logging
|
||
import os
|
||
import pwd
|
||
import re
|
||
import shutil
|
||
import signal
|
||
import socket
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass
|
||
|
||
# Platform family used to pick between macOS and Linux tooling; anything that
# is not darwin is treated as linux.
PLATFORM = "darwin" if sys.platform == "darwin" else "linux"
# Default value of the --interval option.
PROBE_INTERVAL_SEC = 20
# Default timeout _run() applies to each external command.
SUBPROCESS_TIMEOUT_SEC = 8
SWEEP_MIN_INTERVAL_SEC = 300  # don't ping-sweep the /24 more often than this
PULL_MIN_INTERVAL_SEC = 300  # don't git-pull more often than this

# Module-wide logger; handlers/format are configured in main().
logger = logging.getLogger("smart-lan-router")
|
||
|
||
|
||
@dataclass(frozen=True)
class Config:
    """Immutable network settings, built by load_config() from mesh-hosts.json."""

    # Home LAN subnet in CIDR form, e.g. "10.0.0.0/24".
    lan_cidr: str
    # Home LAN gateway address, e.g. "10.0.0.1".
    gateway: str
    # MAC of the home gateway — the fingerprint that identifies the home LAN.
    gateway_mac: str
    # Mesh subnet, e.g. "10.9.0.0/24"; used to locate the wg interface.
    mesh_cidr: str
    # Public IP of the wg hub (from mesh.hub_endpoint), or None when unset.
    hub_endpoint_ip: str | None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Source of truth
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def find_data_file() -> str:
    """Locate data/mesh-hosts.json by walking up from this script's real location.

    Symlinks on the script path itself are followed first (relative link
    targets are resolved against the link's directory), then each ancestor
    directory is checked in turn, up to and including "/".

    Raises:
        FileNotFoundError: no ancestor directory contains data/mesh-hosts.json.
    """
    script = os.path.abspath(__file__)
    while os.path.islink(script):
        target = os.readlink(script)
        if not os.path.isabs(target):
            target = os.path.join(os.path.dirname(script), target)
        script = target

    directory = os.path.dirname(script)
    while True:
        candidate = os.path.join(directory, "data", "mesh-hosts.json")
        if os.path.isfile(candidate):
            return candidate
        if directory == "/":
            break
        directory = os.path.dirname(directory)
    raise FileNotFoundError(f"cannot locate data/mesh-hosts.json from {script}")
|
||
|
||
|
||
def load_json(path: str) -> dict:
    """Read `path` as UTF-8 text and return the parsed JSON document.

    Errors from open() and from JSON decoding propagate to the caller.
    """
    with open(path, encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)
|
||
|
||
|
||
def load_config(data_file: str) -> Config:
    """Build a Config from the `lan` and `mesh` sections of `data_file`.

    Raises:
        ValueError: lan.gateway_mac is missing or empty.
        KeyError: lan.cidr, lan.gateway or mesh.cidr is missing.
    """
    document = load_json(data_file)
    lan_section = document.get("lan", {})
    mesh_section = document.get("mesh", {})

    fingerprint = lan_section.get("gateway_mac")
    if not fingerprint:
        raise ValueError("mesh-hosts.json lan.gateway_mac is required (home-LAN fingerprint)")

    lan_cidr = lan_section["cidr"]
    gateway = lan_section["gateway"]
    gateway_mac = fingerprint.lower()
    mesh_cidr = mesh_section["cidr"]
    # hub_endpoint is "host:port"; keep what precedes the last colon.
    endpoint = mesh_section.get("hub_endpoint") or ""
    hub_ip = endpoint.rsplit(":", 1)[0] or None
    return Config(lan_cidr=lan_cidr, gateway=gateway, gateway_mac=gateway_mac,
                  mesh_cidr=mesh_cidr, hub_endpoint_ip=hub_ip)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Subprocess + platform shims
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _run(argv: list[str], timeout: float = SUBPROCESS_TIMEOUT_SEC) -> tuple[int, str, str]:
    """Run `argv` and return (returncode, stdout, stderr) as text.

    Never raises for the two expected failures: a timeout is reported as
    return code 124 and a missing executable as 127, with the reason in the
    stderr slot.
    """
    try:
        finished = subprocess.run(argv, capture_output=True, text=True,
                                  timeout=timeout, check=False)
    except subprocess.TimeoutExpired:
        return 124, "", f"timeout after {timeout}s"
    except FileNotFoundError as missing:
        return 127, "", str(missing)
    return finished.returncode, finished.stdout, finished.stderr
|
||
|
||
|
||
def _bin(name: str, *fallbacks: str) -> str:
|
||
found = shutil.which(name)
|
||
if found:
|
||
return found
|
||
for f in fallbacks:
|
||
if os.path.exists(f):
|
||
return f
|
||
return name
|
||
|
||
|
||
def norm_mac(mac: str) -> str:
    """Normalise a colon-separated MAC to lowercase, zero-padded octets.

    Input that cannot be parsed that way (non-hex parts, None) is returned
    lower-cased as-is, with None/empty becoming "".
    """
    try:
        octets = [f"{int(part, 16):02x}" for part in mac.split(":")]
    except (ValueError, AttributeError):
        return mac.lower() if mac else ""
    return ":".join(octets)
|
||
|
||
|
||
def neighbors() -> dict[str, str]:
    """Return {mac: ip} parsed from the neighbour table.

    Uses `arp -a -n` on darwin and `ip -4 neigh show` on linux. On linux,
    entries in state FAILED or INCOMPLETE are skipped. When one MAC appears
    for several IPs, the last one listed wins.
    """
    if PLATFORM == "darwin":
        _, listing, _ = _run([_bin("arp", "/usr/sbin/arp"), "-a", "-n"])
        hits = (re.search(r"\((\d+\.\d+\.\d+\.\d+)\) at ([0-9a-f:]{1,17}) ", row)
                for row in listing.splitlines())
        return {norm_mac(hit.group(2)): hit.group(1) for hit in hits if hit}

    _, listing, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "-4", "neigh", "show"])
    table: dict[str, str] = {}
    for row in listing.splitlines():
        hit = re.match(r"(\d+\.\d+\.\d+\.\d+)\s.*\slladdr\s+([0-9a-f:]{1,17})\s+(\S+)", row)
        if not hit or hit.group(3) in ("FAILED", "INCOMPLETE"):
            continue
        table[norm_mac(hit.group(2))] = hit.group(1)
    return table
|
||
|
||
|
||
def neighbor_mac(ip: str) -> str | None:
    """Return the MAC the neighbour table currently maps to `ip`, or None."""
    return next((mac for mac, addr in neighbors().items() if addr == ip), None)
|
||
|
||
|
||
def _ipv4(s: str | None) -> bool:
|
||
return bool(s and re.match(r"^\d+\.\d+\.\d+\.\d+$", s))
|
||
|
||
|
||
def default_route() -> tuple[str | None, str | None]:
    """Return (gateway_ip, interface) of the current default route.

    On darwin the gateway is reported only when it is an IPv4 literal; any
    other gateway value yields None for that slot. Either element is None
    when it cannot be determined.
    """
    if PLATFORM != "darwin":
        _, text, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "route", "show", "default"])
        hit = re.search(r"default via (\d+\.\d+\.\d+\.\d+) dev (\S+)", text)
        if hit is None:
            return None, None
        return hit.group(1), hit.group(2)

    _, text, _ = _run([_bin("route", "/sbin/route"), "-n", "get", "default"])
    gateway = interface = None
    for raw in text.splitlines():
        field = raw.strip()
        if field.startswith("gateway:"):
            value = field.split(":", 1)[1].strip().split()[0]
            gateway = value if _ipv4(value) else None
        elif field.startswith("interface:"):
            interface = field.split()[1]
    return gateway, interface
|
||
|
||
|
||
def local_ipv4s() -> set[str]:
    """Return every IPv4 address configured on a local interface."""
    if PLATFORM == "darwin":
        argv = [_bin("ifconfig", "/sbin/ifconfig")]
    else:
        argv = [_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "-o", "-4", "addr", "show"]
    listing = _run(argv)[1]
    return set(re.findall(r"inet (\d+\.\d+\.\d+\.\d+)", listing))
|
||
|
||
|
||
def iface_in_cidr(cidr: str) -> str | None:
    """Name of the first local interface holding an address inside `cidr`.

    Used to find the wg interface from the mesh subnet. Returns None when no
    interface has such an address.
    """
    subnet = ipaddress.ip_network(cidr, strict=False)

    if PLATFORM != "darwin":
        listing = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "-o", "-4", "addr", "show"])[1]
        for row in listing.splitlines():
            hit = re.match(r"\d+:\s+(\S+)\s+inet (\d+\.\d+\.\d+\.\d+)", row)
            if hit and ipaddress.ip_address(hit.group(2)) in subnet:
                return hit.group(1)
        return None

    listing = _run([_bin("ifconfig", "/sbin/ifconfig")])[1]
    name = None
    for row in listing.splitlines():
        # ifconfig starts each interface stanza at column 0; detail lines are indented.
        if row and not row[0].isspace():
            name = row.split(":")[0]
            continue
        hit = re.search(r"inet (\d+\.\d+\.\d+\.\d+)", row)
        if hit and ipaddress.ip_address(hit.group(1)) in subnet:
            return name
    return None
|
||
|
||
|
||
def _reap_pings(procs: list[subprocess.Popen]) -> None:
    """Wait up to 3s for each ping; kill and reap any that overstays."""
    for proc in procs:
        try:
            proc.wait(timeout=3)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()  # collect the exit status so the killed ping is not left a zombie


def ping_sweep(cidr: str) -> None:
    """Briefly ping every host in the subnet to populate the neighbour table.

    Pings are launched in batches of 80 and each batch is drained before the
    next starts. Output is discarded; only the side effect on the neighbour
    table matters. If a ping cannot be spawned the sweep stops early with a
    warning instead of raising.
    """
    ping = _bin("ping", "/sbin/ping", "/usr/bin/ping")
    # One probe with a one-second limit; the flag spelling differs per platform.
    flags = ["-c1", "-t1"] if PLATFORM == "darwin" else ["-c1", "-W1"]
    net = ipaddress.ip_network(cidr, strict=False)
    batch: list[subprocess.Popen] = []
    for host in net.hosts():
        try:
            batch.append(subprocess.Popen([ping, *flags, str(host)],
                                          stdout=subprocess.DEVNULL,
                                          stderr=subprocess.DEVNULL))
        except OSError as exc:
            logger.warning("ping sweep aborted: %s", exc)
            break
        if len(batch) >= 80:
            _reap_pings(batch)
            batch = []
    _reap_pings(batch)
|
||
|
||
|
||
# --- subnet route (laptop role; darwin only — no linux laptop in the fleet) ----
|
||
|
||
def subnet_route_iface(cidr: str) -> str | None:
    """Interface the routing table currently uses to reach `cidr`, or None.

    Asks the OS for the route to one probe address inside the subnet. The
    probe is network+250 when the subnet is large enough (unchanged for a
    /24 or larger) and is clamped to the subnet's upper end otherwise, so
    the lookup never tests an address outside a smaller subnet.
    """
    net = ipaddress.ip_network(cidr, strict=False)
    offset = max(0, min(250, net.num_addresses - 2))
    probe = str(net.network_address + offset)

    if PLATFORM == "darwin":
        _, out, _ = _run([_bin("route", "/sbin/route"), "-n", "get", probe])
        for line in out.splitlines():
            if line.strip().startswith("interface:"):
                return line.split()[1]
        return None

    _, out, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "route", "get", probe])
    m = re.search(r"\bdev (\S+)", out)
    return m.group(1) if m else None
|
||
|
||
|
||
def set_subnet_route(cidr: str, iface: str) -> bool:
    """Route `cidr` out `iface` (darwin only). Returns True once verified.

    Tries `route change` first and confirms the result with
    subnet_route_iface(); if that does not hold, deletes and re-adds the
    route and verifies again. On linux it logs a warning and returns False.
    """
    if PLATFORM != "darwin":
        logger.warning("route switch not implemented on linux (no linux laptop role) — skipping")
        return False

    route = _bin("route", "/sbin/route")
    changed = _run([route, "-n", "change", cidr, "-interface", iface])[0] == 0
    if changed and subnet_route_iface(cidr) == iface:
        return True

    # `route change` can return 0 while leaving the route on a dead interface
    # index (the hotspot loop of 2026-06-10) — so rebuild it from scratch.
    _run([route, "-n", "delete", cidr])
    status, _, stderr = _run([route, "-n", "add", cidr, "-interface", iface])
    if status == 0 and subnet_route_iface(cidr) == iface:
        return True
    logger.error("failed to route %s via %s: %s", cidr, iface,
                 stderr.strip() or "route did not stick")
    return False
|
||
|
||
|
||
# --- wg endpoint pin + default-route healing (laptop role; darwin only) -------
|
||
|
||
def _lease_gateway(iface: str) -> tuple[str | None, str | None]:
    """Return (router_ip, iface) from one interface's DHCP lease, else (None, None).

    The interface must have an address (`ipconfig getifaddr`) and the lease
    must carry a router option (`ipconfig getoption <iface> router`).
    """
    ipconfig = _bin("ipconfig", "/usr/sbin/ipconfig")
    status, address, _ = _run([ipconfig, "getifaddr", iface])
    if status != 0 or not address.strip():
        return None, None
    status, reply, _ = _run([ipconfig, "getoption", iface, "router"])
    router = reply.strip()
    if status != 0 or not router:
        return None, None
    return router, iface
|
||
|
||
|
||
def physical_gateway() -> tuple[str | None, str | None]:
    """Return (router_ip, iface) of the physical uplink, or (None, None).

    Only interfaces named en* or bridge* count as physical. When the default
    route sits on such an interface, its DHCP lease router is used unless the
    routing table names a different gateway — after a network switch ipconfig
    can keep a STALE router (home 10.0.0.1) while the routing table already
    points at the new uplink (hotspot 172.20.10.1), so the routing table wins.
    Otherwise every physical interface is scanned in sorted order for a lease
    with a router. Always (None, None) off darwin.
    """
    if PLATFORM != "darwin":
        return None, None

    physical = ("en", "bridge")
    route_gw, route_if = default_route()
    if route_if and route_if.startswith(physical):
        lease_gw = _lease_gateway(route_if)[0]
        if lease_gw:
            stale_lease = bool(route_gw) and lease_gw != route_gw
            return (route_gw if stale_lease else lease_gw), route_if
        if route_gw:
            return route_gw, route_if

    ipconfig = _bin("ipconfig", "/usr/sbin/ipconfig")
    listing = _run([ipconfig, "getiflist"])[1]
    for name in sorted(listing.split()):
        if name.startswith(physical):
            lease_gw, lease_if = _lease_gateway(name)
            if lease_gw:
                return lease_gw, lease_if
    return None, None
|
||
|
||
|
||
def host_route(ip: str) -> tuple[bool, str | None, str | None]:
    """Return (is_host_pin, gateway, iface) for `ip` from `route -n get`.

    `is_host_pin` is True when the route's destination is exactly `ip`,
    i.e. a dedicated host route rather than a covering subnet/default route.
    """
    listing = _run([_bin("route", "/sbin/route"), "-n", "get", ip])[1]
    fields: dict[str, str] = {}
    for raw in listing.splitlines():
        entry = raw.strip()
        for label in ("destination:", "gateway:", "interface:"):
            if entry.startswith(label):
                fields[label] = entry.split()[1]
                break
    return fields.get("destination:") == ip, fields.get("gateway:"), fields.get("interface:")
|
||
|
||
|
||
def pin_endpoint_route(cfg: Config) -> None:
    """Keep a host route for the wg hub endpoint pinned out the physical uplink.

    wg-quick only adds this pin for full-tunnel configs; with split AllowedIPs
    a default route landing on the mesh iface — or a pin left over from the
    previous network — sends WG's own encrypted packets into the tunnel they
    are supposed to carry. Silent blackhole until converged here.

    No-op off darwin or when no hub endpoint is configured. With no physical
    uplink, an existing pin is dropped; otherwise the pin is (re)created
    unless it already points at the uplink gateway.
    """
    endpoint = cfg.hub_endpoint_ip
    if PLATFORM != "darwin" or not endpoint:
        return

    uplink_gw = physical_gateway()[0]
    pinned, current_gw, _ = host_route(endpoint)
    route = _bin("route", "/sbin/route")
    drop_pin = [route, "-n", "delete", "-host", endpoint]

    if not uplink_gw:
        # No uplink to validate against — a stale pin only misroutes.
        if pinned:
            _run(drop_pin)
            logger.warning("no physical uplink — dropped stale wg endpoint pin %s via %s",
                           endpoint, current_gw)
        return

    if pinned and current_gw == uplink_gw:
        return

    _run(drop_pin)
    status, _, stderr = _run([route, "-n", "add", "-host", endpoint, uplink_gw])
    if status != 0:
        logger.error("failed to pin wg endpoint %s via %s: %s", endpoint, uplink_gw, stderr.strip())
        return
    logger.info("pinned wg endpoint %s via %s (was %s)", endpoint, uplink_gw,
                current_gw if pinned else "unpinned")
|
||
|
||
|
||
def heal_default_route(cfg: Config) -> None:
    """Move a v4 default route off the mesh interface (darwin only).

    The mesh is split-tunnel by design — a v4 default route on the mesh iface
    is always wreckage from a network switch, and it blackholes ALL v4
    including WG's own handshake. With a physical uplink the default is
    pointed back at it (change, else delete+add, else restore the previous
    state); with none, the mesh default is simply dropped.
    """
    if PLATFORM != "darwin":
        return
    mesh_if = default_route()[1]
    if not mesh_if or mesh_if != iface_in_cidr(cfg.mesh_cidr):
        return

    uplink_gw, uplink_if = physical_gateway()
    route = _bin("route", "/sbin/route")

    if not uplink_gw:
        # A mesh default with no physical uplink blackholes DHCP — drop it so
        # Wi-Fi/hotspot can join (mid-switch or first connect).
        _run([route, "-n", "delete", "default"])
        logger.warning("default hijacked by mesh %s, no uplink — dropped mesh default for DHCP", mesh_if)
        return

    if _run([route, "-n", "change", "default", uplink_gw])[0] != 0:
        _run([route, "-n", "delete", "default"])
        status, _, stderr = _run([route, "-n", "add", "default", uplink_gw])
        if status != 0:
            # A blackholed default beats no default at all — restore prior state.
            _run([route, "-n", "add", "default", "-interface", mesh_if])
            logger.error("could not move default to %s (%s) — reverted to %s",
                         uplink_gw, stderr.strip(), mesh_if)
            return
    logger.info("healed hijacked default route: mesh iface %s → via %s on %s",
                mesh_if, uplink_gw, uplink_if)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Self-identity + roles (all derived from mesh-hosts.json — never hardcoded)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def identify_self(data: dict) -> dict | None:
    """Return this node's entry from data["hosts"], or None if it is not listed.

    Matching is tiered so a stronger signal is never shadowed by a weaker one
    that merely comes earlier in the list:

      1. short hostname equals the entry's name or one of its aliases
         (case-insensitive);
      2. the entry's `wg` address is configured locally;
      3. the entry's `lan` address is configured locally.

    Declared `lan` values can be stale relative to what a node actually
    holds, so a name match must win over another entry's declared address.
    Local addresses are only probed when the name tier finds nothing.
    """
    hosts = data.get("hosts", [])
    short = socket.gethostname().split(".")[0].lower()

    if short:
        for host in hosts:
            labels = {(host.get("name") or "").lower(),
                      *((alias or "").lower() for alias in host.get("aliases") or [])}
            labels.discard("")  # a missing name must never match
            if short in labels:
                return host

    ips = local_ipv4s()
    for key in ("wg", "lan"):
        for host in hosts:
            if host.get(key) in ips:
                return host
    return None
|
||
|
||
|
||
def render_user(repo_root: str, self_host: dict | None) -> str | None:
    """Pick the user whose ~/.ssh/config this agent maintains.

    Preference order: the host's DECLARED ssh_user (even root — on a
    root-operated VPS that is the real login user); else the owner of
    `repo_root` if it is not root; else, on darwin, the owner of /dev/console
    if it is not root. The fallbacks never guess root. Returns None when
    nothing qualifies.
    """
    declared = self_host.get("ssh_user") if self_host else None
    if declared:
        return declared

    try:
        repo_owner = pwd.getpwuid(os.stat(repo_root).st_uid).pw_name
    except (OSError, KeyError):
        repo_owner = None
    if repo_owner is not None and repo_owner != "root":
        return repo_owner

    if PLATFORM == "darwin":
        status, stdout, _ = _run(["/usr/bin/stat", "-f%Su", "/dev/console"])
        console_user = stdout.strip()
        if status == 0 and console_user and console_user != "root":
            return console_user
    return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Hostname convergence — the FLEET renames hosts, never a human with hostnamectl
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def enforce_hostname(name: str) -> None:
    """Converge this node's OS hostname to `name`; no-op when it already matches.

    The comparison uses the lower-cased short hostname. darwin sets HostName,
    LocalHostName and ComputerName through scutil and logs success only when
    all three succeed; linux uses `hostnamectl set-hostname`. Failures are
    logged and never raised.
    """
    current = socket.gethostname().split(".")[0].lower()
    if current == name:
        return

    if PLATFORM != "darwin":
        status, _, stderr = _run([_bin("hostnamectl", "/usr/bin/hostnamectl"), "set-hostname", name])
        if status == 0:
            logger.info("hostname converged: %s → %s (hostnamectl)", current, name)
        else:
            logger.error("hostnamectl set-hostname %s failed: %s", name, stderr.strip())
        return

    failures = 0
    for key in ("HostName", "LocalHostName", "ComputerName"):
        status, _, stderr = _run(["/usr/sbin/scutil", "--set", key, name])
        if status != 0:
            logger.error("scutil --set %s failed: %s", key, stderr.strip())
            failures += 1
    if not failures:
        logger.info("hostname converged: %s → %s (scutil ×3)", current, name)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Pull — propagate declared truth + this agent's own code
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _heal_pull_blockers(as_owner, repo_root: str, err: str) -> bool:
|
||
"""git refuses an ff merge over working files it would touch — even when
|
||
their content is byte-identical to the incoming commit (e.g. a tree that
|
||
was bridged by rsync before the same files landed upstream). Clearing only
|
||
exact matches is lossless: untracked twins are removed (the merge recreates
|
||
them verbatim), tracked edits matching the target are reset to HEAD (the
|
||
merge fast-forwards them straight back). Any file that differs stays put
|
||
and keeps blocking — real local work is never destroyed."""
|
||
paths: list[str] = []
|
||
collect = False
|
||
for line in err.splitlines():
|
||
if "would be overwritten by" in line:
|
||
collect = True
|
||
elif collect and line.startswith(("\t", " ")):
|
||
paths.append(line.strip())
|
||
else:
|
||
collect = False
|
||
if not paths:
|
||
return False
|
||
healed = 0
|
||
for p in paths:
|
||
rc, want, _ = as_owner(["rev-parse", f"@{{u}}:{p}"], 15)
|
||
if rc != 0:
|
||
return False
|
||
rc, have, _ = as_owner(["hash-object", "--", p], 15)
|
||
if rc != 0 or have.strip() != want.strip():
|
||
logger.warning("pull blocked by %s which differs from upstream — leaving it", p)
|
||
return False
|
||
if as_owner(["ls-files", "--error-unmatch", "--", p], 15)[0] == 0:
|
||
if as_owner(["checkout", "--", p], 15)[0] != 0:
|
||
return False
|
||
else:
|
||
try:
|
||
os.unlink(os.path.join(repo_root, p))
|
||
except OSError as exc:
|
||
logger.warning("pull self-heal: cannot remove %s: %s", p, exc)
|
||
return False
|
||
healed += 1
|
||
logger.info("pull self-heal: cleared %d file(s) byte-identical to upstream", healed)
|
||
return True
|
||
|
||
|
||
def git_pull(repo_root: str, ctx: dict) -> bool:
    """Fast-forward-only pull as the REPO OWNER; True iff HEAD moved.

    Running git as root would leave root-owned .git objects that break the
    autocommit service, so when this process is root and the repo owner is
    another user, git is run through `sudo -u <owner>`.

    Skipped (returns False) when NET_TOOLS_SKIP_PULL is set, `repo_root` has
    no .git directory, the last attempt was under PULL_MIN_INTERVAL_SEC ago,
    or the repo cannot be stat'ed. `ctx["last_pull"]` is stamped before the
    attempt, so failures are rate-limited too. A pull blocked by files
    identical to upstream is retried after _heal_pull_blockers() clears them
    (three attempts in total).

    Returns:
        True when HEAD changed — the caller exits so the agent restarts on
        the new code; False otherwise, including every failure.
    """
    if os.environ.get("NET_TOOLS_SKIP_PULL"):
        return False
    if not os.path.isdir(os.path.join(repo_root, ".git")):
        return False
    now = time.time()
    if now - ctx.get("last_pull", 0) < PULL_MIN_INTERVAL_SEC:
        return False
    ctx["last_pull"] = now

    # Stat once: the uid is needed both for the lookup and for the warning,
    # and a second stat inside the KeyError handler could raise uncaught.
    try:
        uid = os.stat(repo_root).st_uid
    except OSError:
        return False
    try:
        owner = pwd.getpwuid(uid).pw_name
    except KeyError:
        # The repo uid has no passwd entry (e.g. a clone carrying a foreign
        # uid) — there is no owner whose objects need protecting; run git as
        # ourselves rather than silently never pulling.
        logger.warning("repo %s owned by unknown uid %d — pulling as current user",
                       repo_root, uid)
        owner = None

    git = _bin("git", "/usr/bin/git")

    def as_owner(args: list[str], timeout: float = 60) -> tuple[int, str, str]:
        if owner is not None and owner != "root" and os.geteuid() == 0:
            return _run(["/usr/bin/sudo", "-u", owner, "-H", git, "-C", repo_root, *args], timeout)
        return _run([git, "-C", repo_root, *args], timeout)

    rc, before, _ = as_owner(["rev-parse", "HEAD"], 15)
    if rc != 0:
        return False

    err = ""
    for _attempt in range(3):  # untracked + tracked blockers can surface in separate aborts
        rc, _, err = as_owner(["pull", "--ff-only", "--quiet"])
        if rc == 0 or not _heal_pull_blockers(as_owner, repo_root, err):
            break
    if rc != 0:
        logger.warning("git pull failed (keeping current code/config): %s", err.strip()[:200])
        return False

    rc, after, _ = as_owner(["rev-parse", "HEAD"], 15)
    if rc == 0 and after.strip() != before.strip():
        logger.info("repo updated %s → %s — restarting agent for new code",
                    before.strip()[:8], after.strip()[:8])
        return True
    return False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Discover + render
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def lan_hosts_with_mac(data: dict, self_name: str | None) -> list[tuple[str, str]]:
    """List (name, normalised MAC) for every other host worth discovering.

    A host qualifies when it declares a MAC, is not `self_name`, and either
    declares a `lan` address or has class "laptop".
    """
    pairs: list[tuple[str, str]] = []
    for host in data.get("hosts", []):
        if not host.get("mac") or host.get("name") == self_name:
            continue
        if host.get("lan") is None and host.get("class") != "laptop":
            continue
        pairs.append((host["name"], norm_mac(host["mac"])))
    return pairs
|
||
|
||
|
||
def discover(cfg: Config, hosts: list[tuple[str, str]], ctx: dict) -> dict[str, str]:
    """Map host name -> current LAN IP for each (name, mac) in `hosts`.

    Looks MACs up in the neighbour table. If any wanted MAC is missing and
    the last sweep was at least SWEEP_MIN_INTERVAL_SEC ago, the LAN is
    ping-swept once (stamping ctx["last_sweep"]) and the table re-read.
    Addresses outside cfg.lan_cidr are ignored.
    """
    wanted = {mac for _, mac in hosts}
    arp = neighbors()
    if wanted - set(arp):
        stamp = time.time()
        if stamp - ctx.get("last_sweep", 0) >= SWEEP_MIN_INTERVAL_SEC:
            ctx["last_sweep"] = stamp
            ping_sweep(cfg.lan_cidr)
            arp = neighbors()

    def on_lan(address: str) -> bool:
        return ipaddress.ip_address(address) in ipaddress.ip_network(cfg.lan_cidr, strict=False)

    return {name: arp[mac] for name, mac in hosts if arp.get(mac) and on_lan(arp[mac])}
|
||
|
||
|
||
def render_views(repo_root: str, user: str | None) -> None:
    """Regenerate the derived views: /etc/hosts and `user`'s ~/.ssh/config.

    Runs `bin/mesh-hosts-render --install`, then — when `user` is given —
    `bin/host-apply --ssh-apply` as that user (directly if this process
    already is that user, otherwise through sudo). Both steps are best
    effort: a non-zero exit is logged as a warning instead of being silently
    dropped, and never raised.
    """
    status, _, stderr = _run([os.path.join(repo_root, "bin", "mesh-hosts-render"), "--install"])
    if status != 0:
        logger.warning("mesh-hosts-render --install failed (rc=%d): %s",
                       status, stderr.strip()[:200])
    if not user:
        return

    apply_cmd = [os.path.join(repo_root, "bin", "host-apply"), "--ssh-apply"]
    try:
        current = pwd.getpwuid(os.geteuid()).pw_name
    except KeyError:
        current = None
    if user != current:
        # Already being that user (e.g. root on a root-operated VPS) needs no sudo.
        apply_cmd = ["/usr/bin/sudo", "-u", user, "-H", *apply_cmd]
    status, _, stderr = _run(apply_cmd)
    if status != 0:
        logger.warning("host-apply --ssh-apply for %s failed (rc=%d): %s",
                       user, status, stderr.strip()[:200])
|
||
|
||
|
||
def ensure_tray(repo_root: str, user: str | None) -> None:
    """macOS laptop: make sure the menu-bar tray is running for `user`.

    The tray is owned by a per-user LaunchAgent (com.wireguard.vpn-tray —
    RunAtLoad + KeepAlive, see tray/install-tray.sh), which starts it at login
    and relaunches it on crash. When that agent's plist exists in the user's
    home directory we defer to launchd entirely; the ad-hoc Popen below is
    only a fallback for hosts where the agent hasn't been installed yet.

    Does nothing off darwin, without a user, when data/.tray-disabled exists
    (presumably the marker left when the user quits the tray — confirm
    against the tray code), when tray/vpn-tray is missing, or when a process
    matching vpn_tray.py is already running.
    """
    if PLATFORM != "darwin" or not user:
        return
    if os.path.isfile(os.path.join(repo_root, "data", ".tray-disabled")):
        return

    # Resolve the real home directory rather than assuming /Users/<user>.
    home = os.path.expanduser(f"~{user}")
    agent_plist = os.path.join(home, "Library", "LaunchAgents", "com.wireguard.vpn-tray.plist")
    if os.path.isfile(agent_plist):
        return  # launchd owns lifecycle (RunAtLoad + KeepAlive)

    tray = os.path.join(repo_root, "tray", "vpn-tray")
    if not os.path.isfile(tray):
        return
    rc, _, _ = _run(["/usr/bin/pgrep", "-f", "vpn_tray.py"], 2)
    if rc == 0:
        return
    try:
        subprocess.Popen(
            ["/usr/bin/sudo", "-u", user, "-H", tray],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,  # detach from this agent's session
        )
        logger.info("started menu-bar tray as %s", user)
    except OSError as exc:
        logger.warning("tray spawn failed: %s", exc)
|
||
|
||
|
||
def sync_mesh_dns(repo_root: str, self_name: str | None) -> None:
    """Run bin/wg-dns-sync, but only on the node named in mesh.dns_host.

    Silently does nothing when `self_name` is empty, mesh-hosts.json cannot
    be read or parsed, this node is not the DNS host, or the script is absent.
    """
    if not self_name:
        return
    try:
        document = load_json(os.path.join(repo_root, "data", "mesh-hosts.json"))
    except (OSError, json.JSONDecodeError):
        return
    if self_name != document.get("mesh", {}).get("dns_host"):
        return
    script = os.path.join(repo_root, "bin", "wg-dns-sync")
    if os.path.isfile(script):
        _run([script])
|
||
|
||
|
||
def sync_names(repo_root: str, discovered: dict[str, str], user: str | None,
               self_name: str | None = None) -> bool:
    """Merge `discovered` into data/lan-state.json and re-render on change.

    Entries for hosts not seen this cycle keep their last-known address. An
    unreadable, unparsable or non-object state file is treated as empty.
    When the merged state differs from the stored one it is written
    atomically (temp file + replace), the views are re-rendered and the mesh
    DNS refreshed.

    Returns:
        True when the state changed and was written; False when nothing
        changed or the write failed (logged, never raised).
    """
    state_path = os.path.join(repo_root, "data", "lan-state.json")
    old: dict[str, str] = {}
    if os.path.isfile(state_path):
        try:
            old = load_json(state_path)
        except (json.JSONDecodeError, OSError):
            old = {}
    if not isinstance(old, dict):
        old = {}  # valid JSON of the wrong shape — start over

    new = {**old, **discovered}  # keep last-known for hosts not seen this cycle
    if new == old:
        return False

    tmp = state_path + ".tmp"
    try:
        with open(tmp, "w", encoding="utf-8") as fh:
            json.dump(new, fh, indent=2, sort_keys=True)
        os.replace(tmp, state_path)
    except OSError as exc:
        logger.warning("lan-state write failed: %s", exc)
        return False
    try:
        os.chmod(state_path, 0o644)
    except OSError as exc:
        # The new state is already in place; a failed chmod must not stop
        # the render, or the views would never catch up with it.
        logger.warning("lan-state chmod failed: %s", exc)

    render_views(repo_root, user)
    sync_mesh_dns(repo_root, self_name)
    logger.info("names synced → %s", ", ".join(f"{k}={v}" for k, v in sorted(new.items())))
    return True
|
||
|
||
|
||
def repo_head(repo_root: str) -> str | None:
    """Short commit hash of the repo's HEAD, or None when git cannot report it.

    `safe.directory` is set for this one call so git does not refuse a repo
    owned by a different user than the caller.
    """
    argv = [_bin("git", "/usr/bin/git"), "-c", f"safe.directory={repo_root}",
            "-C", repo_root, "rev-parse", "--short", "HEAD"]
    status, stdout, _ = _run(argv, 10)
    short = stdout.strip()
    if status == 0 and short:
        return short
    return None
|
||
|
||
|
||
def write_status(cfg: Config, ctx: dict) -> None:
    """Write this agent's current view to data/agent-status.json.

    The snapshot is the feed for the tray, `fleet-status`, and any future web
    dashboard. It holds the timestamp, identity, platform, roles, location,
    the interface currently routing the LAN, the repo HEAD and the stored
    lan-state. The file is replaced atomically; a failed write is logged as
    a warning and never raised.
    """
    repo = ctx["repo_root"]
    lan_state = os.path.join(repo, "data", "lan-state.json")
    known: dict = {}
    if os.path.isfile(lan_state):
        try:
            known = load_json(lan_state)
        except (json.JSONDecodeError, OSError):
            pass  # unreadable state is reported as empty

    snapshot = {
        "ts": int(time.time()),
        "self": ctx["self_name"],
        "hostname": socket.gethostname(),
        "platform": PLATFORM,
        "roles": sorted(ctx["roles"]),
        "location": ctx.get("location"),
        "lan_route_via": subnet_route_iface(cfg.lan_cidr),
        "head": repo_head(ctx["repo_root"]),
        "discovered": known,
    }

    target = os.path.join(ctx["repo_root"], "data", "agent-status.json")
    scratch = target + ".tmp"
    try:
        with open(scratch, "w", encoding="utf-8") as fh:
            json.dump(snapshot, fh, indent=2, sort_keys=True)
        os.replace(scratch, target)
        os.chmod(target, 0o644)
    except OSError as exc:
        logger.warning("status write failed: %s", exc)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Reconcile — one cycle of the unified agent
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def is_home(cfg: Config) -> tuple[bool, str | None, str | None]:
    """Return (home, gateway, iface) for the current uplink.

    HOME requires both that the physical uplink's gateway equals cfg.gateway
    and that its MAC in the neighbour table equals cfg.gateway_mac. With no
    physical uplink the result is AWAY, reported with the default route's
    gateway and interface.
    """
    uplink_gw, uplink_if = physical_gateway()
    if not uplink_gw:
        route_gw, route_if = default_route()
        return False, route_gw, route_if
    at_home = uplink_gw == cfg.gateway and neighbor_mac(uplink_gw) == cfg.gateway_mac
    return at_home, uplink_gw, uplink_if
|
||
|
||
|
||
def default_route_hijacked(cfg: Config) -> str | None:
    """Return the mesh interface name when the v4 default route sits on it.

    Returns None otherwise, and always None off darwin.
    """
    if PLATFORM != "darwin":
        return None
    route_if = default_route()[1]
    mesh_if = iface_in_cidr(cfg.mesh_cidr)
    if route_if and mesh_if and route_if == mesh_if:
        return route_if
    return None
|
||
|
||
|
||
def preview_location(cfg: Config, roles: set[str] | frozenset[str] | None = None
                     ) -> tuple[bool, str | None, str | None, str | None]:
    """Report the location as it would read after the laptop's route-heal pass.

    Returns (home, gw, gwif, note). `note` is None when the default route is
    healthy. When the default route is hijacked by the mesh interface, the
    location is inferred from the physical uplink instead and `note` explains
    the situation; its wording depends on whether `roles` includes "route".
    """
    hijacker = default_route_hijacked(cfg)
    if not hijacker:
        home, gw, gwif = is_home(cfg)
        return home, gw, gwif, None

    uplink_gw, uplink_if = physical_gateway()
    if not uplink_gw:
        return False, None, hijacker, f"default hijacked by {hijacker} — no physical uplink found"

    home = uplink_gw == cfg.gateway and neighbor_mac(uplink_gw) == cfg.gateway_mac
    where = "HOME" if home else "AWAY"
    if roles and "route" in roles:
        note = f"default hijacked by {hijacker} — daemon heals then reports {where} via {uplink_if}"
    else:
        note = f"default hijacked by {hijacker} — physical uplink {where} via {uplink_if}"
    return home, uplink_gw, uplink_if, note
|
||
|
||
|
||
def reconcile(cfg: Config, data: dict, ctx: dict) -> bool:
    """Run one cycle. Returns True if the agent should exit for restart.

    Which steps run is decided by ctx["roles"]. The cycle reads and updates
    ctx in place ("location", "last_state", "rendered_once", plus the
    rate-limit stamps maintained by the helpers it calls). `data` is not
    used by this function.
    """
    # 1. pull (rate-limited; every node). When HEAD moved, render once with
    #    the fresh data and ask the caller to restart the agent.
    if git_pull(ctx["repo_root"], ctx):
        render_views(ctx["repo_root"], ctx["render_user"])
        return True

    # 1b. hostname convergence (fleet-gated, idempotent)
    if "hostname" in ctx["roles"] and ctx["self_name"]:
        enforce_hostname(ctx["self_name"])

    roles = ctx["roles"]

    # 1c. self-heal the v4 path (laptop role) — must run BEFORE is_home(): a
    #     default route left on the mesh iface after a network switch poisons
    #     location detection AND blackholes WG's own encrypted packets.
    if "route" in roles:
        heal_default_route(cfg)
        pin_endpoint_route(cfg)

    home, gw, gwif = is_home(cfg)
    ctx["location"] = "HOME" if home else "AWAY"

    # 2. route switch (laptop role only) — defer mid-join when no uplink yet
    if "route" in roles:
        if not home and physical_gateway()[0] is None:
            # Log the deferral only on the transition into this state.
            if ctx["last_state"] != "AWAY (no uplink — waiting)":
                logger.info("no physical uplink — deferring route switch")
            ctx["last_state"] = "AWAY (no uplink — waiting)"
        else:
            # HOME: LAN goes out the physical uplink; AWAY: via the wg iface.
            desired = gwif if home else iface_in_cidr(cfg.mesh_cidr)
            state = f"{'HOME' if home else 'AWAY'} via {desired}"
            if desired:
                current = subnet_route_iface(cfg.lan_cidr)
                if current != desired:
                    if set_subnet_route(cfg.lan_cidr, desired):
                        logger.info("%s → routing %s via %s (was %s)", state, cfg.lan_cidr, desired, current)
                    else:
                        # Distinct state string, so the next cycle logs again.
                        state += " UNCONVERGED"
                elif ctx["last_state"] != state:
                    logger.info("%s → %s already via %s", state, cfg.lan_cidr, desired)
            elif ctx["last_state"] != state:
                logger.warning("away and no wg interface up — leaving %s untouched", cfg.lan_cidr)
            ctx["last_state"] = state

    # 3. discover + render (any node that can see the home LAN right now).
    #    A node also discovers ITSELF from its own interfaces — ARP only sees
    #    peers, and without this a host's own vhosts render at the stale declared
    #    seed (bit apricot: its /etc/hosts said .116 while it sat on .118).
    my_lan_ip = next((ip for ip in local_ipv4s()
                      if ipaddress.ip_address(ip) in ipaddress.ip_network(cfg.lan_cidr, strict=False)), None)
    lan_visible = home or my_lan_ip is not None
    if "discover" in roles and lan_visible:
        found = discover(cfg, ctx["lan_hosts"], ctx) if ctx["lan_hosts"] else {}
        if ctx["self_name"] and my_lan_ip:
            found[ctx["self_name"]] = my_lan_ip
        if found:
            sync_names(ctx["repo_root"], found, ctx["render_user"], ctx["self_name"])

    # 4. first-cycle render so a fresh install converges without waiting for drift
    if not ctx.get("rendered_once"):
        ctx["rendered_once"] = True
        render_views(ctx["repo_root"], ctx["render_user"])

    # 5. menu-bar tray (route role only) — ensure_tray() leaves the lifecycle
    #    to the per-user LaunchAgent when its plist is installed and only
    #    spawns the tray itself as a fallback.
    if "route" in roles:
        ensure_tray(ctx["repo_root"], ctx["render_user"])
    return False
|
||
|
||
|
||
def build_ctx(data_file: str) -> dict:
    """Build the mutable context dict shared by every reconcile cycle.

    The node's roles are derived from its own entry in the declared data:
    every node gets "pull" and "render"; a node with a ``lan`` entry or of
    class ``laptop`` also gets "discover"; a ``laptop`` on darwin gets
    "route"; and "hostname" is added when the fleet-level
    ``enforce_hostname`` flag is set.

    Args:
        data_file: path to the declared-truth JSON file.

    Returns:
        A dict holding the repo root, this node's name / lan entry / roles,
        the render user, the LAN hosts that carry a MAC, and the per-cycle
        bookkeeping fields (``last_state``, ``last_sweep``, ``last_pull``).
    """
    declared = load_json(data_file)
    # The repo root is taken as the directory two levels above the data file.
    root = os.path.dirname(os.path.dirname(data_file))
    me = identify_self(declared)

    self_name = None
    self_lan = None
    roles = {"pull", "render"}
    if me:
        self_name = me.get("name")
        self_lan = me.get("lan")
        is_laptop = me.get("class") == "laptop"
        if self_lan is not None or is_laptop:
            roles.add("discover")
        if is_laptop and PLATFORM == "darwin":
            roles.add("route")
        if declared.get("fleet", {}).get("enforce_hostname"):
            roles.add("hostname")

    ctx = {
        "repo_root": root,
        "self_name": self_name,
        "self_lan": self_lan,
        "roles": roles,
        "render_user": render_user(root, me),
        "lan_hosts": lan_hosts_with_mac(declared, self_name),
    }
    # Per-cycle bookkeeping starts empty / at the epoch.
    ctx.update({"last_state": None, "last_sweep": 0.0, "last_pull": 0.0})
    return ctx
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """Parse arguments, load the declared config, and run the reconcile loop.

    Args:
        argv: argument list to parse; ``None`` lets argparse use the process
            arguments.

    Returns:
        0 on a clean exit (``--status``, ``--once``, a termination signal, or
        a reconcile that requested a restart because HEAD moved); 1 when the
        config cannot be loaded or a mutating run is started without root.
    """
    ap = argparse.ArgumentParser(description="net-tools fleet agent (source of truth: data/mesh-hosts.json)")
    ap.add_argument("--once", action="store_true", help="run one cycle and exit")
    ap.add_argument("--interval", type=int, default=PROBE_INTERVAL_SEC)
    ap.add_argument("--status", action="store_true", help="inspect, change nothing (no root needed)")
    args = ap.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")

    try:
        data_file = find_data_file()
        cfg = load_config(data_file)
        ctx = build_ctx(data_file)
    except (FileNotFoundError, ValueError, KeyError) as e:
        logger.error("config: %s", e)
        return 1

    if args.status:
        # Read-only inspection: prints the current view and changes nothing.
        home, gw, gwif, note = preview_location(cfg, ctx["roles"])
        print(f"platform : {PLATFORM}")
        print(f"self : {ctx['self_name'] or 'UNKNOWN (not in mesh-hosts.json!)'}"
              f" roles: {', '.join(sorted(ctx['roles']))}")
        print(f"location : {'HOME' if home else 'AWAY'} (gw {gw} on {gwif})")
        if note:
            print(f"route : {note}")
        print(f"{cfg.lan_cidr} via: {subnet_route_iface(cfg.lan_cidr)} wg iface: {iface_in_cidr(cfg.mesh_cidr)}")
        print(f"render user: {ctx['render_user']}")
        sp = os.path.join(ctx["repo_root"], "data", "lan-state.json")
        print(f"discovered: {load_json(sp) if os.path.isfile(sp) else '(none yet)'}")
        return 0

    if os.geteuid() != 0:
        logger.error("must run as root (route + /etc/hosts changes); use --status to inspect")
        return 1

    logger.info("started — platform=%s self=%s roles=%s interval=%ds",
                PLATFORM, ctx["self_name"], ",".join(sorted(ctx["roles"])), args.interval)

    # One-element list so the nested signal handler can flip the flag.
    stop = [False]

    def _sig(signum, _f):
        logger.info("signal %d — exiting after cycle", signum)
        stop[0] = True

    signal.signal(signal.SIGTERM, _sig)
    signal.signal(signal.SIGINT, _sig)

    last_cfg, last_data = cfg, load_json(data_file)
    while True:
        # Reload the declared truth each cycle. Load everything into
        # temporaries first so a partial failure cannot leave a fresh config
        # paired with stale data: either all three values advance, or none do.
        try:
            new_cfg = load_config(data_file)
            new_data = load_json(data_file)
            new_hosts = lan_hosts_with_mac(new_data, ctx["self_name"])
        except Exception:
            logger.exception("config reload failed — using last-good")
        else:
            last_cfg, last_data = new_cfg, new_data
            ctx["lan_hosts"] = new_hosts

        try:
            head_moved = reconcile(last_cfg, last_data, ctx)
        except Exception:
            logger.exception("reconcile failed")
            head_moved = False

        # The status write is guarded separately from reconcile so that a
        # failure here can neither be misreported as a reconcile failure nor
        # swallow the restart requested below.
        try:
            write_status(last_cfg, ctx)
        except Exception:
            logger.exception("status write failed")

        if head_moved:
            return 0  # HEAD moved: exit; launchd/systemd restart with new code

        if args.once or stop[0]:
            return 0

        # Sleep in one-second slices so a signal ends the wait promptly.
        slept = 0
        while slept < args.interval and not stop[0]:
            time.sleep(1)
            slept += 1
|
||
|
||
|
||
# Script entry point: run the agent and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|