net-tools/smart-lan-router/smart-lan-router.py
Natalie 6e6512abf6 fix(tray): own the menu-bar tray with a RunAtLoad+KeepAlive LaunchAgent
The tray's Quit handler already boots out com.wireguard.vpn-tray, but install-tray.sh
had retired that launchd job and relied on the fleet agent to nohup it — which never
ran the tray reliably at boot (no GUI session yet). Restore the LaunchAgent (same
pattern as com.lilith.mac-sync): RunAtLoad starts it at login in the GUI session,
KeepAlive relaunches on crash. ensure_tray() now defers to launchd when the agent is
installed (Popen path kept as fallback). Removes the dead standalone plist.
2026-06-22 22:39:11 -04:00

909 lines
37 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""smart-lan-router — the ONE net-tools agent, same code on every fleet node.
Every node (osx / ubuntu / bluefin) runs this identical service; what a node
actually *does* each cycle is derived from its own entry in data/mesh-hosts.json
— not from which variant got installed:
pull every node git pull the repo (declared truth + this agent's own
code propagate; on a code change the agent exits so
launchd/systemd restart it fresh)
discover LAN-visible map each declared MAC -> current DHCP IP via the
nodes neighbour table (ARP / ip neigh); write
data/lan-state.json on change
route class=laptop HOME (gateway MAC == lan.gateway_mac) -> route the
(darwin) LAN /24 out the physical interface (~3ms); AWAY ->
via the wg mesh (Iceland hub) so home stays reachable
render every node on any change, regenerate /etc/hosts
(mesh-hosts-render --install) and the render user's
~/.ssh/config (host-apply --ssh-apply)
Locked-down platforms (ios / android) cannot run agents at all — they consume
names via apricot's mesh dnsmasq (wg-dns-sync) with DNS=10.9.0.2 in their
WireGuard config. Windows is a documented non-goal until a Windows node exists.
Safety properties (unchanged from the single-node daemon):
* subnet route, never per-host /32 pins (macOS self-MAC ARP bug)
* home is fingerprinted by the GATEWAY MAC, so a café 10.0.0.0/24 is AWAY
* bad config read -> keep last-good, never tear down routing
* git pull runs as the REPO OWNER, never root (preserves autocommit ownership)
Requires root for route changes + /etc/hosts writes. `--status` needs none.
"""
from __future__ import annotations
import argparse
import ipaddress
import json
import logging
import os
import pwd
import re
import shutil
import signal
import socket
import subprocess
import sys
import time
from dataclasses import dataclass
# Normalised platform tag: anything that is not macOS is treated as linux.
PLATFORM = "darwin" if sys.platform == "darwin" else "linux"
# Default number of seconds main() waits between cycles (--interval overrides it).
PROBE_INTERVAL_SEC = 20
# Default timeout, in seconds, for helper commands started through _run().
SUBPROCESS_TIMEOUT_SEC = 8
SWEEP_MIN_INTERVAL_SEC = 300 # don't ping-sweep the /24 more often than this
PULL_MIN_INTERVAL_SEC = 300 # don't git-pull more often than this
# Module-wide logger; main() installs the handler and format via basicConfig.
logger = logging.getLogger("smart-lan-router")
@dataclass(frozen=True)
class Config:
    """Immutable routing settings, built by load_config() from mesh-hosts.json."""

    # Home LAN subnet in CIDR form, e.g. "10.0.0.0/24".
    lan_cidr: str
    # Home gateway address, e.g. "10.0.0.1".
    gateway: str
    # Gateway MAC — the fingerprint that identifies the home LAN.
    gateway_mac: str
    # Mesh subnet, e.g. "10.9.0.0/24"; used to locate the wg interface.
    mesh_cidr: str
    # Public IP of the wg hub (from mesh.hub_endpoint), or None when unset.
    hub_endpoint_ip: str | None
# ---------------------------------------------------------------------------
# Source of truth
# ---------------------------------------------------------------------------
def find_data_file() -> str:
    """Locate data/mesh-hosts.json by walking upward from this script.

    The script path is first resolved through any chain of symlinks, then each
    ancestor directory (up to and including "/") is checked in turn.

    Returns:
        Path of the first data/mesh-hosts.json found.

    Raises:
        FileNotFoundError: no ancestor directory contains the file.
    """
    script = os.path.abspath(__file__)
    # Follow symlinks by hand; a relative target is resolved against the
    # directory that holds the link.
    while os.path.islink(script):
        target = os.readlink(script)
        if not os.path.isabs(target):
            target = os.path.join(os.path.dirname(script), target)
        script = target

    relative = os.path.join("data", "mesh-hosts.json")
    directory = os.path.dirname(script)
    while True:
        candidate = os.path.join(directory, relative)
        if os.path.isfile(candidate):
            return candidate
        if directory == "/":
            break
        directory = os.path.dirname(directory)
    raise FileNotFoundError(f"cannot locate data/mesh-hosts.json from {script}")
def load_json(path: str) -> dict:
    """Read the UTF-8 file at `path` and return its decoded JSON content."""
    with open(path, encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)
def load_config(data_file: str) -> Config:
    """Build the routing Config from the `lan` and `mesh` sections of the data file.

    Args:
        data_file: path to data/mesh-hosts.json.

    Returns:
        A Config. `gateway_mac` is passed through norm_mac() so it has the same
        shape as the keys produced by neighbors() (which it is compared
        against); `hub_endpoint_ip` is the part of mesh.hub_endpoint before
        the last ":" or None when that is empty.

    Raises:
        ValueError: lan.gateway_mac is missing or empty.
        KeyError: lan.cidr, lan.gateway or mesh.cidr is missing.
    """
    data = load_json(data_file)
    lan, mesh = data.get("lan", {}), data.get("mesh", {})
    gw_mac = lan.get("gateway_mac")
    if not gw_mac:
        raise ValueError("mesh-hosts.json lan.gateway_mac is required (home-LAN fingerprint)")
    endpoint = mesh.get("hub_endpoint") or ""
    return Config(
        lan_cidr=lan["cidr"],
        gateway=lan["gateway"],
        # Lower-casing alone leaves unpadded octets ("0:1a:…") that would
        # never equal the zero-padded MACs coming out of the neighbour table.
        gateway_mac=norm_mac(gw_mac),
        mesh_cidr=mesh["cidr"],
        hub_endpoint_ip=endpoint.rsplit(":", 1)[0] or None,
    )
# ---------------------------------------------------------------------------
# Subprocess + platform shims
# ---------------------------------------------------------------------------
def _run(argv: list[str], timeout: float = SUBPROCESS_TIMEOUT_SEC) -> tuple[int, str, str]:
    """Run `argv` without a shell and return (returncode, stdout, stderr).

    Failures to run are mapped to return codes instead of exceptions:
    124 when the command exceeds `timeout`, 127 when the executable is missing.
    """
    try:
        done = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return 124, "", f"timeout after {timeout}s"
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    return done.returncode, done.stdout, done.stderr
def _bin(name: str, *fallbacks: str) -> str:
found = shutil.which(name)
if found:
return found
for f in fallbacks:
if os.path.exists(f):
return f
return name
def norm_mac(mac: str) -> str:
    """Return `mac` as lower-case, zero-padded, colon-separated hex octets.

    Input that cannot be parsed that way (non-hex parts, None) falls back to
    the lower-cased original, or "" for a falsy value.
    """
    try:
        octets = [int(part, 16) for part in mac.split(":")]
    except (ValueError, AttributeError):
        return (mac or "").lower()
    return ":".join(f"{octet:02x}" for octet in octets)
def neighbors() -> dict[str, str]:
    """Map normalised MAC -> IPv4 from the neighbour table.

    Uses `arp -a -n` on darwin and `ip -4 neigh show` elsewhere; linux entries
    in state FAILED or INCOMPLETE are ignored. A later line for the same MAC
    replaces an earlier one.
    """
    found: dict[str, str] = {}
    if PLATFORM != "darwin":
        _, text, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "-4", "neigh", "show"])
        pattern = re.compile(r"(\d+\.\d+\.\d+\.\d+)\s.*\slladdr\s+([0-9a-f:]{1,17})\s+(\S+)")
        for entry in text.splitlines():
            hit = pattern.match(entry)
            if hit is None or hit.group(3) in ("FAILED", "INCOMPLETE"):
                continue
            found[norm_mac(hit.group(2))] = hit.group(1)
        return found

    _, text, _ = _run([_bin("arp", "/usr/sbin/arp"), "-a", "-n"])
    pattern = re.compile(r"\((\d+\.\d+\.\d+\.\d+)\) at ([0-9a-f:]{1,17}) ")
    for entry in text.splitlines():
        hit = pattern.search(entry)
        if hit is not None:
            found[norm_mac(hit.group(2))] = hit.group(1)
    return found
def neighbor_mac(ip: str) -> str | None:
    """Return the MAC the neighbour table currently holds for `ip`, or None."""
    return next((mac for mac, addr in neighbors().items() if addr == ip), None)
def _ipv4(s: str | None) -> bool:
return bool(s and re.match(r"^\d+\.\d+\.\d+\.\d+$", s))
def default_route() -> tuple[str | None, str | None]:
    """Return (gateway_ip, interface) of the current default route.

    Either element is None when it cannot be read. On darwin a gateway that is
    not a dotted-quad IPv4 address is reported as None.
    """
    if PLATFORM != "darwin":
        _, text, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "route", "show", "default"])
        hit = re.search(r"default via (\d+\.\d+\.\d+\.\d+) dev (\S+)", text)
        if hit is None:
            return None, None
        return hit.group(1), hit.group(2)

    _, text, _ = _run([_bin("route", "/sbin/route"), "-n", "get", "default"])
    gateway = interface = None
    for raw in text.splitlines():
        field = raw.strip()
        if field.startswith("gateway:"):
            candidate = field.split(":", 1)[1].strip().split()[0]
            gateway = candidate if _ipv4(candidate) else None
        elif field.startswith("interface:"):
            interface = field.split()[1]
    return gateway, interface
def local_ipv4s() -> set[str]:
    """Return every IPv4 address listed by ifconfig (darwin) or `ip addr` (linux)."""
    if PLATFORM == "darwin":
        argv = [_bin("ifconfig", "/sbin/ifconfig")]
    else:
        argv = [_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "-o", "-4", "addr", "show"]
    _, text, _ = _run(argv)
    return set(re.findall(r"inet (\d+\.\d+\.\d+\.\d+)", text))
def iface_in_cidr(cidr: str) -> str | None:
    """Name of the first local interface with an address inside `cidr`, else None.

    Used to find the wg interface from the mesh subnet.
    """
    network = ipaddress.ip_network(cidr, strict=False)

    if PLATFORM != "darwin":
        _, text, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "-o", "-4", "addr", "show"])
        for entry in text.splitlines():
            hit = re.match(r"\d+:\s+(\S+)\s+inet (\d+\.\d+\.\d+\.\d+)", entry)
            if hit and ipaddress.ip_address(hit.group(2)) in network:
                return hit.group(1)
        return None

    _, text, _ = _run([_bin("ifconfig", "/sbin/ifconfig")])
    current = None
    for entry in text.splitlines():
        if not entry or entry[0].isspace():
            # Indented (or blank) line: a property of the interface seen last.
            hit = re.search(r"inet (\d+\.\d+\.\d+\.\d+)", entry)
            if hit and ipaddress.ip_address(hit.group(1)) in network:
                return current
        else:
            # Unindented line starts a new interface stanza: "<name>: flags=…".
            current = entry.split(":")[0]
    return None
def ping_sweep(cidr: str) -> None:
    """Briefly ping every host in the subnet to populate the neighbour table.

    Pings are launched in batches of 80; each child gets up to 3 seconds to
    finish before it is killed. Output is discarded — only the side effect on
    the neighbour table matters.

    Args:
        cidr: subnet to sweep, e.g. "10.0.0.0/24".
    """
    ping = _bin("ping", "/sbin/ping", "/usr/bin/ping")
    # One probe with a one-second limit; the flag spelling differs per platform.
    flags = ["-c1", "-t1"] if PLATFORM == "darwin" else ["-c1", "-W1"]

    def _drain(procs: list[subprocess.Popen]) -> None:
        """Wait for each child, killing and reaping the ones that overrun."""
        for proc in procs:
            try:
                proc.wait(timeout=3)
            except subprocess.TimeoutExpired:
                proc.kill()
                # Collect the exit status so the killed child does not linger
                # as a zombie.
                proc.wait()

    batch: list[subprocess.Popen] = []
    for host in ipaddress.ip_network(cidr, strict=False).hosts():
        batch.append(subprocess.Popen([ping, *flags, str(host)],
                                      stdout=subprocess.DEVNULL,
                                      stderr=subprocess.DEVNULL))
        if len(batch) >= 80:
            _drain(batch)
            batch = []
    _drain(batch)
# --- subnet route (laptop role; darwin only — no linux laptop in the fleet) ----
def subnet_route_iface(cidr: str) -> str | None:
    """Interface the routing table currently uses to reach `cidr`, or None.

    The answer comes from asking the OS how it would route one probe address
    inside the subnet (`route -n get` on darwin, `ip route get` on linux).

    Args:
        cidr: subnet to inspect, e.g. "10.0.0.0/24".

    Returns:
        Interface name, or None when the command output names none.
    """
    net = ipaddress.ip_network(cidr, strict=False)
    # Probe network+250 (".250" in a /24), clamped so the probe stays inside
    # subnets that hold fewer than 251 addresses.
    offset = min(250, max(net.num_addresses - 2, 0))
    probe = str(net.network_address + offset)
    if PLATFORM == "darwin":
        _, out, _ = _run([_bin("route", "/sbin/route"), "-n", "get", probe])
        for line in out.splitlines():
            if line.strip().startswith("interface:"):
                return line.split()[1]
        return None
    _, out, _ = _run([_bin("ip", "/usr/sbin/ip", "/sbin/ip"), "route", "get", probe])
    m = re.search(r"\bdev (\S+)", out)
    return m.group(1) if m else None
def set_subnet_route(cidr: str, iface: str) -> bool:
    """Point the route for `cidr` at `iface` (darwin only); True when it stuck.

    Tries `route change` first and verifies the result; if that did not take,
    the route is deleted and re-added, then verified again.
    """
    if PLATFORM != "darwin":
        logger.warning("route switch not implemented on linux (no linux laptop role) — skipping")
        return False

    route_bin = _bin("route", "/sbin/route")
    changed, _, _ = _run([route_bin, "-n", "change", cidr, "-interface", iface])
    if changed == 0 and subnet_route_iface(cidr) == iface:
        return True

    # `route change` can return 0 while leaving the route on a dead interface
    # index (the hotspot loop of 2026-06-10) — verify, else rebuild from scratch
    _run([route_bin, "-n", "delete", cidr])
    added, _, stderr = _run([route_bin, "-n", "add", cidr, "-interface", iface])
    if added == 0 and subnet_route_iface(cidr) == iface:
        return True
    logger.error("failed to route %s via %s: %s", cidr, iface,
                 stderr.strip() or "route did not stick")
    return False
# --- wg endpoint pin + default-route healing (laptop role; darwin only) -------
def _lease_gateway(iface: str) -> tuple[str | None, str | None]:
    """Return (router_ip, iface) from one interface's DHCP lease, else (None, None).

    The interface must report an address (`ipconfig getifaddr`) and a
    non-empty `router` option (`ipconfig getoption`).
    """
    tool = _bin("ipconfig", "/usr/sbin/ipconfig")
    status, address, _ = _run([tool, "getifaddr", iface])
    if status != 0 or not address.strip():
        return None, None
    status, raw_router, _ = _run([tool, "getoption", iface, "router"])
    router_ip = raw_router.strip()
    if status != 0 or not router_ip:
        return None, None
    return router_ip, iface
def physical_gateway() -> tuple[str | None, str | None]:
    """Return (router_ip, iface) of the physical uplink, or (None, None).

    After a network switch ipconfig can keep a STALE DHCP router (home
    10.0.0.1) while the routing table already points at the new uplink
    (hotspot 172.20.10.1). When they disagree, the routing table wins.
    Always (None, None) off darwin.
    """
    if PLATFORM != "darwin":
        return None, None

    physical = ("en", "bridge")
    table_gw, table_if = default_route()
    if table_if and table_if.startswith(physical):
        lease_gw, _ = _lease_gateway(table_if)
        # Routing-table gateway beats the lease; the lease is used only when
        # the table carries no gateway for this interface.
        chosen = table_gw or lease_gw
        if chosen:
            return chosen, table_if

    # Default route is not on a physical interface (or gave no gateway):
    # take the first physical interface, in name order, that has a lease.
    tool = _bin("ipconfig", "/usr/sbin/ipconfig")
    _, listing, _ = _run([tool, "getiflist"])
    for candidate in sorted(listing.split()):
        if candidate.startswith(physical):
            lease_gw, lease_if = _lease_gateway(candidate)
            if lease_gw:
                return lease_gw, lease_if
    return None, None
def host_route(ip: str) -> tuple[bool, str | None, str | None]:
    """Return (is_host_pin, gateway, iface) for `ip` per `route -n get`.

    `is_host_pin` is True when the reported destination equals `ip` itself.
    """
    _, text, _ = _run([_bin("route", "/sbin/route"), "-n", "get", ip])
    fields: dict[str, str | None] = {"destination:": None, "gateway:": None, "interface:": None}
    for raw in text.splitlines():
        entry = raw.strip()
        for label in fields:
            if entry.startswith(label):
                fields[label] = entry.split()[1]
                break
    return fields["destination:"] == ip, fields["gateway:"], fields["interface:"]
def pin_endpoint_route(cfg: Config) -> None:
    """Keep a host route for the wg hub endpoint pinned out the physical uplink.

    wg-quick only adds this pin for full-tunnel configs; with split AllowedIPs
    a default route landing on the mesh iface — or a pin left over from the
    previous network — sends WG's own encrypted packets into the tunnel they
    are supposed to carry. Silent blackhole until converged here.
    No-op off darwin or when no hub endpoint is configured.
    """
    endpoint = cfg.hub_endpoint_ip
    if PLATFORM != "darwin" or not endpoint:
        return

    uplink_gw, _ = physical_gateway()
    is_pinned, current_gw, _ = host_route(endpoint)
    route_bin = _bin("route", "/sbin/route")
    drop_pin = [route_bin, "-n", "delete", "-host", endpoint]

    if not uplink_gw:
        if is_pinned:  # no uplink to validate against — a stale pin only misroutes
            _run(drop_pin)
            logger.warning("no physical uplink — dropped stale wg endpoint pin %s via %s",
                           endpoint, current_gw)
        return

    if is_pinned and current_gw == uplink_gw:
        return  # already converged

    _run(drop_pin)
    status, _, stderr = _run([route_bin, "-n", "add", "-host", endpoint, uplink_gw])
    if status != 0:
        logger.error("failed to pin wg endpoint %s via %s: %s", endpoint, uplink_gw, stderr.strip())
        return
    previous = current_gw if is_pinned else "unpinned"
    logger.info("pinned wg endpoint %s via %s (was %s)", endpoint, uplink_gw, previous)
def heal_default_route(cfg: Config) -> None:
    """Move a v4 default route off the mesh interface, back to the uplink.

    The mesh is split-tunnel by design — a v4 default route on the mesh
    iface is always wreckage from a network switch, and it blackholes ALL v4
    including WG's own handshake. No-op off darwin or when the default route
    is not on the mesh interface.
    """
    if PLATFORM != "darwin":
        return
    _, default_if = default_route()
    if not default_if or default_if != iface_in_cidr(cfg.mesh_cidr):
        return

    uplink_gw, uplink_if = physical_gateway()
    route_bin = _bin("route", "/sbin/route")
    drop_default = [route_bin, "-n", "delete", "default"]

    if not uplink_gw:
        # A mesh default with no physical uplink blackholes DHCP — drop it so
        # WiFi/hotspot can join (mid-switch or first connect).
        _run(drop_default)
        logger.warning("default hijacked by mesh %s, no uplink — dropped mesh default for DHCP",
                       default_if)
        return

    status = _run([route_bin, "-n", "change", "default", uplink_gw])[0]
    if status != 0:
        _run(drop_default)
        status, _, stderr = _run([route_bin, "-n", "add", "default", uplink_gw])
        if status != 0:
            # a blackholed default beats no default at all — restore prior state
            _run([route_bin, "-n", "add", "default", "-interface", default_if])
            logger.error("could not move default to %s (%s) — reverted to %s",
                         uplink_gw, stderr.strip(), default_if)
            return
    logger.info("healed hijacked default route: mesh iface %s → via %s on %s",
                default_if, uplink_gw, uplink_if)
# ---------------------------------------------------------------------------
# Self-identity + roles (all derived from mesh-hosts.json — never hardcoded)
# ---------------------------------------------------------------------------
def identify_self(data: dict) -> dict | None:
    """Return this node's entry from data["hosts"], or None when none matches.

    A host matches when the short local hostname equals its name or an alias
    (case-insensitive), or when its declared `lan` / `wg` address is one of
    this machine's IPv4 addresses. The first match in list order wins.
    """
    short = socket.gethostname().split(".")[0].lower()
    addresses = local_ipv4s()
    for host in data.get("hosts", []):
        labels = {host.get("name", "").lower()}
        labels.update(alias.lower() for alias in host.get("aliases", []))
        if short in labels:
            return host
        if host.get("lan") in addresses or host.get("wg") in addresses:
            return host
    return None
def render_user(repo_root: str, self_host: dict | None) -> str | None:
    """Pick the account whose ~/.ssh/config this agent maintains.

    Order: the host's DECLARED ssh_user (even root — on a root-operated VPS
    that is the real login user), then the owner of `repo_root`, then (darwin)
    the owner of /dev/console. The two fallbacks never return root. None when
    nothing qualifies.
    """
    declared = self_host.get("ssh_user") if self_host else None
    if declared:
        return declared

    try:
        repo_owner = pwd.getpwuid(os.stat(repo_root).st_uid).pw_name
    except (OSError, KeyError):
        repo_owner = "root"  # unknown owner: fall through like a root-owned repo
    if repo_owner != "root":
        return repo_owner

    if PLATFORM == "darwin":
        status, text, _ = _run(["/usr/bin/stat", "-f%Su", "/dev/console"])
        console_user = text.strip()
        if status == 0 and console_user and console_user != "root":
            return console_user
    return None
# ---------------------------------------------------------------------------
# Hostname convergence — the FLEET renames hosts, never a human with hostnamectl
# ---------------------------------------------------------------------------
def enforce_hostname(name: str) -> None:
    """Converge this node's OS hostname to its canonical mesh-hosts.json name.

    Gated by fleet.enforce_hostname in the source of truth. Idempotent; kills
    relic FQDNs (plum.voyager.nasty.sh, 0.vps.1984.uvlava.com) as a side effect.

    The comparison is case-insensitive on both sides, so a declared name with
    capitals does not re-run the rename every cycle.

    Args:
        name: the canonical host name to apply.
    """
    current = socket.gethostname().split(".")[0].lower()
    if current == name.lower():
        return
    if PLATFORM == "darwin":
        # macOS keeps three separate names; set all of them.
        ok = True
        for key in ("HostName", "LocalHostName", "ComputerName"):
            rc, _, err = _run(["/usr/sbin/scutil", "--set", key, name])
            if rc != 0:
                logger.error("scutil --set %s failed: %s", key, err.strip())
                ok = False
        if ok:
            logger.info("hostname converged: %s → %s (scutil ×3)", current, name)
    else:
        rc, _, err = _run([_bin("hostnamectl", "/usr/bin/hostnamectl"), "set-hostname", name])
        if rc == 0:
            logger.info("hostname converged: %s → %s (hostnamectl)", current, name)
        else:
            logger.error("hostnamectl set-hostname %s failed: %s", name, err.strip())
# ---------------------------------------------------------------------------
# Pull — propagate declared truth + this agent's own code
# ---------------------------------------------------------------------------
def _heal_pull_blockers(as_owner, repo_root: str, err: str) -> bool:
"""git refuses an ff merge over working files it would touch — even when
their content is byte-identical to the incoming commit (e.g. a tree that
was bridged by rsync before the same files landed upstream). Clearing only
exact matches is lossless: untracked twins are removed (the merge recreates
them verbatim), tracked edits matching the target are reset to HEAD (the
merge fast-forwards them straight back). Any file that differs stays put
and keeps blocking — real local work is never destroyed."""
paths: list[str] = []
collect = False
for line in err.splitlines():
if "would be overwritten by" in line:
collect = True
elif collect and line.startswith(("\t", " ")):
paths.append(line.strip())
else:
collect = False
if not paths:
return False
healed = 0
for p in paths:
rc, want, _ = as_owner(["rev-parse", f"@{{u}}:{p}"], 15)
if rc != 0:
return False
rc, have, _ = as_owner(["hash-object", "--", p], 15)
if rc != 0 or have.strip() != want.strip():
logger.warning("pull blocked by %s which differs from upstream — leaving it", p)
return False
if as_owner(["ls-files", "--error-unmatch", "--", p], 15)[0] == 0:
if as_owner(["checkout", "--", p], 15)[0] != 0:
return False
else:
try:
os.unlink(os.path.join(repo_root, p))
except OSError as exc:
logger.warning("pull self-heal: cannot remove %s: %s", p, exc)
return False
healed += 1
logger.info("pull self-heal: cleared %d file(s) byte-identical to upstream", healed)
return True
def git_pull(repo_root: str, ctx: dict) -> bool:
    """Fast-forward-only pull of the repo, run as the REPO OWNER.

    Root-owned .git objects would break the autocommit service, so when this
    process is root and the repo belongs to another user, git is run through
    sudo as that user.

    Skipped (returns False) when NET_TOOLS_SKIP_PULL is set, when `repo_root`
    has no .git directory, when the last attempt was less than
    PULL_MIN_INTERVAL_SEC ago, or when the repo cannot be stat'ed.

    Args:
        repo_root: repository root directory.
        ctx: agent state; `last_pull` is read and updated here.

    Returns:
        True iff HEAD moved (caller exits to restart).
    """
    if os.environ.get("NET_TOOLS_SKIP_PULL"):
        return False
    if not os.path.isdir(os.path.join(repo_root, ".git")):
        return False
    now = time.time()
    if now - ctx.get("last_pull", 0) < PULL_MIN_INTERVAL_SEC:
        return False
    ctx["last_pull"] = now

    # Stat once, up front, so a failing stat cannot escape from inside the
    # unknown-uid handler below.
    try:
        uid = os.stat(repo_root).st_uid
    except OSError:
        return False
    try:
        owner = pwd.getpwuid(uid).pw_name
    except KeyError:
        # The repo uid has no passwd entry (e.g. a clone carrying a foreign
        # uid) — there is no owner whose objects need protecting; run git as
        # ourselves rather than silently never pulling.
        logger.warning("repo %s owned by unknown uid %d — pulling as current user",
                       repo_root, uid)
        owner = None

    git = _bin("git", "/usr/bin/git")

    def as_owner(args: list[str], timeout: float = 60) -> tuple[int, str, str]:
        """Run `git -C repo_root <args>`, via sudo as the owner when we are root."""
        if owner is not None and owner != "root" and os.geteuid() == 0:
            return _run(["/usr/bin/sudo", "-u", owner, "-H", git, "-C", repo_root, *args], timeout)
        return _run([git, "-C", repo_root, *args], timeout)

    rc, before, _ = as_owner(["rev-parse", "HEAD"], 15)
    if rc != 0:
        return False
    err = ""
    for _attempt in range(3):  # untracked + tracked blockers can surface in separate aborts
        rc, _out, err = as_owner(["pull", "--ff-only", "--quiet"])
        if rc == 0 or not _heal_pull_blockers(as_owner, repo_root, err):
            break
    if rc != 0:
        logger.warning("git pull failed (keeping current code/config): %s", err.strip()[:200])
        return False
    rc, after, _ = as_owner(["rev-parse", "HEAD"], 15)
    if rc == 0 and after.strip() != before.strip():
        logger.info("repo updated %s → %s — restarting agent for new code",
                    before.strip()[:8], after.strip()[:8])
        return True
    return False
# ---------------------------------------------------------------------------
# Discover + render
# ---------------------------------------------------------------------------
def lan_hosts_with_mac(data: dict, self_name: str | None) -> list[tuple[str, str]]:
    """List (name, normalised MAC) for the hosts worth looking up on the LAN.

    A host qualifies when it declares a MAC, is not this node, and either has
    a `lan` entry or is of class "laptop".
    """
    selected: list[tuple[str, str]] = []
    for host in data.get("hosts", []):
        if not host.get("mac") or host.get("name") == self_name:
            continue
        if host.get("lan") is None and host.get("class") != "laptop":
            continue
        selected.append((host["name"], norm_mac(host["mac"])))
    return selected
def discover(cfg: Config, hosts: list[tuple[str, str]], ctx: dict) -> dict[str, str]:
    """Map host name -> current LAN IP for every declared MAC seen on the LAN.

    When a wanted MAC is missing from the neighbour table, the subnet is
    ping-swept (at most once per SWEEP_MIN_INTERVAL_SEC, tracked in
    ctx["last_sweep"]) and the table re-read. Addresses outside cfg.lan_cidr
    are ignored.
    """
    table = neighbors()
    missing = any(mac not in table for _, mac in hosts)
    if missing:
        now = time.time()
        if now - ctx.get("last_sweep", 0) >= SWEEP_MIN_INTERVAL_SEC:
            ctx["last_sweep"] = now
            ping_sweep(cfg.lan_cidr)
            table = neighbors()

    located: dict[str, str] = {}
    for name, mac in hosts:
        address = table.get(mac)
        if not address:
            continue
        if ipaddress.ip_address(address) in ipaddress.ip_network(cfg.lan_cidr, strict=False):
            located[name] = address
    return located
def render_views(repo_root: str, user: str | None) -> None:
    """Regenerate the derived views by running the repo's own bin/ scripts.

    Always runs `bin/mesh-hosts-render --install`. When `user` is given, also
    runs `bin/host-apply --ssh-apply` as that user — directly if this process
    already is that user, through sudo otherwise.
    """
    bin_dir = os.path.join(repo_root, "bin")
    _run([os.path.join(bin_dir, "mesh-hosts-render"), "--install"])
    if not user:
        return

    apply_cmd = [os.path.join(bin_dir, "host-apply"), "--ssh-apply"]
    try:
        running_as = pwd.getpwuid(os.geteuid()).pw_name
    except KeyError:
        running_as = None
    # already that user (e.g. root on a root-operated VPS — no sudo there)
    prefix = [] if user == running_as else ["/usr/bin/sudo", "-u", user, "-H"]
    _run(prefix + apply_cmd)
def ensure_tray(repo_root: str, user: str | None) -> None:
    """macOS laptop: keep the menu-bar tray up unless the user quit it.

    The tray is owned by a per-user LaunchAgent (com.wireguard.vpn-tray —
    RunAtLoad + KeepAlive, see tray/install-tray.sh), which starts it at login
    and relaunches it on crash. When that agent's plist is present this
    function does nothing; the direct spawn below is only a fallback for hosts
    where the agent hasn't been installed yet. Also a no-op off darwin,
    without a user, or when data/.tray-disabled exists.
    """
    if PLATFORM != "darwin" or not user:
        return
    disabled_marker = os.path.join(repo_root, "data", ".tray-disabled")
    agent_plist = f"/Users/{user}/Library/LaunchAgents/com.wireguard.vpn-tray.plist"
    # The plist means launchd owns the lifecycle (RunAtLoad + KeepAlive).
    if os.path.isfile(disabled_marker) or os.path.isfile(agent_plist):
        return

    launcher = os.path.join(repo_root, "tray", "vpn-tray")
    if not os.path.isfile(launcher):
        return
    already_running = _run(["/usr/bin/pgrep", "-f", "vpn_tray.py"], 2)[0] == 0
    if already_running:
        return

    try:
        subprocess.Popen(
            ["/usr/bin/sudo", "-u", user, "-H", launcher],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
    except OSError as exc:
        logger.warning("tray spawn failed: %s", exc)
    else:
        logger.info("started menu-bar tray as %s", user)
def sync_mesh_dns(repo_root: str, self_name: str | None) -> None:
    """Run bin/wg-dns-sync when this node is the declared mesh DNS host.

    Does nothing when `self_name` is empty, the data file cannot be read,
    mesh.dns_host names another node, or the script is absent.
    """
    if not self_name:
        return
    try:
        data = load_json(os.path.join(repo_root, "data", "mesh-hosts.json"))
    except (OSError, json.JSONDecodeError):
        return
    dns_host = data.get("mesh", {}).get("dns_host")
    if dns_host != self_name:
        return
    sync_script = os.path.join(repo_root, "bin", "wg-dns-sync")
    if os.path.isfile(sync_script):
        _run([sync_script])
def sync_names(repo_root: str, discovered: dict[str, str], user: str | None,
               self_name: str | None = None) -> bool:
    """Merge `discovered` into data/lan-state.json and re-render on change.

    Entries for hosts not seen this cycle are kept. When the merged mapping
    differs from the stored one it is written atomically (temp file +
    replace, mode 0644), then the views and the mesh DNS are refreshed.

    Returns:
        True when the state file was rewritten, False when nothing changed.
    """
    state_path = os.path.join(repo_root, "data", "lan-state.json")
    previous: dict[str, str] = {}
    if os.path.isfile(state_path):
        try:
            previous = load_json(state_path)
        except (json.JSONDecodeError, OSError):
            previous = {}  # unreadable state is treated as empty

    merged = dict(previous)
    merged.update(discovered)  # keep last-known for hosts not seen this cycle
    if merged == previous:
        return False

    scratch = state_path + ".tmp"
    with open(scratch, "w", encoding="utf-8") as handle:
        json.dump(merged, handle, indent=2, sort_keys=True)
    os.replace(scratch, state_path)
    os.chmod(state_path, 0o644)

    render_views(repo_root, user)
    sync_mesh_dns(repo_root, self_name)
    summary = ", ".join(f"{host}={addr}" for host, addr in sorted(merged.items()))
    logger.info("names synced → %s", summary)
    return True
def repo_head(repo_root: str) -> str | None:
    """Return the abbreviated commit id of HEAD in `repo_root`, or None on failure."""
    argv = [
        _bin("git", "/usr/bin/git"),
        "-c", f"safe.directory={repo_root}",
        "-C", repo_root,
        "rev-parse", "--short", "HEAD",
    ]
    status, text, _ = _run(argv, 10)
    short_id = text.strip()
    if status != 0 or not short_id:
        return None
    return short_id
def write_status(cfg: Config, ctx: dict) -> None:
    """Snapshot this agent's view to data/agent-status.json.

    The snapshot is the feed for the tray, `fleet-status`, and any future web
    dashboard. It is written atomically (temp file + replace, mode 0644); a
    failed write is logged, not raised.
    """
    data_dir = os.path.join(ctx["repo_root"], "data")

    known: dict = {}
    lan_state = os.path.join(data_dir, "lan-state.json")
    if os.path.isfile(lan_state):
        try:
            known = load_json(lan_state)
        except (json.JSONDecodeError, OSError):
            pass  # an unreadable state file is reported as empty

    snapshot = {
        "ts": int(time.time()),
        "self": ctx["self_name"],
        "hostname": socket.gethostname(),
        "platform": PLATFORM,
        "roles": sorted(ctx["roles"]),
        "location": ctx.get("location"),
        "lan_route_via": subnet_route_iface(cfg.lan_cidr),
        "head": repo_head(ctx["repo_root"]),
        "discovered": known,
    }

    destination = os.path.join(data_dir, "agent-status.json")
    try:
        scratch = destination + ".tmp"
        with open(scratch, "w", encoding="utf-8") as handle:
            json.dump(snapshot, handle, indent=2, sort_keys=True)
        os.replace(scratch, destination)
        os.chmod(destination, 0o644)
    except OSError as exc:
        logger.warning("status write failed: %s", exc)
# ---------------------------------------------------------------------------
# Reconcile — one cycle of the unified agent
# ---------------------------------------------------------------------------
def is_home(cfg: Config) -> tuple[bool, str | None, str | None]:
    """Return (home, gateway_ip, iface) for the current uplink.

    HOME only when the physical uplink's gateway has the configured address
    AND the neighbour table shows the configured gateway MAC for it. Without a
    physical uplink the result is AWAY with the default route's values.
    """
    uplink_gw, uplink_if = physical_gateway()
    if not uplink_gw:
        fallback_gw, fallback_if = default_route()
        return False, fallback_gw, fallback_if
    at_home = uplink_gw == cfg.gateway and neighbor_mac(uplink_gw) == cfg.gateway_mac
    return at_home, uplink_gw, uplink_if
def default_route_hijacked(cfg: Config) -> str | None:
    """Mesh iface name if the v4 default route sits on the tunnel, else None.

    Always None off darwin.
    """
    if PLATFORM != "darwin":
        return None
    _, default_if = default_route()
    mesh_if = iface_in_cidr(cfg.mesh_cidr)
    if default_if and mesh_if and default_if == mesh_if:
        return default_if
    return None
def preview_location(cfg: Config, roles: set[str] | frozenset[str] | None = None
                     ) -> tuple[bool, str | None, str | None, str | None]:
    """Location as it would be reported after the laptop's route-heal pass.

    Returns (home, gw, gwif, note). `note` is None unless the default route is
    on the mesh interface; in that case it explains the situation and the
    location is inferred from the physical uplink when one exists.
    """
    hijacked = default_route_hijacked(cfg)
    if not hijacked:
        home, gw, gwif = is_home(cfg)
        return home, gw, gwif, None

    uplink_gw, uplink_if = physical_gateway()
    if not uplink_gw:
        return False, None, hijacked, f"default hijacked by {hijacked} — no physical uplink found"

    home = uplink_gw == cfg.gateway and neighbor_mac(uplink_gw) == cfg.gateway_mac
    where = "HOME" if home else "AWAY"
    if roles and "route" in roles:
        # This node runs the heal pass itself.
        detail = f"daemon heals then reports {where} via {uplink_if}"
    else:
        detail = f"physical uplink {where} via {uplink_if}"
    return home, uplink_gw, uplink_if, f"default hijacked by {hijacked} — {detail}"
def reconcile(cfg: Config, data: dict, ctx: dict) -> bool:
    """Run one cycle of the agent.

    Steps, in order: pull, hostname convergence, v4 path self-heal, location
    detection, LAN subnet route switch, discover + render, first-cycle render,
    tray. Which steps run depends on ctx["roles"].

    Args:
        cfg: current routing configuration.
        data: parsed mesh-hosts.json (not read by this function itself).
        ctx: mutable agent state; `location`, `last_state`, `rendered_once`
            (and, via callees, `last_pull` / `last_sweep`) are updated.

    Returns:
        True if the agent should exit for restart (the pull moved HEAD).
    """
    # 1. pull (rate-limited; every node)
    if git_pull(ctx["repo_root"], ctx):
        render_views(ctx["repo_root"], ctx["render_user"])
        return True

    # 1b. hostname convergence (fleet-gated, idempotent)
    if "hostname" in ctx["roles"] and ctx["self_name"]:
        enforce_hostname(ctx["self_name"])
    roles = ctx["roles"]

    # 1c. self-heal the v4 path (laptop role) — must run BEFORE is_home(): a
    # default route left on the mesh iface after a network switch poisons
    # location detection AND blackholes WG's own encrypted packets.
    if "route" in roles:
        heal_default_route(cfg)
        pin_endpoint_route(cfg)
    home, gw, gwif = is_home(cfg)
    ctx["location"] = "HOME" if home else "AWAY"

    # 2. route switch (laptop role only) — defer mid-join when no uplink yet
    if "route" in roles:
        if not home and physical_gateway()[0] is None:
            waiting = "AWAY (no uplink — waiting)"
            if ctx["last_state"] != waiting:
                logger.info("no physical uplink — deferring route switch")
            ctx["last_state"] = waiting
        else:
            desired = gwif if home else iface_in_cidr(cfg.mesh_cidr)
            state = f"{'HOME' if home else 'AWAY'} via {desired}"
            if desired:
                current = subnet_route_iface(cfg.lan_cidr)
                if current != desired:
                    if set_subnet_route(cfg.lan_cidr, desired):
                        logger.info("%s → routing %s via %s (was %s)",
                                    state, cfg.lan_cidr, desired, current)
                    else:
                        state += " UNCONVERGED"
                elif ctx["last_state"] != state:
                    # Logged only on a state change. The separator keeps the
                    # state text and the CIDR from running together.
                    logger.info("%s — %s already via %s", state, cfg.lan_cidr, desired)
            elif ctx["last_state"] != state:
                logger.warning("away and no wg interface up — leaving %s untouched", cfg.lan_cidr)
            ctx["last_state"] = state

    # 3. discover + render (any node that can see the home LAN right now).
    # A node also discovers ITSELF from its own interfaces — ARP only sees
    # peers, and without this a host's own vhosts render at the stale declared
    # seed (bit apricot: its /etc/hosts said .116 while it sat on .118).
    lan_net = ipaddress.ip_network(cfg.lan_cidr, strict=False)
    my_lan_ip = next((ip for ip in local_ipv4s()
                      if ipaddress.ip_address(ip) in lan_net), None)
    lan_visible = home or my_lan_ip is not None
    if "discover" in roles and lan_visible:
        found = discover(cfg, ctx["lan_hosts"], ctx) if ctx["lan_hosts"] else {}
        if ctx["self_name"] and my_lan_ip:
            found[ctx["self_name"]] = my_lan_ip
        if found:
            sync_names(ctx["repo_root"], found, ctx["render_user"], ctx["self_name"])

    # 4. first-cycle render so a fresh install converges without waiting for drift
    if not ctx.get("rendered_once"):
        ctx["rendered_once"] = True
        render_views(ctx["repo_root"], ctx["render_user"])

    # 5. menu-bar tray (fennel only) — child of this agent, not a second service
    if "route" in roles:
        ensure_tray(ctx["repo_root"], ctx["render_user"])
    return False
def build_ctx(data_file: str) -> dict:
    """Assemble the mutable per-process state dict used by reconcile().

    Identity and roles are derived from this node's own entry in the data
    file: every node gets "pull" and "render"; "discover" needs a `lan` entry
    or class "laptop"; "route" needs class "laptop" on darwin; "hostname" is
    added when fleet.enforce_hostname is set.
    """
    data = load_json(data_file)
    # data_file is <repo>/data/mesh-hosts.json, so the repo is two levels up.
    repo_root = os.path.dirname(os.path.dirname(data_file))
    me = identify_self(data)

    self_name = None
    self_lan = None
    roles = {"pull", "render"}
    if me:
        self_name = me.get("name")
        self_lan = me.get("lan")
        is_laptop = me.get("class") == "laptop"
        if self_lan is not None or is_laptop:
            roles.add("discover")
        if is_laptop and PLATFORM == "darwin":
            roles.add("route")
    if data.get("fleet", {}).get("enforce_hostname"):
        roles.add("hostname")

    ssh_owner = render_user(repo_root, me)
    peers = lan_hosts_with_mac(data, self_name)
    return {
        "repo_root": repo_root,
        "self_name": self_name,
        "self_lan": self_lan,
        "roles": roles,
        "render_user": ssh_owner,
        "lan_hosts": peers,
        "last_state": None,
        "last_sweep": 0.0,
        "last_pull": 0.0,
    }
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: run the agent loop, one cycle, or a status report.

    Args:
        argv: argument list; None lets argparse read sys.argv.

    Returns:
        Process exit status: 0 on normal exit (including the restart-after-pull
        exit), 1 on a configuration error or when not root outside --status.
    """
    ap = argparse.ArgumentParser(description="net-tools fleet agent (source of truth: data/mesh-hosts.json)")
    ap.add_argument("--once", action="store_true", help="run one cycle and exit")
    ap.add_argument("--interval", type=int, default=PROBE_INTERVAL_SEC)
    ap.add_argument("--status", action="store_true", help="inspect, change nothing (no root needed)")
    args = ap.parse_args(argv)
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")
    try:
        data_file = find_data_file()
        cfg = load_config(data_file)
        ctx = build_ctx(data_file)
    except (FileNotFoundError, ValueError, KeyError) as e:
        logger.error("config: %s", e)
        return 1

    if args.status:
        home, gw, gwif, note = preview_location(cfg, ctx["roles"])
        print(f"platform : {PLATFORM}")
        print(f"self : {ctx['self_name'] or 'UNKNOWN (not in mesh-hosts.json!)'}"
              f" roles: {', '.join(sorted(ctx['roles']))}")
        print(f"location : {'HOME' if home else 'AWAY'} (gw {gw} on {gwif})")
        if note:
            print(f"route : {note}")
        print(f"{cfg.lan_cidr} via: {subnet_route_iface(cfg.lan_cidr)} wg iface: {iface_in_cidr(cfg.mesh_cidr)}")
        print(f"render user: {ctx['render_user']}")
        sp = os.path.join(ctx["repo_root"], "data", "lan-state.json")
        if os.path.isfile(sp):
            # A corrupt or unreadable state file is reported, not raised.
            try:
                discovered = load_json(sp)
            except (OSError, json.JSONDecodeError) as exc:
                discovered = f"(unreadable: {exc})"
        else:
            discovered = "(none yet)"
        print(f"discovered: {discovered}")
        return 0

    if os.geteuid() != 0:
        logger.error("must run as root (route + /etc/hosts changes); use --status to inspect")
        return 1
    logger.info("started — platform=%s self=%s roles=%s interval=%ds",
                PLATFORM, ctx["self_name"], ",".join(sorted(ctx["roles"])), args.interval)

    # One-element list so the signal handler can flip the flag in place.
    stop = [False]

    def _sig(signum, _f):
        """Request a clean stop; the loop exits once the running cycle ends."""
        logger.info("signal %d — exiting after cycle", signum)
        stop[0] = True

    signal.signal(signal.SIGTERM, _sig)
    signal.signal(signal.SIGINT, _sig)

    last_cfg, last_data = cfg, load_json(data_file)
    while True:
        # Re-read the source of truth each cycle; on any failure keep running
        # with the last values that loaded.
        try:
            last_cfg = load_config(data_file)
            last_data = load_json(data_file)
            ctx["lan_hosts"] = lan_hosts_with_mac(last_data, ctx["self_name"])
        except Exception:
            logger.exception("config reload failed — using last-good")
        try:
            if reconcile(last_cfg, last_data, ctx):
                write_status(last_cfg, ctx)
                return 0  # HEAD moved: exit; launchd/systemd restart with new code
        except Exception:
            logger.exception("reconcile failed")
        try:
            write_status(last_cfg, ctx)
        except Exception:
            logger.exception("status write failed")
        if args.once or stop[0]:
            return 0
        # Sleep in one-second steps so a stop signal is noticed within a second.
        slept = 0
        while slept < args.interval and not stop[0]:
            time.sleep(1)
            slept += 1
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())