# imajin/scripts/run/setup_gpu_command.py

"""GPU/cuDNN setup command handler for script runner."""

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
from typing import NamedTuple


class GPUInfo(NamedTuple):
    """GPU detection result.

    Produced by ``detect_gpu`` from the first line of ``nvidia-smi`` output.
    Every field other than ``available`` may be None when the value could not
    be determined; all of them are None when no GPU was detected.
    """
    # True when nvidia-smi ran successfully and printed at least one GPU line.
    available: bool
    # Second CSV field of the nvidia-smi query (driver_version).
    driver_version: str | None
    # First CSV field of the nvidia-smi query (name).
    gpu_name: str | None
    # Third CSV field of the nvidia-smi query (memory.total, queried with nounits).
    memory_total: int | None  # MB


class CUDAInfo(NamedTuple):
    """CUDA toolkit detection result.

    Produced by ``detect_cuda``. ``version`` and ``path`` are None when no
    toolkit was found.
    """
    # True when a version file was found and parsed under a known CUDA path.
    available: bool
    # Version string read from version.json ("cuda" -> "version") or from the
    # "CUDA Version X.Y" line of version.txt.
    version: str | None
    # Directory in which the version file was found.
    path: Path | None


class CuDNNInfo(NamedTuple):
    """cuDNN detection result.

    Produced by ``detect_cudnn``; combines a system-level check with probes
    of the ``torch`` and ``onnxruntime`` packages visible to ``python3``.
    """
    # True when "libcudnn" appears in the output of ``ldconfig -p``.
    system_available: bool
    # Digits captured from a "libcudnn.so.<N>" entry in the ldconfig output,
    # or None when no such entry matched.
    system_version: str | None
    # Integer printed by ``torch.backends.cudnn.version()``; None when torch
    # is missing or reports cuDNN as unavailable.
    pytorch_version: int | None
    # True when onnxruntime lists 'CUDAExecutionProvider' as available.
    onnx_available: bool


# Dependency names that mark a service as GPU-dependent. find_gpu_services
# looks for each entry as a plain substring of the service's pyproject.toml
# text, so an entry also matches longer names that contain it.
GPU_INDICATORS = [
    "torch",
    "onnxruntime-gpu",
    "insightface",
    "diffusers",
    "transformers",
    "image-reward",
]


def detect_gpu() -> GPUInfo:
    """Detect an NVIDIA GPU by querying ``nvidia-smi``.

    Only the first GPU line printed by ``nvidia-smi`` is inspected.

    Returns:
        ``GPUInfo(available=True, ...)`` when ``nvidia-smi`` ran successfully
        and printed at least one non-empty line. ``memory_total`` is None when
        the reported value is not an integer (for example ``[N/A]``).
        Otherwise a ``GPUInfo`` with ``available=False`` and every other
        field set to None.
    """
    not_found = GPUInfo(available=False, driver_version=None, gpu_name=None, memory_total=None)

    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader,nounits"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # Missing binary, permission problem, or a non-zero exit all mean
        # "no usable GPU" for the purposes of this best-effort probe.
        return not_found

    lines = result.stdout.strip().split("\n")
    if not lines or not lines[0]:
        return not_found

    parts = [p.strip() for p in lines[0].split(",")]

    memory_total = None
    if len(parts) > 2:
        try:
            memory_total = int(parts[2])
        except ValueError:
            # nvidia-smi can print placeholders instead of a number; the GPU
            # is still present, only the memory size is unknown.
            memory_total = None

    return GPUInfo(
        available=True,
        gpu_name=parts[0] if len(parts) > 0 else None,
        driver_version=parts[1] if len(parts) > 1 else None,
        memory_total=memory_total,
    )


def detect_cuda() -> CUDAInfo:
    """Detect a CUDA toolkit installation from its version file.

    Looks in ``/usr/local/cuda`` and ``/opt/cuda`` (in that order). In each
    directory ``version.json`` is tried first, then ``version.txt``.
    Unreadable or malformed files are skipped rather than raising.

    Returns:
        ``CUDAInfo(available=True, version=..., path=...)`` for the first
        directory whose version file yields a version, otherwise
        ``CUDAInfo(available=False, version=None, path=None)``.
    """
    cuda_paths = [
        Path("/usr/local/cuda"),
        Path("/opt/cuda"),
    ]

    for cuda_path in cuda_paths:
        # Newer toolkits: {"cuda": {"version": "..."}} in version.json.
        try:
            with open(cuda_path / "version.json", encoding="utf-8") as f:
                data = json.load(f)
            version = data["cuda"]["version"]
        except (OSError, ValueError, KeyError, TypeError):
            # OSError: missing/unreadable file. ValueError: invalid JSON or
            # undecodable bytes. KeyError/TypeError: unexpected JSON shape.
            version = None
        if isinstance(version, str) and version:
            return CUDAInfo(available=True, version=version, path=cuda_path)

        # Older toolkits: a "CUDA Version X.Y" line in version.txt.
        try:
            content = (cuda_path / "version.txt").read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        match = re.search(r"CUDA Version (\d+\.\d+)", content)
        if match:
            return CUDAInfo(available=True, version=match.group(1), path=cuda_path)

    return CUDAInfo(available=False, version=None, path=None)


def detect_cudnn() -> CuDNNInfo:
    """Probe for cuDNN system-wide and inside Python packages.

    Three independent best-effort checks are made; a failing check simply
    leaves its field at the "not found" value:

    * ``ldconfig -p`` is searched for ``libcudnn``.
    * ``python3`` is asked for the cuDNN version bundled with ``torch``.
    * ``python3`` is asked whether ``onnxruntime`` offers the CUDA provider.

    Returns:
        A ``CuDNNInfo`` summarising the three checks.
    """
    torch_probe = "import torch; print(torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'None')"
    ort_probe = "import onnxruntime as ort; providers = ort.get_available_providers(); print('CUDAExecutionProvider' in providers)"

    # System-level cuDNN: look at the dynamic linker cache.
    try:
        linker_cache = subprocess.run(
            ["ldconfig", "-p"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        linker_cache = ""

    has_system_lib = "libcudnn" in linker_cache
    system_so_version = None
    if has_system_lib:
        # The version is taken from the shared-object suffix, if present.
        found = re.search(r"libcudnn\.so\.(\d+)", linker_cache)
        if found:
            system_so_version = found.group(1)

    # cuDNN bundled with PyTorch.
    torch_cudnn = None
    try:
        reported = subprocess.run(
            ["python3", "-c", torch_probe],
            capture_output=True,
            text=True,
            check=True,
            timeout=30,
        ).stdout.strip()
        if reported not in ("", "None"):
            torch_cudnn = int(reported)
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired, ValueError):
        pass

    # CUDA execution provider in onnxruntime.
    try:
        answer = subprocess.run(
            ["python3", "-c", ort_probe],
            capture_output=True,
            text=True,
            check=True,
            timeout=30,
        ).stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
        ort_has_cuda = False
    else:
        ort_has_cuda = answer == "True"

    return CuDNNInfo(
        system_available=has_system_lib,
        system_version=system_so_version,
        pytorch_version=torch_cudnn,
        onnx_available=ort_has_cuda,
    )


def find_gpu_services(workspace_root: Path) -> list[tuple[str, Path, list[str]]]:
    """Find services with GPU dependencies.

    Each sub-directory of ``<workspace_root>/services`` is inspected. Its
    ``pyproject.toml`` (or, failing that, ``service/pyproject.toml``) is read
    and searched for the entries of ``GPU_INDICATORS``.

    NOTE: indicators are matched as plain substrings of the file text, so an
    indicator also matches longer names that contain it.

    Args:
        workspace_root: Directory that contains the ``services`` folder.

    Returns:
        List of ``(service_name, service_path, gpu_deps)`` sorted by service
        directory, where ``service_path`` is the directory holding the
        matching ``pyproject.toml`` (where the venv is expected). Empty when
        ``services`` is missing or is not a directory. Files that cannot be
        read or decoded as UTF-8 are skipped.
    """
    services_dir = workspace_root / "services"
    if not services_dir.is_dir():
        return []

    gpu_services = []

    # Sorted so that output and install order do not depend on the filesystem.
    for service_dir in sorted(services_dir.iterdir()):
        if not service_dir.is_dir():
            continue

        # Check both direct pyproject.toml and service/pyproject.toml; the
        # second element is the directory the venv is expected to live in.
        candidates = [
            (service_dir / "pyproject.toml", service_dir),
            (service_dir / "service" / "pyproject.toml", service_dir / "service"),
        ]

        for pyproject_path, svc_path in candidates:
            try:
                content = pyproject_path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # Missing, unreadable, or not valid UTF-8: try the next candidate.
                continue

            gpu_deps = [ind for ind in GPU_INDICATORS if ind in content]
            if gpu_deps:
                gpu_services.append((service_dir.name, svc_path, gpu_deps))
                break  # Don't check both paths for same service

    return gpu_services


def check_command(args, workspace_root: Path) -> int:
    """Print a GPU/CUDA/cuDNN diagnostic report.

    Args:
        args: Accepted for a uniform subcommand signature; not used here.
        workspace_root: Root directory that is searched for GPU services.

    Returns:
        0 when a GPU, the CUDA toolkit, and cuDNN (system-level or bundled
        with PyTorch) were all detected; 1 otherwise.
    """
    banner = "=" * 60
    rule = "-" * 40

    def section(title: str) -> None:
        # Every report section starts with its title and a thin rule.
        print(title)
        print(rule)

    print("GPU/CUDA/cuDNN Status Check")
    print(banner)
    print()

    # --- GPU ---------------------------------------------------------------
    section("NVIDIA GPU")
    gpu = detect_gpu()
    if not gpu.available:
        print("  ✗ No NVIDIA GPU detected")
        print("    Run 'nvidia-smi' to diagnose")
    else:
        print(f"  ✓ GPU: {gpu.gpu_name}")
        print(f"  ✓ Driver: {gpu.driver_version}")
        print(f"  ✓ Memory: {gpu.memory_total} MB")
    print()

    # --- CUDA --------------------------------------------------------------
    section("CUDA Toolkit")
    cuda = detect_cuda()
    if not cuda.available:
        print("  ✗ CUDA toolkit not found")
        print("    Expected at /usr/local/cuda/")
    else:
        print(f"  ✓ Version: {cuda.version}")
        print(f"  ✓ Path: {cuda.path}")
    print()

    # --- cuDNN -------------------------------------------------------------
    section("cuDNN")
    cudnn = detect_cudnn()
    print(
        f"  ✓ System cuDNN: version {cudnn.system_version or 'unknown'}"
        if cudnn.system_available
        else "  ✗ System cuDNN: not installed"
    )
    print(
        f"  ✓ PyTorch cuDNN: {cudnn.pytorch_version}"
        if cudnn.pytorch_version
        else "  ○ PyTorch cuDNN: not detected (torch not installed or no CUDA)"
    )
    print(
        "  ✓ ONNX Runtime: CUDA provider available"
        if cudnn.onnx_available
        else "  ○ ONNX Runtime: CUDA provider not available"
    )
    print()

    # --- Services ----------------------------------------------------------
    section("GPU Services Detected")
    services = find_gpu_services(workspace_root)
    if not services:
        print("  No GPU services found in services/")
    for svc_name, svc_path, svc_deps in services:
        venv_state = "✓ exists" if (svc_path / ".venv").exists() else "✗ missing"
        print(f"  • {svc_name}")
        print(f"    Path: {svc_path.relative_to(workspace_root)}")
        print(f"    GPU deps: {', '.join(svc_deps)}")
        print(f"    Venv: {venv_state}")
    print()

    # --- Verdict -----------------------------------------------------------
    print(banner)
    if not gpu.available:
        print("✗ No GPU available")
        return 1
    if not cuda.available:
        print("⚠ GPU available but CUDA toolkit missing")
        return 1
    if cudnn.system_available or cudnn.pytorch_version:
        print("✓ GPU stack ready - cuDNN available")
        return 0
    print("⚠ GPU/CUDA ready, but cuDNN not detected")
    print("  Run: ./run setup-gpu install")
    return 1


def install_command(args, workspace_root: Path) -> int:
    """Install PyTorch+CUDA and onnxruntime-gpu in service venvs.

    The PyTorch wheel index is chosen from the detected CUDA major version.
    For every targeted GPU service the ``pip`` of its ``.venv`` is invoked
    directly (no shell), so service paths containing spaces or shell
    metacharacters are handled safely.

    Args:
        args: Arguments following ``install``: ``--service NAME``,
            ``--dry-run`` and ``-v/--verbose``.
        workspace_root: Root directory that is searched for GPU services.

    Returns:
        0 when every targeted service succeeded or there was nothing to
        install; 1 when CUDA was not detected, the requested service was not
        found, or at least one service failed.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu install",
        description="Install GPU dependencies in service virtualenvs",
    )
    parser.add_argument(
        "--service",
        help="Target specific service (default: all GPU services)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be installed without installing",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output",
    )
    parsed = parser.parse_args(args)

    # Detect CUDA version for wheel selection
    cuda = detect_cuda()
    if not cuda.available:
        print("✗ CUDA toolkit not detected. Install CUDA first.")
        return 1

    # Determine PyTorch CUDA wheel. CUDA 13 is very new, so it uses the same
    # cu124 wheels as CUDA 12; anything older falls back to cu118.
    cuda_major = int(cuda.version.split(".")[0]) if cuda.version else 12
    pytorch_cuda = "cu124" if cuda_major >= 12 else "cu118"

    pytorch_index = f"https://download.pytorch.org/whl/{pytorch_cuda}"

    print(f"PyTorch CUDA wheel: {pytorch_cuda} (system CUDA: {cuda.version})")
    print(f"PyTorch index: {pytorch_index}")
    print()

    # Find target services
    gpu_services = find_gpu_services(workspace_root)
    if parsed.service:
        gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service]
        if not gpu_services:
            print(f"✗ Service '{parsed.service}' not found or has no GPU deps")
            return 1

    if not gpu_services:
        print("No GPU services found to install")
        return 0

    print(f"Installing GPU dependencies in {len(gpu_services)} service(s)")
    print("=" * 60)

    failed = []
    succeeded = []

    for name, svc_path, deps in gpu_services:
        print(f"\n▶ {name}")
        print(f"  Path: {svc_path}")
        print(f"  GPU deps: {', '.join(deps)}")

        venv_path = svc_path / ".venv"
        if not venv_path.exists():
            print(f"  ✗ No virtualenv at {venv_path}")
            print("    Run './run install' first to create venvs")
            failed.append(name)
            continue

        pip_path = str(venv_path / "bin" / "pip")

        # Determine what to install
        install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"])
        install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps

        # Each entry is (argv list, human-readable description).
        commands = []

        if install_pytorch:
            commands.append((
                [pip_path, "install", "torch", "torchvision", "--index-url", pytorch_index],
                "PyTorch+CUDA",
            ))

        if install_onnx:
            commands.append((
                [pip_path, "install", "onnxruntime-gpu"],
                "onnxruntime-gpu",
            ))

        if parsed.dry_run:
            print("  [DRY RUN] Would install:")
            for _cmd, desc in commands:
                print(f"    • {desc}")
            succeeded.append(name)
            continue

        success = True
        for cmd, desc in commands:
            print(f"  Installing {desc}...")
            try:
                result = subprocess.run(
                    cmd,
                    cwd=svc_path,
                    capture_output=not parsed.verbose,
                )
            except OSError as exc:
                # e.g. the venv exists but has no bin/pip executable.
                print(f"  ✗ Failed to install {desc}")
                print(f"    {exc}")
                success = False
                break
            if result.returncode != 0:
                print(f"  ✗ Failed to install {desc}")
                if not parsed.verbose and result.stderr:
                    # pip output is not guaranteed to be valid UTF-8.
                    print(f"    {result.stderr.decode(errors='replace')[:200]}")
                success = False
                break

        if success:
            print(f"  ✓ {name} GPU dependencies installed")
            succeeded.append(name)
        else:
            failed.append(name)

    # Summary
    print()
    print("=" * 60)
    print(f"Installed: {len(succeeded)}/{len(gpu_services)}")

    if failed:
        print(f"\nFailed: {', '.join(failed)}")
        return 1

    print("\n✓ All GPU dependencies installed")
    print("Run './run setup-gpu verify' to test GPU acceleration")
    return 0


def verify_command(args, workspace_root: Path) -> int:
    """Run GPU verification tests."""
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu verify",
        description="Verify GPU acceleration works in service venvs",
    )
    parser.add_argument(
        "--service",
        help="Target specific service (default: all GPU services)",
    )
    parsed = parser.parse_args(args)

    gpu_services = find_gpu_services(workspace_root)
    if parsed.service:
        gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service]

    if not gpu_services:
        print("No GPU services found to verify")
        return 0

    print("GPU Verification Tests")
    print("=" * 60)

    results = []

    for name, svc_path, deps in gpu_services:
        print(f"\n▶ {name}")

        venv_path = svc_path / ".venv"
        if not venv_path.exists():
            print("  ✗ No virtualenv")
            results.append((name, False, "no venv"))
            continue

        python_path = venv_path / "bin" / "python"

        # Test PyTorch CUDA
        install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"])
        if install_pytorch:
            result = subprocess.run(
                [str(python_path), "-c",
                 "import torch; "
                 "cuda = torch.cuda.is_available(); "
                 "cudnn = torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None; "
                 "print(f'CUDA:{cuda},cuDNN:{cudnn}')"],
                capture_output=True,
                text=True,
                timeout=60,
            )
            if result.returncode == 0:
                output = result.stdout.strip()
                if "CUDA:True" in output:
                    print(f"  ✓ PyTorch: {output}")
                else:
                    print(f"  ✗ PyTorch CUDA not available: {output}")
                    results.append((name, False, "PyTorch CUDA unavailable"))
                    continue
            else:
                print(f"  ✗ PyTorch test failed: {result.stderr[:100]}")
                results.append((name, False, "PyTorch test failed"))
                continue

        # Test ONNX Runtime
        install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps
        if install_onnx:
            result = subprocess.run(
                [str(python_path), "-c",
                 "import onnxruntime as ort; "
                 "providers = ort.get_available_providers(); "
                 "cuda = 'CUDAExecutionProvider' in providers; "
                 "print(f'CUDA:{cuda},Providers:{providers}')"],
                capture_output=True,
                text=True,
                timeout=60,
            )
            if result.returncode == 0:
                output = result.stdout.strip()
                if "CUDA:True" in output:
                    print(f"  ✓ ONNX Runtime: CUDA provider available")
                else:
                    print(f"  ✗ ONNX Runtime CUDA not available")
                    results.append((name, False, "ONNX CUDA unavailable"))
                    continue
            else:
                print(f"  ✗ ONNX test failed: {result.stderr[:100]}")
                results.append((name, False, "ONNX test failed"))
                continue

        results.append((name, True, "OK"))

    # Summary
    print()
    print("=" * 60)
    passed = sum(1 for _, ok, _ in results if ok)
    print(f"Verified: {passed}/{len(results)}")

    failed = [(n, msg) for n, ok, msg in results if not ok]
    if failed:
        print("\nFailed:")
        for name, msg in failed:
            print(f"  ✗ {name}: {msg}")
        return 1

    print("\n✓ All GPU services verified")
    return 0


def system_command(args, workspace_root: Path) -> int:
    """Print instructions for, or perform, a system-level cuDNN install.

    The package set is picked from the detected CUDA major version. Without
    ``--install`` only the recommended ``rpm-ostree`` command is shown; with
    it, ``sudo rpm-ostree install`` is executed.

    Args:
        args: Arguments following ``system``: ``--install``.
        workspace_root: Accepted for a uniform subcommand signature; not
            used here.

    Returns:
        1 when CUDA is not detected; 0 after showing instructions or after a
        successful install; otherwise the exit status of ``rpm-ostree``.
    """
    cli = argparse.ArgumentParser(
        prog="./run setup-gpu system",
        description="Install system-level cuDNN via rpm-ostree",
    )
    cli.add_argument(
        "--install",
        action="store_true",
        help="Actually install (default: show instructions)",
    )
    options = cli.parse_args(args)

    cuda = detect_cuda()
    if not cuda.available:
        print("✗ CUDA toolkit not detected")
        return 1

    major = 13 if not cuda.version else int(cuda.version.split(".")[0])

    # Determine package names
    packages = (
        ["cudnn9-cuda-13-0", "libcudnn9-cuda-13", "libcudnn9-devel-cuda-13"]
        if major >= 13
        else ["cudnn9-cuda-12", "libcudnn9-cuda-12", "libcudnn9-devel-cuda-12"]
    )
    package_list = " ".join(packages)

    if options.install:
        # Install via rpm-ostree
        print(f"Installing system cuDNN for CUDA {cuda.version}...")
        print(f"Packages: {package_list}")
        print()

        outcome = subprocess.run(["sudo", "rpm-ostree", "install", *packages])

        print()
        if outcome.returncode != 0:
            print("✗ rpm-ostree install failed")
            return outcome.returncode
        print("✓ cuDNN packages staged for installation")
        print("  Run 'systemctl reboot' to apply changes")
        return 0

    # Instructions only; an empty entry produces a blank line.
    instructions = [
        "System-level cuDNN Installation",
        "=" * 60,
        "",
        "NOTE: Modern PyTorch and onnxruntime-gpu wheels bundle cuDNN.",
        "System-level installation is optional but can help with compatibility.",
        "",
        f"Detected CUDA: {cuda.version}",
        f"Recommended packages: {package_list}",
        "",
        "For Bluefin LTS / rpm-ostree systems:",
        "",
        f"  sudo rpm-ostree install {package_list}",
        "  systemctl reboot  # Required for rpm-ostree changes",
        "",
        "Or run with --install flag:",
        "  ./run setup-gpu system --install",
        "",
    ]
    for text in instructions:
        print(text)
    return 0


def setup_gpu_command(args, workspace_root: Path) -> int:
    """Main entry point for the ``setup-gpu`` command.

    Dispatches to the ``check``, ``install``, ``verify`` or ``system``
    handler based on the first argument.

    Args:
        args: Arguments following ``setup-gpu``. With no arguments, or when
            the first argument is an option other than ``-h``/``--help``,
            the ``check`` subcommand runs.
        workspace_root: Workspace root passed through to the handler.

    Returns:
        0 after printing help for ``-h``/``--help``; 1 for an unknown
        subcommand; otherwise the selected handler's return code.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu",
        description="GPU/CUDA/cuDNN setup and diagnostics",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Subcommands:
  check     Diagnose GPU/CUDA/cuDNN status (default)
  install   Install PyTorch+CUDA and onnxruntime-gpu in service venvs
  verify    Run GPU verification tests
  system    Show/install system-level cuDNN via rpm-ostree

Examples:
  ./run setup-gpu                    # Check GPU status
  ./run setup-gpu check              # Same as above
  ./run setup-gpu install            # Install GPU deps in all services
  ./run setup-gpu install --service imajin-diffusion  # Single service
  ./run setup-gpu verify             # Test GPU acceleration
  ./run setup-gpu system             # Show system cuDNN instructions
  ./run setup-gpu system --install   # Install system cuDNN
        """,
    )

    # Top-level help must be handled before the "leading option means check"
    # shortcut below, otherwise -h/--help would silently run the diagnostics.
    if args and args[0] in ("-h", "--help"):
        parser.print_help()
        return 0

    subcommands = {
        "check": check_command,
        "install": install_command,
        "verify": verify_command,
        "system": system_command,
    }

    # Default to check if no subcommand
    if not args or args[0].startswith("-"):
        return check_command(args, workspace_root)

    subcommand = args[0]
    handler = subcommands.get(subcommand)
    if handler is None:
        print(f"✗ Unknown subcommand: {subcommand}")
        print()
        parser.print_help()
        return 1

    return handler(args[1:], workspace_root)


def register_setup_gpu_command(runner):
    """Hook the ``setup-gpu`` command into the given script runner.

    Calls ``runner.register_command`` with the command name, the
    ``setup_gpu_command`` handler, and a short description.
    """
    command_name = "setup-gpu"
    summary = "GPU/CUDA/cuDNN setup and diagnostics"
    runner.register_command(command_name, setup_gpu_command, summary)