# imajin/scripts/run/setup_gpu_command.py

"""GPU/cuDNN setup command handler for script runner."""
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
from typing import NamedTuple
class GPUInfo(NamedTuple):
"""GPU detection result."""
available: bool
driver_version: str | None
gpu_name: str | None
memory_total: int | None # MB
class CUDAInfo(NamedTuple):
"""CUDA toolkit detection result."""
available: bool
version: str | None
path: Path | None
class CuDNNInfo(NamedTuple):
"""cuDNN detection result."""
system_available: bool
system_version: str | None
pytorch_version: int | None
onnx_available: bool
# Package names that mark a service as GPU-dependent when they appear in
# its pyproject.toml. The order here is the order in which matches are
# reported.
GPU_INDICATORS = [
    "torch",
    "onnxruntime-gpu",
    "insightface",
    "diffusers",
    "transformers",
    "image-reward",
]
def detect_gpu() -> GPUInfo:
    """Detect an NVIDIA GPU by querying ``nvidia-smi``.

    Only the first line of the query output (the first GPU listed) is used.

    Returns:
        A ``GPUInfo`` with ``available=True`` and whichever fields could be
        parsed, or a ``GPUInfo`` with ``available=False`` and all other
        fields ``None`` when ``nvidia-smi`` is missing, cannot be executed,
        exits non-zero, times out, or prints nothing.
    """
    not_found = GPUInfo(available=False, driver_version=None, gpu_name=None, memory_total=None)

    try:
        result = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=name,driver_version,memory.total",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            check=True,
            # Bound the wait so a wedged driver cannot hang the command.
            timeout=30,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        return not_found

    first_line = result.stdout.strip().split("\n")[0]
    if not first_line:
        return not_found

    parts = [p.strip() for p in first_line.split(",")]

    memory_total = None
    if len(parts) > 2:
        try:
            memory_total = int(parts[2])
        except ValueError:
            # The memory column is not always numeric (a placeholder such as
            # "[N/A]" may be printed); report the GPU without a memory size
            # instead of crashing.
            memory_total = None

    return GPUInfo(
        available=True,
        gpu_name=parts[0] or None,
        driver_version=(parts[1] or None) if len(parts) > 1 else None,
        memory_total=memory_total,
    )
def detect_cuda() -> CUDAInfo:
    """Detect a CUDA toolkit installation in the standard locations.

    Checks ``/usr/local/cuda`` and then ``/opt/cuda``. In each directory the
    version is read from ``version.json`` (key ``cuda.version``) and, failing
    that, from ``version.txt`` (a ``CUDA Version X.Y`` line).

    Returns:
        ``CUDAInfo(available=True, ...)`` for the first directory that yields
        a version, otherwise ``CUDAInfo(available=False, version=None,
        path=None)``. Unreadable or malformed version files are skipped.
    """
    cuda_paths = [
        Path("/usr/local/cuda"),
        Path("/opt/cuda"),
    ]

    for cuda_path in cuda_paths:
        version_json = cuda_path / "version.json"
        if version_json.exists():
            try:
                data = json.loads(version_json.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both invalid JSON and undecodable bytes.
                data = None
            # Guard each level: the file may hold valid JSON of an
            # unexpected shape.
            cuda_entry = data.get("cuda") if isinstance(data, dict) else None
            version = cuda_entry.get("version") if isinstance(cuda_entry, dict) else None
            if isinstance(version, str) and version:
                return CUDAInfo(available=True, version=version, path=cuda_path)

        version_txt = cuda_path / "version.txt"
        if version_txt.exists():
            try:
                content = version_txt.read_text()
            except (OSError, UnicodeDecodeError):
                continue
            match = re.search(r"CUDA Version (\d+\.\d+)", content)
            if match:
                return CUDAInfo(available=True, version=match.group(1), path=cuda_path)

    return CUDAInfo(available=False, version=None, path=None)
def detect_cudnn() -> CuDNNInfo:
    """Probe for cuDNN system-wide and inside Python packages.

    Three independent probes run in order: ``ldconfig -p`` for a system
    ``libcudnn``, ``python3`` importing torch for its cuDNN version, and
    ``python3`` importing onnxruntime for its CUDA execution provider.
    A probe that is missing, fails, or times out leaves its fields at
    their "not found" defaults.
    """

    def capture(argv, **run_options):
        # Stdout of a successful run, or None when the probe could not
        # complete.
        try:
            finished = subprocess.run(
                argv,
                capture_output=True,
                text=True,
                check=True,
                **run_options,
            )
        except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
            return None
        return finished.stdout

    # System-level cuDNN as listed in the dynamic linker cache.
    found_in_system = False
    system_ver = None
    linker_cache = capture(["ldconfig", "-p"])
    if linker_cache is not None and "libcudnn" in linker_cache:
        found_in_system = True
        # The number after "libcudnn.so." in the library name, when present.
        soname = re.search(r"libcudnn\.so\.(\d+)", linker_cache)
        if soname:
            system_ver = soname.group(1)

    # cuDNN bundled with PyTorch.
    torch_ver = None
    torch_output = capture(
        ["python3", "-c", "import torch; print(torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'None')"],
        timeout=30,
    )
    if torch_output is not None:
        reported = torch_output.strip()
        if reported and reported != "None":
            try:
                torch_ver = int(reported)
            except ValueError:
                torch_ver = None

    # onnxruntime CUDA execution provider.
    onnx_output = capture(
        ["python3", "-c", "import onnxruntime as ort; providers = ort.get_available_providers(); print('CUDAExecutionProvider' in providers)"],
        timeout=30,
    )
    onnx_ok = onnx_output is not None and onnx_output.strip() == "True"

    return CuDNNInfo(
        system_available=found_in_system,
        system_version=system_ver,
        pytorch_version=torch_ver,
        onnx_available=onnx_ok,
    )
def find_gpu_services(workspace_root: Path) -> list[tuple[str, Path, list[str]]]:
"""Find services with GPU dependencies.
Returns list of (service_name, service_path, gpu_deps).
"""
services_dir = workspace_root / "services"
if not services_dir.exists():
return []
gpu_services = []
for service_dir in services_dir.iterdir():
if not service_dir.is_dir():
continue
# Check both direct pyproject.toml and service/pyproject.toml
pyproject_paths = [
service_dir / "pyproject.toml",
service_dir / "service" / "pyproject.toml",
]
for pyproject_path in pyproject_paths:
if not pyproject_path.exists():
continue
try:
content = pyproject_path.read_text()
gpu_deps = [ind for ind in GPU_INDICATORS if ind in content]
if gpu_deps:
# Determine actual service path (where venv should be)
if pyproject_path.parent.name == "service":
svc_path = pyproject_path.parent
else:
svc_path = service_dir
gpu_services.append((service_dir.name, svc_path, gpu_deps))
break # Don't check both paths for same service
except IOError:
continue
return gpu_services
def check_command(args, workspace_root: Path) -> int:
    """Print a GPU / CUDA / cuDNN diagnostic report.

    Args:
        args: Remaining command-line arguments; not used by this command.
        workspace_root: Workspace directory scanned for GPU services.

    Returns:
        0 when a GPU, the CUDA toolkit, and cuDNN (system or PyTorch) were
        all detected; 1 otherwise.
    """
    banner = "=" * 60

    def heading(title):
        # Section title followed by a thin rule.
        print(title)
        print("-" * 40)

    print("GPU/CUDA/cuDNN Status Check")
    print(banner)
    print()

    # --- GPU ---
    heading("NVIDIA GPU")
    gpu = detect_gpu()
    if not gpu.available:
        print(" ✗ No NVIDIA GPU detected")
        print(" Run 'nvidia-smi' to diagnose")
    else:
        print(f" ✓ GPU: {gpu.gpu_name}")
        print(f" ✓ Driver: {gpu.driver_version}")
        print(f" ✓ Memory: {gpu.memory_total} MB")
    print()

    # --- CUDA toolkit ---
    heading("CUDA Toolkit")
    cuda = detect_cuda()
    if not cuda.available:
        print(" ✗ CUDA toolkit not found")
        print(" Expected at /usr/local/cuda/")
    else:
        print(f" ✓ Version: {cuda.version}")
        print(f" ✓ Path: {cuda.path}")
    print()

    # --- cuDNN ---
    heading("cuDNN")
    cudnn = detect_cudnn()
    if not cudnn.system_available:
        print(" ✗ System cuDNN: not installed")
    else:
        print(f" ✓ System cuDNN: version {cudnn.system_version or 'unknown'}")

    if not cudnn.pytorch_version:
        print(" ○ PyTorch cuDNN: not detected (torch not installed or no CUDA)")
    else:
        print(f" ✓ PyTorch cuDNN: {cudnn.pytorch_version}")

    if not cudnn.onnx_available:
        print(" ○ ONNX Runtime: CUDA provider not available")
    else:
        print(" ✓ ONNX Runtime: CUDA provider available")
    print()

    # --- Services ---
    heading("GPU Services Detected")
    detected = find_gpu_services(workspace_root)
    if not detected:
        print(" No GPU services found in services/")
    for svc_name, svc_dir, svc_deps in detected:
        venv_state = "✓ exists" if (svc_dir / ".venv").exists() else "✗ missing"
        print(svc_name)
        print(f" Path: {svc_dir.relative_to(workspace_root)}")
        print(f" GPU deps: {', '.join(svc_deps)}")
        print(f" Venv: {venv_state}")
    print()

    # --- Summary ---
    print(banner)
    if not gpu.available:
        print("✗ No GPU available")
        return 1
    if not cuda.available:
        print("⚠ GPU available but CUDA toolkit missing")
        return 1
    if cudnn.system_available or cudnn.pytorch_version:
        print("✓ GPU stack ready - cuDNN available")
        return 0
    print("⚠ GPU/CUDA ready, but cuDNN not detected")
    print(" Run: ./run setup-gpu install")
    return 1
def _pytorch_wheel_tag(cuda_version: str | None) -> str:
    """Return the PyTorch wheel tag (``cu124`` or ``cu118``) for a CUDA version string."""
    try:
        cuda_major = int(cuda_version.split(".")[0]) if cuda_version else 12
    except ValueError:
        # Unparseable version: use the same default as a missing version.
        cuda_major = 12
    # CUDA 13 is very new; it uses the cu124 wheels (compatible), same as 12.
    return "cu124" if cuda_major >= 12 else "cu118"


def install_command(args, workspace_root: Path) -> int:
    """Install PyTorch+CUDA and onnxruntime-gpu in service venvs.

    Args:
        args: Arguments after ``install``: ``--service NAME``, ``--dry-run``,
            ``-v/--verbose``.
        workspace_root: Workspace directory scanned for GPU services.

    Returns:
        0 when every targeted service succeeded (or there was nothing to
        install); 1 when CUDA is not detected, the requested service is not
        found, or any installation failed.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu install",
        description="Install GPU dependencies in service virtualenvs",
    )
    parser.add_argument(
        "--service",
        help="Target specific service (default: all GPU services)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be installed without installing",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output",
    )
    parsed = parser.parse_args(args)

    # Detect CUDA version for wheel selection
    cuda = detect_cuda()
    if not cuda.available:
        print("✗ CUDA toolkit not detected. Install CUDA first.")
        return 1

    pytorch_cuda = _pytorch_wheel_tag(cuda.version)
    pytorch_index = f"https://download.pytorch.org/whl/{pytorch_cuda}"
    print(f"PyTorch CUDA wheel: {pytorch_cuda} (system CUDA: {cuda.version})")
    print(f"PyTorch index: {pytorch_index}")
    print()

    # Find target services
    gpu_services = find_gpu_services(workspace_root)
    if parsed.service:
        gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service]
        if not gpu_services:
            print(f"✗ Service '{parsed.service}' not found or has no GPU deps")
            return 1
    if not gpu_services:
        print("No GPU services found to install")
        return 0

    print(f"Installing GPU dependencies in {len(gpu_services)} service(s)")
    print("=" * 60)

    failed = []
    succeeded = []
    for name, svc_path, deps in gpu_services:
        print(f"\n{name}")
        print(f" Path: {svc_path}")
        print(f" GPU deps: {', '.join(deps)}")

        venv_path = svc_path / ".venv"
        if not venv_path.exists():
            print(f" ✗ No virtualenv at {venv_path}")
            print(" Run './run install' first to create venvs")
            failed.append(name)
            continue

        # Absolute, because the command runs with cwd=svc_path and a relative
        # path would otherwise be resolved against that directory.
        pip_path = str((venv_path / "bin" / "pip").absolute())

        # Determine what to install
        install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"])
        install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps

        # Commands are argument lists run without a shell, so paths that
        # contain spaces or shell metacharacters are passed through intact.
        commands = []
        if install_pytorch:
            commands.append((
                [pip_path, "install", "torch", "torchvision", "--index-url", pytorch_index],
                "PyTorch+CUDA",
            ))
        if install_onnx:
            commands.append((
                [pip_path, "install", "onnxruntime-gpu"],
                "onnxruntime-gpu",
            ))

        if parsed.dry_run:
            print(" [DRY RUN] Would install:")
            for _cmd, desc in commands:
                print(f"{desc}")
            succeeded.append(name)
            continue

        success = True
        for cmd, desc in commands:
            print(f" Installing {desc}...")
            try:
                result = subprocess.run(
                    cmd,
                    cwd=svc_path,
                    capture_output=not parsed.verbose,
                )
            except OSError as exc:
                # e.g. the venv has no pip executable.
                print(f" ✗ Failed to install {desc}")
                print(f" {exc}")
                success = False
                break
            if result.returncode != 0:
                print(f" ✗ Failed to install {desc}")
                if not parsed.verbose and result.stderr:
                    print(f" {result.stderr.decode(errors='replace')[:200]}")
                success = False
                break

        if success:
            print(f"{name} GPU dependencies installed")
            succeeded.append(name)
        else:
            failed.append(name)

    # Summary
    print()
    print("=" * 60)
    print(f"Installed: {len(succeeded)}/{len(gpu_services)}")
    if failed:
        print(f"\nFailed: {', '.join(failed)}")
        return 1
    print("\n✓ All GPU dependencies installed")
    print("Run './run setup-gpu verify' to test GPU acceleration")
    return 0
def _run_gpu_probe(python_path: Path, code: str) -> tuple[bool, str]:
    """Run ``code`` with ``python_path``; return ``(ok, stripped stdout or error text)``."""
    try:
        result = subprocess.run(
            [str(python_path), "-c", code],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        return False, "timed out after 60s"
    except OSError as exc:
        # e.g. the venv has no interpreter at the expected path.
        return False, str(exc)
    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout.strip()


def verify_command(args, workspace_root: Path) -> int:
    """Run GPU verification tests inside each GPU service's virtualenv.

    For every targeted service, checks that PyTorch sees CUDA and/or that
    onnxruntime lists its CUDA execution provider, depending on the
    service's GPU dependencies. A probe that fails, times out, or cannot be
    started is recorded as a failure for that service.

    Args:
        args: Arguments after ``verify``: ``--service NAME``.
        workspace_root: Workspace directory scanned for GPU services.

    Returns:
        0 when all targeted services pass (or no GPU services exist); 1 when
        any service fails or the service named by ``--service`` is not found.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu verify",
        description="Verify GPU acceleration works in service venvs",
    )
    parser.add_argument(
        "--service",
        help="Target specific service (default: all GPU services)",
    )
    parsed = parser.parse_args(args)

    gpu_services = find_gpu_services(workspace_root)
    if parsed.service:
        gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service]
        if not gpu_services:
            # Same outcome as the install subcommand: an explicitly requested
            # service that cannot be verified is an error, not a success.
            print(f"✗ Service '{parsed.service}' not found or has no GPU deps")
            return 1
    if not gpu_services:
        print("No GPU services found to verify")
        return 0

    torch_probe = (
        "import torch; "
        "cuda = torch.cuda.is_available(); "
        "cudnn = torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None; "
        "print(f'CUDA:{cuda},cuDNN:{cudnn}')"
    )
    onnx_probe = (
        "import onnxruntime as ort; "
        "providers = ort.get_available_providers(); "
        "cuda = 'CUDAExecutionProvider' in providers; "
        "print(f'CUDA:{cuda},Providers:{providers}')"
    )

    print("GPU Verification Tests")
    print("=" * 60)

    results = []
    for name, svc_path, deps in gpu_services:
        print(f"\n{name}")

        venv_path = svc_path / ".venv"
        if not venv_path.exists():
            print(" ✗ No virtualenv")
            results.append((name, False, "no venv"))
            continue
        python_path = venv_path / "bin" / "python"

        # Test PyTorch CUDA
        needs_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"])
        if needs_pytorch:
            ok, output = _run_gpu_probe(python_path, torch_probe)
            if not ok:
                print(f" ✗ PyTorch test failed: {output[:100]}")
                results.append((name, False, "PyTorch test failed"))
                continue
            if "CUDA:True" not in output:
                print(f" ✗ PyTorch CUDA not available: {output}")
                results.append((name, False, "PyTorch CUDA unavailable"))
                continue
            print(f" ✓ PyTorch: {output}")

        # Test ONNX Runtime
        needs_onnx = "onnxruntime-gpu" in deps or "insightface" in deps
        if needs_onnx:
            ok, output = _run_gpu_probe(python_path, onnx_probe)
            if not ok:
                print(f" ✗ ONNX test failed: {output[:100]}")
                results.append((name, False, "ONNX test failed"))
                continue
            if "CUDA:True" not in output:
                print(" ✗ ONNX Runtime CUDA not available")
                results.append((name, False, "ONNX CUDA unavailable"))
                continue
            print(" ✓ ONNX Runtime: CUDA provider available")

        results.append((name, True, "OK"))

    # Summary
    print()
    print("=" * 60)
    passed = sum(1 for _, ok, _ in results if ok)
    print(f"Verified: {passed}/{len(results)}")

    failed = [(n, msg) for n, ok, msg in results if not ok]
    if failed:
        print("\nFailed:")
        for name, msg in failed:
            print(f"{name}: {msg}")
        return 1

    print("\n✓ All GPU services verified")
    return 0
def system_command(args, workspace_root: Path) -> int:
    """Show instructions for, or perform, a system-level cuDNN install.

    Without ``--install`` the recommended packages and the ``rpm-ostree``
    command are printed. With ``--install`` the command
    ``sudo rpm-ostree install <packages>`` is run.

    Args:
        args: Arguments after ``system``: ``--install``.
        workspace_root: Not used by this command.

    Returns:
        1 when no CUDA toolkit is detected; 0 after printing instructions or
        after a successful install; otherwise the exit code of the install
        command.
    """
    cli = argparse.ArgumentParser(
        prog="./run setup-gpu system",
        description="Install system-level cuDNN via rpm-ostree",
    )
    cli.add_argument(
        "--install",
        action="store_true",
        help="Actually install (default: show instructions)",
    )
    options = cli.parse_args(args)

    toolkit = detect_cuda()
    if not toolkit.available:
        print("✗ CUDA toolkit not detected")
        return 1

    # Package names depend on the CUDA major version.
    if toolkit.version:
        major = int(toolkit.version.split(".")[0])
    else:
        major = 13
    if major >= 13:
        packages = ["cudnn9-cuda-13-0", "libcudnn9-cuda-13", "libcudnn9-devel-cuda-13"]
    else:
        packages = ["cudnn9-cuda-12", "libcudnn9-cuda-12", "libcudnn9-devel-cuda-12"]
    package_line = " ".join(packages)

    if options.install:
        # Install via rpm-ostree
        print(f"Installing system cuDNN for CUDA {toolkit.version}...")
        print(f"Packages: {package_line}")
        print()
        exit_code = subprocess.run(["sudo", "rpm-ostree", "install", *packages]).returncode
        print()
        if exit_code != 0:
            print("✗ rpm-ostree install failed")
            return exit_code
        print("✓ cuDNN packages staged for installation")
        print(" Run 'systemctl reboot' to apply changes")
        return 0

    instructions = (
        "System-level cuDNN Installation",
        "=" * 60,
        "",
        "NOTE: Modern PyTorch and onnxruntime-gpu wheels bundle cuDNN.",
        "System-level installation is optional but can help with compatibility.",
        "",
        f"Detected CUDA: {toolkit.version}",
        f"Recommended packages: {package_line}",
        "",
        "For Bluefin LTS / rpm-ostree systems:",
        "",
        f" sudo rpm-ostree install {package_line}",
        " systemctl reboot # Required for rpm-ostree changes",
        "",
        "Or run with --install flag:",
        " ./run setup-gpu system --install",
        "",
    )
    for line in instructions:
        print(line)
    return 0
def setup_gpu_command(args, workspace_root: Path) -> int:
    """Main entry point for the ``setup-gpu`` command.

    Dispatches to the ``check``, ``install``, ``verify`` or ``system``
    subcommand. With no arguments, or when the first argument is an option
    other than ``-h``/``--help``, ``check`` is run.

    Args:
        args: Command-line arguments after ``setup-gpu``.
        workspace_root: Workspace directory passed on to the subcommand.

    Returns:
        The subcommand's exit code; 0 after printing help for
        ``-h``/``--help``; 1 after printing help for an unknown subcommand.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu",
        description="GPU/CUDA/cuDNN setup and diagnostics",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Subcommands:
check Diagnose GPU/CUDA/cuDNN status (default)
install Install PyTorch+CUDA and onnxruntime-gpu in service venvs
verify Run GPU verification tests
system Show/install system-level cuDNN via rpm-ostree
Examples:
./run setup-gpu # Check GPU status
./run setup-gpu check # Same as above
./run setup-gpu install # Install GPU deps in all services
./run setup-gpu install --service imajin-diffusion # Single service
./run setup-gpu verify # Test GPU acceleration
./run setup-gpu system # Show system cuDNN instructions
./run setup-gpu system --install # Install system cuDNN
""",
    )

    # An explicit help request must show the help text rather than fall
    # through to the default "check" action below.
    if args and args[0] in ("-h", "--help"):
        parser.print_help()
        return 0

    subcommands = {
        "check": check_command,
        "install": install_command,
        "verify": verify_command,
        "system": system_command,
    }

    # Default to check if no subcommand
    if not args or args[0].startswith("-"):
        return check_command(args, workspace_root)

    subcommand = args[0]
    handler = subcommands.get(subcommand)
    if handler is None:
        print(f"Unknown subcommand: {subcommand}", file=sys.stderr)
        parser.print_help()
        return 1
    return handler(args[1:], workspace_root)
def register_setup_gpu_command(runner):
    """Register ``setup_gpu_command`` on ``runner`` under the name ``setup-gpu``.

    Args:
        runner: Object exposing ``register_command(name, handler, description)``.
    """
    command_name = "setup-gpu"
    summary = "GPU/CUDA/cuDNN setup and diagnostics"
    runner.register_command(command_name, setup_gpu_command, summary)