"""GPU/CUDA/cuDNN setup command handler for script runner."""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import NamedTuple
|
|
|
|
|
|
class GPUInfo(NamedTuple):
|
|
"""GPU detection result."""
|
|
available: bool
|
|
driver_version: str | None
|
|
gpu_name: str | None
|
|
memory_total: int | None # MB
|
|
|
|
|
|
class CUDAInfo(NamedTuple):
|
|
"""CUDA toolkit detection result."""
|
|
available: bool
|
|
version: str | None
|
|
path: Path | None
|
|
|
|
|
|
class CuDNNInfo(NamedTuple):
|
|
"""cuDNN detection result."""
|
|
system_available: bool
|
|
system_version: str | None
|
|
pytorch_version: int | None
|
|
onnx_available: bool
|
|
|
|
|
|
# Package names whose presence in a service's pyproject.toml marks that
# service as needing GPU support. The order is preserved in reported results.
GPU_INDICATORS = [
    "torch", "onnxruntime-gpu", "insightface",
    "diffusers", "transformers", "image-reward",
]
|
|
|
|
|
|
def detect_gpu() -> GPUInfo:
    """Detect an NVIDIA GPU by querying ``nvidia-smi``.

    Only the first row of the CSV output is inspected, so on multi-GPU
    machines the result describes the first GPU listed.

    Returns:
        A ``GPUInfo`` with ``available=True`` and whatever fields could be
        parsed, or an all-empty ``GPUInfo`` with ``available=False`` when
        ``nvidia-smi`` is missing, fails, times out, or prints nothing.
    """
    not_found = GPUInfo(available=False, driver_version=None, gpu_name=None, memory_total=None)

    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader,nounits"],
            capture_output=True,
            text=True,
            check=True,
            # Bound the call so an unresponsive nvidia-smi cannot block the
            # whole command; matches the 30s limit used by the other probes.
            timeout=30,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        # OSError covers a missing binary as well as e.g. permission errors.
        return not_found

    first_row = result.stdout.strip().split("\n")[0]
    if not first_row:
        return not_found

    parts = [p.strip() for p in first_row.split(",")]

    memory_total = None
    if len(parts) > 2:
        try:
            memory_total = int(parts[2])
        except ValueError:
            # The memory column is not always numeric (e.g. a placeholder
            # such as "[N/A]"); report the GPU without a memory figure
            # rather than crashing.
            memory_total = None

    return GPUInfo(
        available=True,
        gpu_name=parts[0],
        driver_version=parts[1] if len(parts) > 1 else None,
        memory_total=memory_total,
    )
|
|
|
|
|
|
def _cuda_version_from_json(version_json: Path) -> str | None:
    """Return the toolkit version stored in ``version.json``, or None if unusable."""
    try:
        data = json.loads(version_json.read_text(encoding="utf-8"))
        version = data.get("cuda", {}).get("version")
    except (OSError, ValueError, AttributeError):
        # OSError: file missing or unreadable. ValueError: invalid JSON or
        # undecodable bytes. AttributeError: the top level or the "cuda"
        # entry is not a JSON object.
        return None
    # Callers split the version on ".", so only accept a non-empty string.
    return version if isinstance(version, str) and version else None


def _cuda_version_from_txt(version_txt: Path) -> str | None:
    """Return the ``major.minor`` version found in ``version.txt``, or None."""
    try:
        content = version_txt.read_text()
    except (OSError, UnicodeDecodeError):
        return None
    match = re.search(r"CUDA Version (\d+\.\d+)", content)
    return match.group(1) if match else None


def detect_cuda() -> CUDAInfo:
    """Detect a CUDA toolkit installation.

    Looks in ``/usr/local/cuda`` and then ``/opt/cuda``. For each directory
    ``version.json`` is tried first and ``version.txt`` second; the first
    version found wins.

    Returns:
        ``CUDAInfo(available=True, version=..., path=...)`` for the first
        directory that yields a version, otherwise
        ``CUDAInfo(available=False, version=None, path=None)``.
    """
    for cuda_path in (Path("/usr/local/cuda"), Path("/opt/cuda")):
        version = (
            _cuda_version_from_json(cuda_path / "version.json")
            or _cuda_version_from_txt(cuda_path / "version.txt")
        )
        if version:
            return CUDAInfo(available=True, version=version, path=cuda_path)

    return CUDAInfo(available=False, version=None, path=None)
|
|
|
|
|
|
def detect_cudnn() -> CuDNNInfo:
    """Look for cuDNN in the system library cache and in Python packages.

    Three independent probes are run in order: ``ldconfig -p`` for a system
    ``libcudnn``, a ``python3`` one-liner asking PyTorch for its cuDNN
    version, and a ``python3`` one-liner asking ONNX Runtime whether the
    CUDA execution provider is available. A probe that fails simply leaves
    its field at the "not found" value.
    """

    def _python_probe(snippet: str) -> str | None:
        """Run *snippet* via ``python3 -c``; return stripped stdout, or None on failure."""
        try:
            completed = subprocess.run(
                ["python3", "-c", snippet],
                capture_output=True,
                text=True,
                check=True,
                timeout=30,
            )
        except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
            return None
        return completed.stdout.strip()

    # --- system-level cuDNN, via the dynamic linker cache -----------------
    try:
        listing = subprocess.run(
            ["ldconfig", "-p"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        listing = ""

    found_in_system = "libcudnn" in listing
    soname_version = None
    if found_in_system:
        # The number after "libcudnn.so." in the library name is the version.
        hit = re.search(r"libcudnn\.so\.(\d+)", listing)
        if hit:
            soname_version = hit.group(1)

    # --- cuDNN bundled with PyTorch ---------------------------------------
    torch_cudnn = None
    torch_out = _python_probe(
        "import torch; print(torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'None')"
    )
    if torch_out and torch_out != "None":
        try:
            torch_cudnn = int(torch_out)
        except ValueError:
            torch_cudnn = None

    # --- ONNX Runtime CUDA execution provider -----------------------------
    onnx_out = _python_probe(
        "import onnxruntime as ort; providers = ort.get_available_providers(); print('CUDAExecutionProvider' in providers)"
    )
    onnx_cuda = onnx_out == "True"

    return CuDNNInfo(
        system_available=found_in_system,
        system_version=soname_version,
        pytorch_version=torch_cudnn,
        onnx_available=onnx_cuda,
    )
|
|
|
|
|
|
def find_gpu_services(workspace_root: Path) -> list[tuple[str, Path, list[str]]]:
    """Find services with GPU dependencies.

    Each sub-directory of ``<workspace_root>/services`` is checked for a
    ``pyproject.toml``, first directly and then under ``service/``. A service
    is reported once, for the first of those files that mentions any entry
    of ``GPU_INDICATORS``.

    NOTE: matching is a plain substring search over the file text, so an
    indicator such as "torch" also matches "torchvision" or a comment.

    Args:
        workspace_root: Root directory of the workspace.

    Returns:
        List of ``(service_name, service_path, gpu_deps)`` sorted by service
        directory, where ``service_path`` is the directory containing the
        matching ``pyproject.toml`` (where the venv is expected) and
        ``gpu_deps`` lists the indicators found, in ``GPU_INDICATORS`` order.
        Empty when there is no ``services`` directory.
    """
    services_dir = workspace_root / "services"
    # is_dir() rather than exists(): a stray file named "services" would
    # otherwise make iterdir() raise NotADirectoryError.
    if not services_dir.is_dir():
        return []

    gpu_services: list[tuple[str, Path, list[str]]] = []

    # Sorted so the reported order does not depend on filesystem ordering.
    for service_dir in sorted(services_dir.iterdir()):
        if not service_dir.is_dir():
            continue

        candidates = (
            service_dir / "pyproject.toml",
            service_dir / "service" / "pyproject.toml",
        )
        for pyproject_path in candidates:
            try:
                content = pyproject_path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # Missing, unreadable or non-UTF-8 file: try the next candidate.
                continue

            gpu_deps = [ind for ind in GPU_INDICATORS if ind in content]
            if gpu_deps:
                # The venv lives next to the pyproject.toml that matched.
                gpu_services.append((service_dir.name, pyproject_path.parent, gpu_deps))
                break  # Don't report the same service twice

    return gpu_services
|
|
|
|
|
|
def check_command(args, workspace_root: Path) -> int:
    """Print a GPU/CUDA/cuDNN status report and summarise readiness.

    Args:
        args: Remaining command-line arguments (not used by this command).
        workspace_root: Workspace root used to locate GPU services.

    Returns:
        0 when a GPU, the CUDA toolkit and some cuDNN are all detected,
        otherwise 1.
    """

    def _heading(title: str) -> None:
        """Print a section title followed by its underline."""
        print(title)
        print("-" * 40)

    banner = "=" * 60

    print("GPU/CUDA/cuDNN Status Check")
    print(banner)
    print()

    # --- GPU ---------------------------------------------------------------
    _heading("NVIDIA GPU")
    gpu = detect_gpu()
    if not gpu.available:
        print(" ✗ No NVIDIA GPU detected")
        print(" Run 'nvidia-smi' to diagnose")
    else:
        print(f" ✓ GPU: {gpu.gpu_name}")
        print(f" ✓ Driver: {gpu.driver_version}")
        print(f" ✓ Memory: {gpu.memory_total} MB")
    print()

    # --- CUDA toolkit ------------------------------------------------------
    _heading("CUDA Toolkit")
    cuda = detect_cuda()
    if not cuda.available:
        print(" ✗ CUDA toolkit not found")
        print(" Expected at /usr/local/cuda/")
    else:
        print(f" ✓ Version: {cuda.version}")
        print(f" ✓ Path: {cuda.path}")
    print()

    # --- cuDNN -------------------------------------------------------------
    _heading("cuDNN")
    cudnn = detect_cudnn()
    if cudnn.system_available:
        shown_version = cudnn.system_version or "unknown"
        print(f" ✓ System cuDNN: version {shown_version}")
    else:
        print(" ✗ System cuDNN: not installed")

    if cudnn.pytorch_version:
        print(f" ✓ PyTorch cuDNN: {cudnn.pytorch_version}")
    else:
        print(" ○ PyTorch cuDNN: not detected (torch not installed or no CUDA)")

    print(
        " ✓ ONNX Runtime: CUDA provider available"
        if cudnn.onnx_available
        else " ○ ONNX Runtime: CUDA provider not available"
    )
    print()

    # --- services ----------------------------------------------------------
    _heading("GPU Services Detected")
    gpu_services = find_gpu_services(workspace_root)
    if not gpu_services:
        print(" No GPU services found in services/")
    for name, path, deps in gpu_services:
        venv_state = "✓ exists" if (path / ".venv").exists() else "✗ missing"
        print(f" • {name}")
        print(f" Path: {path.relative_to(workspace_root)}")
        print(f" GPU deps: {', '.join(deps)}")
        print(f" Venv: {venv_state}")
    print()

    # --- summary -----------------------------------------------------------
    print(banner)
    if not gpu.available:
        print("✗ No GPU available")
        return 1
    if not cuda.available:
        print("⚠ GPU available but CUDA toolkit missing")
        return 1
    if cudnn.system_available or cudnn.pytorch_version:
        print("✓ GPU stack ready - cuDNN available")
        return 0
    print("⚠ GPU/CUDA ready, but cuDNN not detected")
    print(" Run: ./run setup-gpu install")
    return 1
|
|
|
|
|
|
def _pytorch_wheel_tag(cuda_version: str | None) -> str:
    """Map a CUDA toolkit version string to the PyTorch wheel tag to install.

    A missing or unparsable version is treated as CUDA 12.
    """
    try:
        cuda_major = int(cuda_version.split(".")[0]) if cuda_version else 12
    except ValueError:
        cuda_major = 12
    # CUDA 13 is very new; it is served with the cu124 wheels as well.
    return "cu124" if cuda_major >= 12 else "cu118"


def install_command(args, workspace_root: Path) -> int:
    """Install PyTorch+CUDA and onnxruntime-gpu in service venvs.

    Args:
        args: Command-line arguments for this subcommand
            (``--service NAME``, ``--dry-run``, ``-v/--verbose``).
        workspace_root: Workspace root used to locate GPU services.

    Returns:
        0 when every targeted service succeeded (or none needed work),
        1 when CUDA is missing, the requested service is unknown, or any
        service failed.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu install",
        description="Install GPU dependencies in service virtualenvs",
    )
    parser.add_argument(
        "--service",
        help="Target specific service (default: all GPU services)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be installed without installing",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output",
    )
    parsed = parser.parse_args(args)

    # Detect CUDA version for wheel selection
    cuda = detect_cuda()
    if not cuda.available:
        print("✗ CUDA toolkit not detected. Install CUDA first.")
        return 1

    pytorch_cuda = _pytorch_wheel_tag(cuda.version)
    pytorch_index = f"https://download.pytorch.org/whl/{pytorch_cuda}"

    print(f"PyTorch CUDA wheel: {pytorch_cuda} (system CUDA: {cuda.version})")
    print(f"PyTorch index: {pytorch_index}")
    print()

    # Find target services
    gpu_services = find_gpu_services(workspace_root)
    if parsed.service:
        gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service]
        if not gpu_services:
            print(f"✗ Service '{parsed.service}' not found or has no GPU deps")
            return 1

    if not gpu_services:
        print("No GPU services found to install")
        return 0

    print(f"Installing GPU dependencies in {len(gpu_services)} service(s)")
    print("=" * 60)

    failed = []
    succeeded = []

    for name, svc_path, deps in gpu_services:
        print(f"\n▶ {name}")
        print(f" Path: {svc_path}")
        print(f" GPU deps: {', '.join(deps)}")

        venv_path = svc_path / ".venv"
        if not venv_path.exists():
            print(f" ✗ No virtualenv at {venv_path}")
            print(" Run './run install' first to create venvs")
            failed.append(name)
            continue

        pip_path = venv_path / "bin" / "pip"

        # Determine what to install
        install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"])
        install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps

        # Commands are argument lists, not shell strings, so paths that
        # contain spaces or shell metacharacters are passed through intact.
        commands: list[tuple[list[str], str]] = []
        if install_pytorch:
            commands.append((
                [str(pip_path), "install", "torch", "torchvision", "--index-url", pytorch_index],
                "PyTorch+CUDA",
            ))
        if install_onnx:
            commands.append((
                [str(pip_path), "install", "onnxruntime-gpu"],
                "onnxruntime-gpu",
            ))

        if parsed.dry_run:
            print(" [DRY RUN] Would install:")
            for _, desc in commands:
                print(f" • {desc}")
            succeeded.append(name)
            continue

        success = True
        for cmd, desc in commands:
            print(f" Installing {desc}...")
            try:
                result = subprocess.run(
                    cmd,
                    cwd=svc_path,
                    capture_output=not parsed.verbose,
                )
            except OSError as exc:
                # Without a shell, a missing or non-executable pip raises
                # instead of yielding a non-zero exit status.
                print(f" ✗ Failed to install {desc}")
                print(f" {exc}")
                success = False
                break

            if result.returncode != 0:
                print(f" ✗ Failed to install {desc}")
                if not parsed.verbose and result.stderr:
                    # errors="replace": pip output is not guaranteed to be UTF-8.
                    error_text = result.stderr.decode(errors="replace")
                    print(f" {error_text[:200]}")
                success = False
                break

        if success:
            print(f" ✓ {name} GPU dependencies installed")
            succeeded.append(name)
        else:
            failed.append(name)

    # Summary
    print()
    print("=" * 60)
    if parsed.dry_run:
        print(f"[DRY RUN] Would install in: {len(succeeded)}/{len(gpu_services)}")
    else:
        print(f"Installed: {len(succeeded)}/{len(gpu_services)}")

    if failed:
        print(f"\nFailed: {', '.join(failed)}")
        return 1

    if parsed.dry_run:
        # Nothing was changed, so do not claim a successful installation.
        print("\n[DRY RUN] Nothing was installed")
        return 0

    print("\n✓ All GPU dependencies installed")
    print("Run './run setup-gpu verify' to test GPU acceleration")
    return 0
|
|
|
|
|
|
def _run_gpu_probe(python_path: Path, code: str) -> tuple[bool, str]:
    """Run *code* with the interpreter at *python_path*.

    Returns ``(True, stripped_stdout)`` on exit status 0, otherwise
    ``(False, error_text)`` where the text is stderr, a timeout note, or the
    OS error raised when the interpreter could not be started.
    """
    try:
        result = subprocess.run(
            [str(python_path), "-c", code],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        return False, "timed out after 60s"
    except OSError as exc:
        return False, str(exc)
    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout.strip()


def verify_command(args, workspace_root: Path) -> int:
    """Run GPU verification tests.

    For every targeted service the venv interpreter is asked whether PyTorch
    sees CUDA and/or whether ONNX Runtime offers the CUDA execution provider,
    depending on which GPU dependencies the service declares.

    Args:
        args: Command-line arguments for this subcommand (``--service NAME``).
        workspace_root: Workspace root used to locate GPU services.

    Returns:
        0 when all targeted services verify (or there are none), 1 when any
        service fails or the requested service is unknown.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu verify",
        description="Verify GPU acceleration works in service venvs",
    )
    parser.add_argument(
        "--service",
        help="Target specific service (default: all GPU services)",
    )
    parsed = parser.parse_args(args)

    gpu_services = find_gpu_services(workspace_root)
    if parsed.service:
        gpu_services = [(n, p, d) for n, p, d in gpu_services if n == parsed.service]
        if not gpu_services:
            # Same handling as the install subcommand: a mistyped service
            # name must not be reported as success.
            print(f"✗ Service '{parsed.service}' not found or has no GPU deps")
            return 1

    if not gpu_services:
        print("No GPU services found to verify")
        return 0

    print("GPU Verification Tests")
    print("=" * 60)

    results = []

    for name, svc_path, deps in gpu_services:
        print(f"\n▶ {name}")

        venv_path = svc_path / ".venv"
        if not venv_path.exists():
            print(" ✗ No virtualenv")
            results.append((name, False, "no venv"))
            continue

        python_path = venv_path / "bin" / "python"

        # Test PyTorch CUDA
        install_pytorch = any(d in deps for d in ["torch", "diffusers", "transformers", "image-reward"])
        if install_pytorch:
            ok, output = _run_gpu_probe(
                python_path,
                "import torch; "
                "cuda = torch.cuda.is_available(); "
                "cudnn = torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None; "
                "print(f'CUDA:{cuda},cuDNN:{cudnn}')",
            )
            if not ok:
                print(f" ✗ PyTorch test failed: {output[:100]}")
                results.append((name, False, "PyTorch test failed"))
                continue
            if "CUDA:True" not in output:
                print(f" ✗ PyTorch CUDA not available: {output}")
                results.append((name, False, "PyTorch CUDA unavailable"))
                continue
            print(f" ✓ PyTorch: {output}")

        # Test ONNX Runtime
        install_onnx = "onnxruntime-gpu" in deps or "insightface" in deps
        if install_onnx:
            ok, output = _run_gpu_probe(
                python_path,
                "import onnxruntime as ort; "
                "providers = ort.get_available_providers(); "
                "cuda = 'CUDAExecutionProvider' in providers; "
                "print(f'CUDA:{cuda},Providers:{providers}')",
            )
            if not ok:
                print(f" ✗ ONNX test failed: {output[:100]}")
                results.append((name, False, "ONNX test failed"))
                continue
            if "CUDA:True" not in output:
                print(" ✗ ONNX Runtime CUDA not available")
                results.append((name, False, "ONNX CUDA unavailable"))
                continue
            print(" ✓ ONNX Runtime: CUDA provider available")

        results.append((name, True, "OK"))

    # Summary
    print()
    print("=" * 60)
    passed = sum(1 for _, ok, _ in results if ok)
    print(f"Verified: {passed}/{len(results)}")

    failed = [(n, msg) for n, ok, msg in results if not ok]
    if failed:
        print("\nFailed:")
        for name, msg in failed:
            print(f" ✗ {name}: {msg}")
        return 1

    print("\n✓ All GPU services verified")
    return 0
|
|
|
|
|
|
def system_command(args, workspace_root: Path) -> int:
    """Show/install system-level cuDNN.

    Without ``--install`` this only prints the recommended packages and the
    ``rpm-ostree`` command. With ``--install`` it runs
    ``sudo rpm-ostree install`` for those packages.

    Args:
        args: Command-line arguments for this subcommand (``--install``).
        workspace_root: Workspace root (not used by this command).

    Returns:
        0 on success or after printing instructions; 1 when CUDA is missing,
        its version cannot be parsed, or ``rpm-ostree`` cannot be started;
        otherwise the exit status of ``rpm-ostree``.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu system",
        description="Install system-level cuDNN via rpm-ostree",
    )
    parser.add_argument(
        "--install",
        action="store_true",
        help="Actually install (default: show instructions)",
    )
    parsed = parser.parse_args(args)

    cuda = detect_cuda()
    if not cuda.available:
        print("✗ CUDA toolkit not detected")
        return 1

    try:
        cuda_major = int(cuda.version.split(".")[0]) if cuda.version else 13
    except ValueError:
        # Do not guess which system packages to install for a version that
        # cannot be understood.
        print(f"✗ Could not parse CUDA version: {cuda.version}")
        return 1

    # Determine package names
    # NOTE(review): every CUDA major below 13 gets the CUDA 12 packages;
    # confirm that is intended for CUDA 11 and older.
    if cuda_major >= 13:
        packages = ["cudnn9-cuda-13-0", "libcudnn9-cuda-13", "libcudnn9-devel-cuda-13"]
    else:
        packages = ["cudnn9-cuda-12", "libcudnn9-cuda-12", "libcudnn9-devel-cuda-12"]

    if not parsed.install:
        print("System-level cuDNN Installation")
        print("=" * 60)
        print()
        print("NOTE: Modern PyTorch and onnxruntime-gpu wheels bundle cuDNN.")
        print("System-level installation is optional but can help with compatibility.")
        print()
        print(f"Detected CUDA: {cuda.version}")
        print(f"Recommended packages: {' '.join(packages)}")
        print()
        print("For Bluefin LTS / rpm-ostree systems:")
        print()
        print(f" sudo rpm-ostree install {' '.join(packages)}")
        print(" systemctl reboot # Required for rpm-ostree changes")
        print()
        print("Or run with --install flag:")
        print(" ./run setup-gpu system --install")
        print()
        return 0

    # Install via rpm-ostree
    print(f"Installing system cuDNN for CUDA {cuda.version}...")
    print(f"Packages: {' '.join(packages)}")
    print()

    cmd = ["sudo", "rpm-ostree", "install"] + packages
    try:
        result = subprocess.run(cmd)
    except OSError as exc:
        # Raised when the command cannot be started at all (e.g. sudo is
        # not installed); report it instead of showing a traceback.
        print()
        print(f"✗ Could not run rpm-ostree: {exc}")
        return 1

    if result.returncode == 0:
        print()
        print("✓ cuDNN packages staged for installation")
        print(" Run 'systemctl reboot' to apply changes")
        return 0

    print()
    print("✗ rpm-ostree install failed")
    return result.returncode
|
|
|
|
|
|
def setup_gpu_command(args, workspace_root: Path) -> int:
    """Main entry point for setup-gpu command.

    Dispatches to the ``check``, ``install``, ``verify`` or ``system``
    subcommand named by the first argument, passing it the remaining
    arguments.

    Args:
        args: Arguments following ``setup-gpu`` on the command line.
        workspace_root: Workspace root handed to the subcommand.

    Returns:
        The subcommand's exit status; 0 after printing help for
        ``-h``/``--help``; 1 after printing help for an unknown subcommand.
    """
    parser = argparse.ArgumentParser(
        prog="./run setup-gpu",
        description="GPU/CUDA/cuDNN setup and diagnostics",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Subcommands:
check Diagnose GPU/CUDA/cuDNN status (default)
install Install PyTorch+CUDA and onnxruntime-gpu in service venvs
verify Run GPU verification tests
system Show/install system-level cuDNN via rpm-ostree

Examples:
./run setup-gpu # Check GPU status
./run setup-gpu check # Same as above
./run setup-gpu install # Install GPU deps in all services
./run setup-gpu install --service imajin-diffusion # Single service
./run setup-gpu verify # Test GPU acceleration
./run setup-gpu system # Show system cuDNN instructions
./run setup-gpu system --install # Install system cuDNN
""",
    )

    subcommands = {
        "check": check_command,
        "install": install_command,
        "verify": verify_command,
        "system": system_command,
    }

    # Default to check if no subcommand
    if not args:
        return check_command(args, workspace_root)

    # The check subcommand ignores its arguments, so a help request has to be
    # answered here; otherwise "--help" would silently run the status check.
    if args[0] in ("-h", "--help"):
        parser.print_help()
        return 0

    # Any other leading option still falls through to the default subcommand.
    if args[0].startswith("-"):
        return check_command(args, workspace_root)

    subcommand = args[0]
    if subcommand not in subcommands:
        parser.print_help()
        return 1

    return subcommands[subcommand](args[1:], workspace_root)
|
|
|
|
|
|
def register_setup_gpu_command(runner):
    """Attach the ``setup-gpu`` command to *runner*.

    Calls ``runner.register_command`` with the command name, the
    ``setup_gpu_command`` handler and a one-line description string.
    """
    command_name = "setup-gpu"
    summary = "GPU/CUDA/cuDNN setup and diagnostics"
    runner.register_command(command_name, setup_gpu_command, summary)
|