feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode
- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
after GPU activation; exit 2 when CYCLES_DEVICE=gpu but rendering fell back to CPU
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
(device_used, compute_type, gpu_fallback); handle exit code 2 as
explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
saves result to system_settings.gpu_probe_last_result; beat every 30min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
"""Celery task for GPU health probe."""
|
||||
import logging
|
||||
from app.tasks.celery_app import celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _parse_probe_devices(payload: str) -> list:
    """Best-effort extraction of the ``devices=[...]`` list from a probe payload.

    The payload is the text following ``GPU_PROBE_OK:``. The exact format that
    gpu_probe.py prints for the list is not visible from this module, so the
    bracketed text is parsed as a Python literal (NOTE(review): assumes a
    list repr such as ``['name', ...]`` -- confirm against gpu_probe.py).

    Returns:
        A list of strings; empty when no list is present or it cannot be
        parsed. Non-string items are stringified so the result stays
        JSON-serializable for persistence.
    """
    import ast
    import re

    # Device names may contain spaces, so the list cannot be recovered from a
    # whitespace split; capture everything between the outermost brackets.
    match = re.search(r"devices=(\[.*\])", payload)
    if match is None:
        return []
    try:
        parsed = ast.literal_eval(match.group(1))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if not isinstance(parsed, (list, tuple)):
        return []
    return [item if isinstance(item, str) else str(item) for item in parsed]


def _apply_probe_output(result: dict, stdout: str, stderr: str, returncode: int) -> None:
    """Fill ``result`` in place from the probe process output.

    The first stdout line containing ``GPU_PROBE_OK:`` or ``GPU_PROBE_FAIL:``
    decides the outcome. When neither marker is present the status becomes
    ``"failed"`` for a non-zero return code (otherwise stays ``"unknown"``)
    and the error is the first 500 characters of stderr, or
    ``"No probe output"`` when stderr is empty.
    """
    for line in stdout.splitlines():
        if "GPU_PROBE_OK:" in line:
            # Expected shape: GPU_PROBE_OK: device_type=OPTIX devices=[...]
            payload = line.split("GPU_PROBE_OK:", 1)[1].strip()
            result["status"] = "ok"
            for token in payload.split():
                if token.startswith("device_type="):
                    result["device_type"] = token.split("=", 1)[1]
                    break
            result["devices"] = _parse_probe_devices(payload)
            return
        if "GPU_PROBE_FAIL:" in line:
            result["status"] = "failed"
            result["error"] = line.split("GPU_PROBE_FAIL:", 1)[1].strip()
            return

    # No marker line at all: fall back to the exit code and stderr.
    result["status"] = "failed" if returncode != 0 else "unknown"
    result["error"] = stderr[:500] if stderr else "No probe output"


@celery_app.task(name="app.tasks.gpu_tasks.probe_gpu", queue="thumbnail_rendering")
def probe_gpu() -> dict:
    """Run the Blender GPU probe on the render-worker and persist the outcome.

    Launches Blender in background mode with ``/render-scripts/gpu_probe.py``
    and interprets its stdout markers. The result is always written to
    ``system_settings`` via ``_save_probe_result`` before being returned.

    Returns:
        A dict with keys ``status`` (``"ok"``, ``"failed"``, ``"error"`` or
        ``"unknown"``), ``device_type``, ``devices``, ``error`` and
        ``probed_at`` (UTC ISO-8601 timestamp taken when the task started).

    Raises:
        Whatever ``_save_probe_result`` raises; probe failures themselves are
        reported through ``status``/``error`` instead of being raised.
    """
    import subprocess
    from datetime import datetime, timezone
    from pathlib import Path

    from app.services.render_blender import find_blender

    timeout_s = 60
    result = {
        "status": "unknown",
        "device_type": None,
        "devices": [],
        "error": None,
        "probed_at": datetime.now(timezone.utc).isoformat(),
    }

    try:
        blender_bin = find_blender()
        probe_script = Path("/render-scripts/gpu_probe.py")

        if not blender_bin:
            result["status"] = "error"
            result["error"] = "Blender binary not found — check BLENDER_BIN env or PATH"
        elif not probe_script.exists():
            result["status"] = "error"
            result["error"] = f"gpu_probe.py not found at {probe_script}"
        else:
            proc = subprocess.run(
                [blender_bin, "--background", "--python", str(probe_script)],
                capture_output=True, text=True, timeout=timeout_s,
            )
            _apply_probe_output(result, proc.stdout, proc.stderr, proc.returncode)
    except subprocess.TimeoutExpired:
        result["status"] = "error"
        result["error"] = f"GPU probe timed out after {timeout_s}s"
    except Exception as exc:
        # Task boundary: the failure is recorded in the stored result rather
        # than raised, but the traceback is kept in the worker log.
        logger.exception("GPU probe raised an unexpected error")
        result["status"] = "error"
        result["error"] = str(exc)

    if result["status"] != "ok":
        logger.warning(
            "GPU probe finished with status=%s error=%s",
            result["status"], result["error"],
        )

    # Save to system_settings
    _save_probe_result(result)
    return result
|
||||
|
||||
|
||||
def _save_probe_result(result: dict) -> None:
    """Upsert the JSON-encoded probe result into ``system_settings``.

    The row is stored under the key ``gpu_probe_last_result``; an existing row
    with that key has its value overwritten. A throwaway engine is built from
    the application's database URL with the ``+asyncpg`` driver suffix removed
    and is disposed afterwards, even when the write fails.

    Args:
        result: Probe result dict; must be JSON-serializable.
    """
    import json

    from sqlalchemy import create_engine, text

    from app.config import settings as app_settings

    upsert_stmt = text("""
        INSERT INTO system_settings (key, value) VALUES (:key, :value)
        ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
    """)

    engine = create_engine(app_settings.database_url.replace("+asyncpg", ""))
    try:
        with engine.connect() as connection:
            connection.execute(
                upsert_stmt,
                {"key": "gpu_probe_last_result", "value": json.dumps(result)},
            )
            connection.commit()
    finally:
        # Release pooled connections; the engine is not reused across calls.
        engine.dispose()
|
||||
Reference in New Issue
Block a user