feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode

- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
  exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
  RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
  after GPU activation; exit 2 when CYCLES_DEVICE=gpu and CPU fallback
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
  (device_used, compute_type, gpu_fallback); handle exit code 2 as
  explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
  CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
  saves result to system_settings.gpu_probe_last_result; beat every 30min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 20:57:36 +01:00
parent c6556434d6
commit 34f89cc225
10 changed files with 269 additions and 2 deletions
+29
View File
@@ -16,6 +16,7 @@ from app.models.order_line import OrderLine
from app.models.product import Product
from app.models.user import User
from app.models.worker_config import WorkerConfig
from app.models.system_setting import SystemSetting
from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
router = APIRouter(prefix="/worker", tags=["worker"])
@@ -456,6 +457,34 @@ async def scale_workers(
return {"service": body.service, "count": body.count, "status": "scaling"}
# ---------------------------------------------------------------------------
# GPU probe
# ---------------------------------------------------------------------------
@router.post("/probe/gpu", status_code=http_status.HTTP_202_ACCEPTED)
async def trigger_gpu_probe(current_user: User = Depends(require_admin)):
    """Queue a GPU probe task on the render-worker."""
    # Imported lazily so the API module does not pull in Celery at import time.
    from app.tasks.gpu_tasks import probe_gpu

    async_result = probe_gpu.delay()
    return {"task_id": str(async_result.id), "queued": True}
@router.get("/probe/gpu/result")
async def get_gpu_probe_result(
    current_user: User = Depends(require_admin),
    db: AsyncSession = Depends(get_db),
):
    """Return the last GPU probe result from system_settings."""
    import json

    stmt = select(SystemSetting).where(SystemSetting.key == "gpu_probe_last_result")
    setting = (await db.execute(stmt)).scalar_one_or_none()
    if setting is None:
        # No probe has ever been stored; tell the UI to trigger one.
        return {"status": "unknown", "message": "No probe run yet. Click Run GPU Check."}
    return json.loads(setting.value)
# ---------------------------------------------------------------------------
# Render health check
# ---------------------------------------------------------------------------
+24
View File
@@ -224,9 +224,30 @@ def render_still(
log_lines = [l for l in stdout_lines if "[blender_render]" in l]
# Parse RENDER_DEVICE_USED token from stdout
device_used = "unknown"
compute_type = "unknown"
gpu_fallback = False
for line in stdout_lines:
if line.startswith("RENDER_DEVICE_USED:"):
parts = line.split()
for part in parts:
if part.startswith("device="):
device_used = part.split("=", 1)[1]
elif part.startswith("compute_type="):
compute_type = part.split("=", 1)[1]
gpu_fallback = (device_used == "CPU")
break
# EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable.
# If EEVEE fails, it is a hard failure — no silent retry.
if returncode == 2:
raise RuntimeError(
"GPU required but render used CPU — strict mode (CYCLES_DEVICE=gpu). "
"Check that the render-worker has a visible NVIDIA GPU."
)
if returncode != 0:
stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else ""
stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else ""
@@ -246,6 +267,9 @@ def render_still(
"output_size_bytes": output_path.stat().st_size if output_path.exists() else 0,
"parts_count": 0,
"engine_used": engine_used,
"device_used": device_used,
"compute_type": compute_type,
"gpu_fallback": gpu_fallback,
"log_lines": log_lines,
}
+5
View File
@@ -18,6 +18,7 @@ celery_app = Celery(
"app.domains.products.tasks",
"app.domains.imports.tasks",
"app.domains.materials.tasks",
"app.tasks.gpu_tasks",
],
)
@@ -56,5 +57,9 @@ celery_app.conf.update(
"task": "app.tasks.beat_tasks.apply_worker_concurrency",
"schedule": 300.0, # every 5 minutes
},
"probe-gpu-every-30m": {
"task": "app.tasks.gpu_tasks.probe_gpu",
"schedule": 1800.0, # every 30 minutes
},
},
)
+88
View File
@@ -0,0 +1,88 @@
"""Celery task for GPU health probe."""
import logging
from app.tasks.celery_app import celery_app
logger = logging.getLogger(__name__)
@celery_app.task(name="app.tasks.gpu_tasks.probe_gpu", queue="thumbnail_rendering")
def probe_gpu() -> dict:
    """Run the Blender GPU probe on the render-worker.

    Launches headless Blender with ``/render-scripts/gpu_probe.py`` and
    parses the ``GPU_PROBE_OK`` / ``GPU_PROBE_FAIL`` token from stdout.
    The result dict is persisted to ``system_settings`` (key
    ``gpu_probe_last_result``) and also returned for the Celery backend.

    Returns:
        dict with keys: status ("ok"|"failed"|"error"|"unknown"),
        device_type, devices, error, probed_at (UTC ISO timestamp).
    """
    import subprocess
    from datetime import datetime, timezone
    from pathlib import Path

    from app.services.render_blender import find_blender

    result = {
        "status": "unknown",
        "device_type": None,
        # NOTE(review): never populated by the parser even though the probe
        # line carries a devices=[...] segment — TODO confirm output format.
        "devices": [],
        "error": None,
        "probed_at": datetime.now(timezone.utc).isoformat(),
    }
    try:
        blender_bin = find_blender()
        probe_script = Path("/render-scripts/gpu_probe.py")
        if not blender_bin:
            result["status"] = "error"
            result["error"] = "Blender binary not found — check BLENDER_BIN env or PATH"
        elif not probe_script.exists():
            result["status"] = "error"
            result["error"] = f"gpu_probe.py not found at {probe_script}"
        else:
            proc = subprocess.run(
                [blender_bin, "--background", "--python", str(probe_script)],
                capture_output=True, text=True, timeout=60,
            )
            _parse_probe_output(proc, result)
    except subprocess.TimeoutExpired:
        result["status"] = "error"
        result["error"] = "GPU probe timed out after 60s"
    except Exception as exc:  # best-effort probe: never crash the worker
        result["status"] = "error"
        result["error"] = str(exc)

    # Save to system_settings so the Admin UI can fetch the latest result.
    _save_probe_result(result)
    return result


def _parse_probe_output(proc, result: dict) -> None:
    """Fill *result* in place from the probe subprocess output.

    Scans stdout for the first GPU_PROBE_OK / GPU_PROBE_FAIL token line;
    falls back to return code + stderr when no token is found.
    """
    for line in proc.stdout.splitlines():
        if "GPU_PROBE_OK:" in line:
            result["status"] = "ok"
            # Token format: GPU_PROBE_OK: device_type=OPTIX devices=[...]
            payload = line.split("GPU_PROBE_OK:", 1)[1].strip()
            for token in payload.split():
                if token.startswith("device_type="):
                    result["device_type"] = token.split("=", 1)[1]
            break
        if "GPU_PROBE_FAIL:" in line:
            result["status"] = "failed"
            result["error"] = line.split("GPU_PROBE_FAIL:", 1)[1].strip()
            break
    if result["status"] == "unknown":
        # No token seen: a non-zero exit means the probe itself failed.
        result["status"] = "failed" if proc.returncode != 0 else "unknown"
        result["error"] = proc.stderr[:500] if proc.stderr else "No probe output"
def _save_probe_result(result: dict) -> None:
    """Persist *result* as JSON under system_settings.gpu_probe_last_result.

    Celery tasks run outside the app's async event loop, so a short-lived
    synchronous engine is created from the configured URL with the asyncpg
    driver suffix stripped. The upsert keeps exactly one row per key.
    """
    import json

    from sqlalchemy import create_engine, text

    from app.config import settings as app_settings

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)
    try:
        # engine.begin() commits on success and rolls back on error,
        # replacing the manual connect()/commit() pair.
        with engine.begin() as conn:
            conn.execute(text("""
                INSERT INTO system_settings (key, value) VALUES (:key, :value)
                ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
            """), {"key": "gpu_probe_last_result", "value": json.dumps(result)})
    finally:
        engine.dispose()