feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode
- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
after GPU activation; exit 2 when CYCLES_DEVICE=gpu and CPU fallback
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
(device_used, compute_type, gpu_fallback); handle exit code 2 as
explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
saves result to system_settings.gpu_probe_last_result; beat every 30min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ from app.models.order_line import OrderLine
|
||||
from app.models.product import Product
|
||||
from app.models.user import User
|
||||
from app.models.worker_config import WorkerConfig
|
||||
from app.models.system_setting import SystemSetting
|
||||
from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
|
||||
|
||||
router = APIRouter(prefix="/worker", tags=["worker"])
|
||||
@@ -456,6 +457,34 @@ async def scale_workers(
|
||||
return {"service": body.service, "count": body.count, "status": "scaling"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GPU probe
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/probe/gpu", status_code=http_status.HTTP_202_ACCEPTED)
|
||||
async def trigger_gpu_probe(current_user: User = Depends(require_admin)):
|
||||
"""Queue a GPU probe task on the render-worker."""
|
||||
from app.tasks.gpu_tasks import probe_gpu
|
||||
result = probe_gpu.delay()
|
||||
return {"task_id": str(result.id), "queued": True}
|
||||
|
||||
|
||||
@router.get("/probe/gpu/result")
|
||||
async def get_gpu_probe_result(
|
||||
current_user: User = Depends(require_admin),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
):
|
||||
"""Return the last GPU probe result from system_settings."""
|
||||
import json
|
||||
row = await db.execute(
|
||||
select(SystemSetting).where(SystemSetting.key == "gpu_probe_last_result")
|
||||
)
|
||||
setting = row.scalar_one_or_none()
|
||||
if not setting:
|
||||
return {"status": "unknown", "message": "No probe run yet. Click Run GPU Check."}
|
||||
return json.loads(setting.value)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Render health check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -224,9 +224,30 @@ def render_still(
|
||||
|
||||
    # Keep only our own tagged lines for the render_log payload.
    log_lines = [l for l in stdout_lines if "[blender_render]" in l]

    # Parse RENDER_DEVICE_USED token from stdout
    # (emitted by the Blender-side render scripts after GPU activation, e.g.
    #  "RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type=OPTIX").
    device_used = "unknown"
    compute_type = "unknown"
    gpu_fallback = False
    for line in stdout_lines:
        if line.startswith("RENDER_DEVICE_USED:"):
            parts = line.split()
            for part in parts:
                if part.startswith("device="):
                    device_used = part.split("=", 1)[1]
                elif part.startswith("compute_type="):
                    compute_type = part.split("=", 1)[1]
            # device=CPU in this token means the Cycles GPU path was not taken.
            gpu_fallback = (device_used == "CPU")
            break

    # EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable.
    # If EEVEE fails, it is a hard failure — no silent retry.

    # Exit code 2 is the strict-mode contract with the Blender scripts:
    # CYCLES_DEVICE=gpu was set but the render fell back to CPU.
    if returncode == 2:
        raise RuntimeError(
            "GPU required but render used CPU — strict mode (CYCLES_DEVICE=gpu). "
            "Check that the render-worker has a visible NVIDIA GPU."
        )

    if returncode != 0:
        # Capture log tails for the failure message / diagnostics.
        stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else ""
        stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else ""
|
||||
@@ -246,6 +267,9 @@ def render_still(
|
||||
"output_size_bytes": output_path.stat().st_size if output_path.exists() else 0,
|
||||
"parts_count": 0,
|
||||
"engine_used": engine_used,
|
||||
"device_used": device_used,
|
||||
"compute_type": compute_type,
|
||||
"gpu_fallback": gpu_fallback,
|
||||
"log_lines": log_lines,
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ celery_app = Celery(
|
||||
"app.domains.products.tasks",
|
||||
"app.domains.imports.tasks",
|
||||
"app.domains.materials.tasks",
|
||||
"app.tasks.gpu_tasks",
|
||||
],
|
||||
)
|
||||
|
||||
@@ -56,5 +57,9 @@ celery_app.conf.update(
|
||||
"task": "app.tasks.beat_tasks.apply_worker_concurrency",
|
||||
"schedule": 300.0, # every 5 minutes
|
||||
},
|
||||
"probe-gpu-every-30m": {
|
||||
"task": "app.tasks.gpu_tasks.probe_gpu",
|
||||
"schedule": 1800.0, # every 30 minutes
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
"""Celery task for GPU health probe."""
|
||||
import logging
|
||||
from app.tasks.celery_app import celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@celery_app.task(name="app.tasks.gpu_tasks.probe_gpu", queue="thumbnail_rendering")
|
||||
def probe_gpu() -> dict:
|
||||
"""Run Blender GPU probe on the render-worker. Stores result in system_settings."""
|
||||
import subprocess
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from app.services.render_blender import find_blender
|
||||
|
||||
result = {
|
||||
"status": "unknown",
|
||||
"device_type": None,
|
||||
"devices": [],
|
||||
"error": None,
|
||||
"probed_at": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
|
||||
try:
|
||||
blender_bin = find_blender()
|
||||
if not blender_bin:
|
||||
result["status"] = "error"
|
||||
result["error"] = "Blender binary not found — check BLENDER_BIN env or PATH"
|
||||
else:
|
||||
probe_script = Path("/render-scripts/gpu_probe.py")
|
||||
|
||||
if not probe_script.exists():
|
||||
result["status"] = "error"
|
||||
result["error"] = f"gpu_probe.py not found at {probe_script}"
|
||||
else:
|
||||
proc = subprocess.run(
|
||||
[blender_bin, "--background", "--python", str(probe_script)],
|
||||
capture_output=True, text=True, timeout=60,
|
||||
)
|
||||
|
||||
for line in proc.stdout.splitlines():
|
||||
if "GPU_PROBE_OK:" in line:
|
||||
result["status"] = "ok"
|
||||
# Parse device_type and devices from line:
|
||||
# GPU_PROBE_OK: device_type=OPTIX devices=[...]
|
||||
parts = line.split("GPU_PROBE_OK:", 1)[1].strip()
|
||||
for p in parts.split():
|
||||
if p.startswith("device_type="):
|
||||
result["device_type"] = p.split("=", 1)[1]
|
||||
break
|
||||
elif "GPU_PROBE_FAIL:" in line:
|
||||
result["status"] = "failed"
|
||||
result["error"] = line.split("GPU_PROBE_FAIL:", 1)[1].strip()
|
||||
break
|
||||
|
||||
if result["status"] == "unknown":
|
||||
result["status"] = "failed" if proc.returncode != 0 else "unknown"
|
||||
result["error"] = proc.stderr[:500] if proc.stderr else "No probe output"
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
result["status"] = "error"
|
||||
result["error"] = "GPU probe timed out after 60s"
|
||||
except Exception as exc:
|
||||
result["status"] = "error"
|
||||
result["error"] = str(exc)
|
||||
|
||||
# Save to system_settings
|
||||
_save_probe_result(result)
|
||||
return result
|
||||
|
||||
|
||||
def _save_probe_result(result: dict) -> None:
    """Upsert the probe result JSON into system_settings.

    Celery tasks run outside the async request stack, so a short-lived
    synchronous engine is created from the (asyncpg) database URL and
    disposed immediately after the write.
    """
    import json
    from sqlalchemy import create_engine, text
    from app.config import settings as app_settings

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    eng = create_engine(sync_url)
    try:
        upsert = text("""
                INSERT INTO system_settings (key, value) VALUES (:key, :value)
                ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
            """)
        params = {"key": "gpu_probe_last_result", "value": json.dumps(result)}
        with eng.connect() as conn:
            conn.execute(upsert, params)
            conn.commit()
    finally:
        eng.dispose()
|
||||
@@ -133,6 +133,7 @@ services:
|
||||
- UPLOAD_DIR=/app/uploads
|
||||
- BLENDER_BIN=/opt/blender/blender
|
||||
- RENDER_SCRIPTS_DIR=/render-scripts
|
||||
- CYCLES_DEVICE=${CYCLES_DEVICE:-auto}
|
||||
- MINIO_URL=${MINIO_URL:-http://minio:9000}
|
||||
- MINIO_USER=${MINIO_USER:-minioadmin}
|
||||
- MINIO_PASSWORD=${MINIO_PASSWORD:-minioadmin}
|
||||
|
||||
@@ -64,5 +64,62 @@ def check_version():
|
||||
print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})")
|
||||
|
||||
|
||||
def check_gpu():
    """Run the Blender GPU probe script and report results.

    Respects CYCLES_DEVICE env var:
      - "cpu"  -> skip probe entirely
      - "gpu"  -> require GPU; abort startup if none found
      - "auto" (default) -> warn if no GPU found, but continue
    """
    cycles_device = os.environ.get("CYCLES_DEVICE", "auto").lower()
    if cycles_device == "cpu":
        print("[check_version] GPU check skipped (CYCLES_DEVICE=cpu)", flush=True)
        return

    blender_bin = find_blender()
    # Fix: without this guard a missing Blender binary raised inside
    # subprocess.run, was swallowed by the generic handler below, and strict
    # mode (CYCLES_DEVICE=gpu) silently started without any GPU enforcement.
    if not blender_bin:
        if cycles_device == "gpu":
            print(
                "[check_version] ERROR: Blender binary not found — "
                "CYCLES_DEVICE=gpu requires the GPU probe; aborting startup",
                flush=True,
            )
            sys.exit(1)
        print(
            "[check_version] WARNING: Blender binary not found — GPU check skipped",
            flush=True,
        )
        return

    probe_script = Path("/render-scripts/gpu_probe.py")
    if not probe_script.exists():
        print(
            f"[check_version] WARNING: gpu_probe.py not found at {probe_script}",
            flush=True,
        )
        return

    try:
        result = subprocess.run(
            [blender_bin, "--background", "--python", str(probe_script)],
            capture_output=True, text=True, timeout=45,
        )
        if result.returncode == 0:
            # Surface the GPU_PROBE_OK line (device type + device list).
            for line in result.stdout.splitlines():
                if "GPU_PROBE_OK" in line:
                    print(f"[check_version] {line}", flush=True)
                    break
        else:
            msg = "No GPU detected — renders will use CPU"
            for line in result.stdout.splitlines():
                if "GPU_PROBE_FAIL" in line:
                    msg = line
                    break
            if cycles_device == "gpu":
                print(f"[check_version] ERROR: {msg}", flush=True)
                print(
                    "[check_version] CYCLES_DEVICE=gpu requires GPU — aborting startup",
                    flush=True,
                )
                sys.exit(1)
            else:
                print(
                    f"[check_version] WARNING: {msg} (set CYCLES_DEVICE=gpu to enforce)",
                    flush=True,
                )
    except subprocess.TimeoutExpired:
        print("[check_version] WARNING: GPU probe timed out after 45s", flush=True)
    except Exception as e:
        print(f"[check_version] WARNING: GPU probe failed: {e}", flush=True)


if __name__ == "__main__":
    check_version()
    check_gpu()
|
||||
|
||||
@@ -718,9 +718,15 @@ if engine != "eevee": # covers both explicit Cycles and EEVEE-fallback
|
||||
        # Re-ensure preferences are set (engine activation may have reset them)
        _activate_gpu()
        print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True)
        # Machine-readable token parsed by render_blender.py on the API side
        # into render_log (device_used / compute_type / gpu_fallback).
        print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={gpu_type_found}", flush=True)
    else:
        scene.cycles.device = 'CPU'
        print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True)
        print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
        import os as _os
        # Strict mode: exit 2 lets the caller distinguish "GPU required but
        # absent" from a generic render failure (exit 1).
        if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
            print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
            sys.exit(2)

    scene.cycles.samples = samples
    scene.cycles.use_denoising = True
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
"""Blender Python script: GPU compute device probe.
|
||||
|
||||
Run via:
|
||||
blender --background --python gpu_probe.py
|
||||
|
||||
Exit codes:
|
||||
0 — GPU found, prints GPU_PROBE_OK line
|
||||
1 — No GPU found or error, prints GPU_PROBE_FAIL line
|
||||
"""
|
||||
import sys
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
import bpy
|
||||
|
||||
cprefs = bpy.context.preferences.addons['cycles'].preferences
|
||||
for device_type in ('OPTIX', 'CUDA', 'HIP', 'ONEAPI'):
|
||||
try:
|
||||
cprefs.compute_device_type = device_type
|
||||
cprefs.get_devices()
|
||||
gpu_devices = [d for d in cprefs.devices if d.type != 'CPU']
|
||||
if gpu_devices:
|
||||
device_names = [(d.name, d.type) for d in gpu_devices]
|
||||
print(
|
||||
f"GPU_PROBE_OK: device_type={device_type} devices={device_names}",
|
||||
flush=True,
|
||||
)
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
print(f"GPU_PROBE: {device_type} not available: {e}", flush=True)
|
||||
|
||||
print("GPU_PROBE_FAIL: no GPU compute device found", flush=True)
|
||||
sys.exit(1)
|
||||
|
||||
except SystemExit:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"GPU_PROBE_FAIL: exception during probe: {e}", flush=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
main()
|
||||
@@ -728,9 +728,16 @@ def main():
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
if not gpu_found:
|
||||
if gpu_found:
|
||||
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
|
||||
else:
|
||||
scene.cycles.device = 'CPU'
|
||||
print("[still_render] WARNING: GPU not found — falling back to CPU")
|
||||
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
|
||||
import os as _os
|
||||
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
|
||||
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
|
||||
sys.exit(2)
|
||||
|
||||
# ── Render settings ──────────────────────────────────────────────────────
|
||||
scene.render.resolution_x = width
|
||||
|
||||
@@ -682,9 +682,16 @@ def main():
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
if not gpu_found:
|
||||
if gpu_found:
|
||||
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
|
||||
else:
|
||||
scene.cycles.device = 'CPU'
|
||||
print("[turntable_render] WARNING: GPU not found — falling back to CPU")
|
||||
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
|
||||
import os as _os
|
||||
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
|
||||
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
|
||||
sys.exit(2)
|
||||
|
||||
# ── Render settings ──────────────────────────────────────────────────────
|
||||
scene.render.resolution_x = width
|
||||
|
||||
Reference in New Issue
Block a user