feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode

- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
  exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
  RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
  after GPU activation; exit 2 when CYCLES_DEVICE=gpu and CPU fallback
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
  (device_used, compute_type, gpu_fallback); handle exit code 2 as
  explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
  CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
  saves result to system_settings.gpu_probe_last_result; beat every 30min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 20:57:36 +01:00
parent c6556434d6
commit 34f89cc225
10 changed files with 269 additions and 2 deletions
+29
View File
@@ -16,6 +16,7 @@ from app.models.order_line import OrderLine
from app.models.product import Product from app.models.product import Product
from app.models.user import User from app.models.user import User
from app.models.worker_config import WorkerConfig from app.models.worker_config import WorkerConfig
from app.models.system_setting import SystemSetting
from app.utils.auth import get_current_user, require_admin_or_pm, require_admin from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
router = APIRouter(prefix="/worker", tags=["worker"]) router = APIRouter(prefix="/worker", tags=["worker"])
@@ -456,6 +457,34 @@ async def scale_workers(
return {"service": body.service, "count": body.count, "status": "scaling"} return {"service": body.service, "count": body.count, "status": "scaling"}
# ---------------------------------------------------------------------------
# GPU probe
# ---------------------------------------------------------------------------
@router.post("/probe/gpu", status_code=http_status.HTTP_202_ACCEPTED)
async def trigger_gpu_probe(current_user: User = Depends(require_admin)):
"""Queue a GPU probe task on the render-worker."""
from app.tasks.gpu_tasks import probe_gpu
result = probe_gpu.delay()
return {"task_id": str(result.id), "queued": True}
@router.get("/probe/gpu/result")
async def get_gpu_probe_result(
current_user: User = Depends(require_admin),
db: AsyncSession = Depends(get_db),
):
"""Return the last GPU probe result from system_settings."""
import json
row = await db.execute(
select(SystemSetting).where(SystemSetting.key == "gpu_probe_last_result")
)
setting = row.scalar_one_or_none()
if not setting:
return {"status": "unknown", "message": "No probe run yet. Click Run GPU Check."}
return json.loads(setting.value)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Render health check # Render health check
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
+24
View File
@@ -224,9 +224,30 @@ def render_still(
log_lines = [l for l in stdout_lines if "[blender_render]" in l] log_lines = [l for l in stdout_lines if "[blender_render]" in l]
# Parse RENDER_DEVICE_USED token from stdout
device_used = "unknown"
compute_type = "unknown"
gpu_fallback = False
for line in stdout_lines:
if line.startswith("RENDER_DEVICE_USED:"):
parts = line.split()
for part in parts:
if part.startswith("device="):
device_used = part.split("=", 1)[1]
elif part.startswith("compute_type="):
compute_type = part.split("=", 1)[1]
gpu_fallback = (device_used == "CPU")
break
# EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable. # EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable.
# If EEVEE fails, it is a hard failure — no silent retry. # If EEVEE fails, it is a hard failure — no silent retry.
if returncode == 2:
raise RuntimeError(
"GPU required but render used CPU — strict mode (CYCLES_DEVICE=gpu). "
"Check that the render-worker has a visible NVIDIA GPU."
)
if returncode != 0: if returncode != 0:
stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else "" stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else ""
stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else "" stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else ""
@@ -246,6 +267,9 @@ def render_still(
"output_size_bytes": output_path.stat().st_size if output_path.exists() else 0, "output_size_bytes": output_path.stat().st_size if output_path.exists() else 0,
"parts_count": 0, "parts_count": 0,
"engine_used": engine_used, "engine_used": engine_used,
"device_used": device_used,
"compute_type": compute_type,
"gpu_fallback": gpu_fallback,
"log_lines": log_lines, "log_lines": log_lines,
} }
+5
View File
@@ -18,6 +18,7 @@ celery_app = Celery(
"app.domains.products.tasks", "app.domains.products.tasks",
"app.domains.imports.tasks", "app.domains.imports.tasks",
"app.domains.materials.tasks", "app.domains.materials.tasks",
"app.tasks.gpu_tasks",
], ],
) )
@@ -56,5 +57,9 @@ celery_app.conf.update(
"task": "app.tasks.beat_tasks.apply_worker_concurrency", "task": "app.tasks.beat_tasks.apply_worker_concurrency",
"schedule": 300.0, # every 5 minutes "schedule": 300.0, # every 5 minutes
}, },
"probe-gpu-every-30m": {
"task": "app.tasks.gpu_tasks.probe_gpu",
"schedule": 1800.0, # every 30 minutes
},
}, },
) )
+88
View File
@@ -0,0 +1,88 @@
"""Celery task for GPU health probe."""
import logging
from app.tasks.celery_app import celery_app
logger = logging.getLogger(__name__)
@celery_app.task(name="app.tasks.gpu_tasks.probe_gpu", queue="thumbnail_rendering")
def probe_gpu() -> dict:
    """Run the Blender GPU probe on the render-worker.

    Executes /render-scripts/gpu_probe.py in background Blender, parses the
    GPU_PROBE_OK / GPU_PROBE_FAIL marker lines from its stdout, and persists
    the result to system_settings under "gpu_probe_last_result".

    Returns:
        dict with keys: status ("ok" | "failed" | "error" | "unknown"),
        device_type, devices, error, probed_at (UTC ISO timestamp).
    """
    import ast
    import subprocess
    from datetime import datetime, timezone
    from pathlib import Path

    from app.services.render_blender import find_blender

    result = {
        "status": "unknown",
        "device_type": None,
        "devices": [],
        "error": None,
        "probed_at": datetime.now(timezone.utc).isoformat(),
    }
    try:
        blender_bin = find_blender()
        if not blender_bin:
            result["status"] = "error"
            result["error"] = "Blender binary not found — check BLENDER_BIN env or PATH"
        else:
            probe_script = Path("/render-scripts/gpu_probe.py")
            if not probe_script.exists():
                result["status"] = "error"
                result["error"] = f"gpu_probe.py not found at {probe_script}"
            else:
                proc = subprocess.run(
                    [blender_bin, "--background", "--python", str(probe_script)],
                    capture_output=True, text=True, timeout=60,
                )
                for line in proc.stdout.splitlines():
                    if "GPU_PROBE_OK:" in line:
                        result["status"] = "ok"
                        # Line format:
                        #   GPU_PROBE_OK: device_type=OPTIX devices=[('name', 'TYPE'), ...]
                        payload = line.split("GPU_PROBE_OK:", 1)[1].strip()
                        # The devices repr contains spaces, so split it off
                        # before whitespace-tokenizing the rest of the payload.
                        head, sep, devices_repr = payload.partition("devices=")
                        if sep:
                            try:
                                result["devices"] = ast.literal_eval(devices_repr.strip())
                            except (ValueError, SyntaxError):
                                pass  # leave [] if the repr is unparseable
                        for token in head.split():
                            if token.startswith("device_type="):
                                result["device_type"] = token.split("=", 1)[1]
                        break
                    if "GPU_PROBE_FAIL:" in line:
                        result["status"] = "failed"
                        result["error"] = line.split("GPU_PROBE_FAIL:", 1)[1].strip()
                        break
                if result["status"] == "unknown":
                    # No marker line at all — fall back to the exit code.
                    result["status"] = "failed" if proc.returncode != 0 else "unknown"
                    result["error"] = proc.stderr[:500] if proc.stderr else "No probe output"
    except subprocess.TimeoutExpired:
        result["status"] = "error"
        result["error"] = "GPU probe timed out after 60s"
    except Exception as exc:  # defensive: any failure still yields a stored result
        result["status"] = "error"
        result["error"] = str(exc)

    # Save to system_settings so the Admin UI can display the last result.
    _save_probe_result(result)
    return result
def _save_probe_result(result: dict) -> None:
    """Upsert the probe result JSON into system_settings.

    Celery tasks run outside the async request stack, so a short-lived
    synchronous engine is built from the asyncpg URL and disposed after use.
    """
    import json
    from sqlalchemy import create_engine, text
    from app.config import settings as app_settings

    # Strip the async driver suffix to obtain a sync driver URL.
    sync_url = app_settings.database_url.replace("+asyncpg", "")
    eng = create_engine(sync_url)
    try:
        # engine.begin() commits on success and rolls back on error,
        # replacing the manual connect()/commit() pattern.
        with eng.begin() as conn:
            conn.execute(text("""
                INSERT INTO system_settings (key, value) VALUES (:key, :value)
                ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
            """), {"key": "gpu_probe_last_result", "value": json.dumps(result)})
    finally:
        eng.dispose()
+1
View File
@@ -133,6 +133,7 @@ services:
- UPLOAD_DIR=/app/uploads - UPLOAD_DIR=/app/uploads
- BLENDER_BIN=/opt/blender/blender - BLENDER_BIN=/opt/blender/blender
- RENDER_SCRIPTS_DIR=/render-scripts - RENDER_SCRIPTS_DIR=/render-scripts
- CYCLES_DEVICE=${CYCLES_DEVICE:-auto}
- MINIO_URL=${MINIO_URL:-http://minio:9000} - MINIO_URL=${MINIO_URL:-http://minio:9000}
- MINIO_USER=${MINIO_USER:-minioadmin} - MINIO_USER=${MINIO_USER:-minioadmin}
- MINIO_PASSWORD=${MINIO_PASSWORD:-minioadmin} - MINIO_PASSWORD=${MINIO_PASSWORD:-minioadmin}
+57
View File
@@ -64,5 +64,62 @@ def check_version():
print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})") print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})")
def check_gpu():
    """Run the Blender GPU probe script and report results.

    Respects CYCLES_DEVICE env var:
    - "cpu" → skip probe entirely
    - "gpu" → require GPU; abort startup if none found
    - "auto" (default) → warn if no GPU found, but continue
    """
    cycles_device = os.environ.get("CYCLES_DEVICE", "auto").lower()
    strict = cycles_device == "gpu"
    if cycles_device == "cpu":
        print("[check_version] GPU check skipped (CYCLES_DEVICE=cpu)", flush=True)
        return
    blender_bin = find_blender()
    if not blender_bin:
        # Without a Blender binary the probe cannot run at all. Previously this
        # raised a TypeError inside subprocess.run that the broad except below
        # downgraded to a WARNING — silently bypassing strict mode.
        if strict:
            print("[check_version] ERROR: Blender binary not found — cannot run GPU probe", flush=True)
            print(
                "[check_version] CYCLES_DEVICE=gpu requires GPU — aborting startup",
                flush=True,
            )
            sys.exit(1)
        print("[check_version] WARNING: Blender binary not found — GPU probe skipped", flush=True)
        return
    probe_script = Path("/render-scripts/gpu_probe.py")
    if not probe_script.exists():
        print(
            f"[check_version] WARNING: gpu_probe.py not found at {probe_script}",
            flush=True,
        )
        return
    try:
        result = subprocess.run(
            [blender_bin, "--background", "--python", str(probe_script)],
            capture_output=True, text=True, timeout=45,
        )
        if result.returncode == 0:
            # Surface the probe's own OK line for the startup log.
            for line in result.stdout.splitlines():
                if "GPU_PROBE_OK" in line:
                    print(f"[check_version] {line}", flush=True)
                    break
        else:
            msg = "No GPU detected — renders will use CPU"
            for line in result.stdout.splitlines():
                if "GPU_PROBE_FAIL" in line:
                    msg = line
                    break
            if strict:
                print(f"[check_version] ERROR: {msg}", flush=True)
                print(
                    "[check_version] CYCLES_DEVICE=gpu requires GPU — aborting startup",
                    flush=True,
                )
                sys.exit(1)
            else:
                print(
                    f"[check_version] WARNING: {msg} (set CYCLES_DEVICE=gpu to enforce)",
                    flush=True,
                )
    except subprocess.TimeoutExpired:
        print("[check_version] WARNING: GPU probe timed out after 45s", flush=True)
    except Exception as e:
        print(f"[check_version] WARNING: GPU probe failed: {e}", flush=True)
if __name__ == "__main__": if __name__ == "__main__":
check_version() check_version()
check_gpu()
+6
View File
@@ -718,9 +718,15 @@ if engine != "eevee": # covers both explicit Cycles and EEVEE-fallback
# Re-ensure preferences are set (engine activation may have reset them) # Re-ensure preferences are set (engine activation may have reset them)
_activate_gpu() _activate_gpu()
print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True) print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True)
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={gpu_type_found}", flush=True)
else: else:
scene.cycles.device = 'CPU' scene.cycles.device = 'CPU'
print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True) print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True)
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
import os as _os
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
sys.exit(2)
scene.cycles.samples = samples scene.cycles.samples = samples
scene.cycles.use_denoising = True scene.cycles.use_denoising = True
+43
View File
@@ -0,0 +1,43 @@
"""Blender Python script: GPU compute device probe.
Run via:
blender --background --python gpu_probe.py
Exit codes:
0 — GPU found, prints GPU_PROBE_OK line
1 — No GPU found or error, prints GPU_PROBE_FAIL line
"""
import sys
def main():
    """Probe Cycles compute backends; print a marker line and exit 0/1."""
    try:
        import bpy

        prefs = bpy.context.preferences.addons['cycles'].preferences
        for backend in ('OPTIX', 'CUDA', 'HIP', 'ONEAPI'):
            try:
                prefs.compute_device_type = backend
                prefs.get_devices()
                # Collect every non-CPU device as (name, type) pairs.
                found = [(d.name, d.type) for d in prefs.devices if d.type != 'CPU']
                if found:
                    print(
                        f"GPU_PROBE_OK: device_type={backend} devices={found}",
                        flush=True,
                    )
                    sys.exit(0)
            except Exception as e:
                print(f"GPU_PROBE: {backend} not available: {e}", flush=True)
        print("GPU_PROBE_FAIL: no GPU compute device found", flush=True)
        sys.exit(1)
    except SystemExit:
        # Propagate the deliberate exit codes set above.
        raise
    except Exception as e:
        print(f"GPU_PROBE_FAIL: exception during probe: {e}", flush=True)
        sys.exit(1)
main()
+8 -1
View File
@@ -728,9 +728,16 @@ def main():
continue continue
except Exception: except Exception:
pass pass
if not gpu_found: if gpu_found:
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
else:
scene.cycles.device = 'CPU' scene.cycles.device = 'CPU'
print("[still_render] WARNING: GPU not found — falling back to CPU") print("[still_render] WARNING: GPU not found — falling back to CPU")
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
import os as _os
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
sys.exit(2)
# ── Render settings ────────────────────────────────────────────────────── # ── Render settings ──────────────────────────────────────────────────────
scene.render.resolution_x = width scene.render.resolution_x = width
+8 -1
View File
@@ -682,9 +682,16 @@ def main():
continue continue
except Exception: except Exception:
pass pass
if not gpu_found: if gpu_found:
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
else:
scene.cycles.device = 'CPU' scene.cycles.device = 'CPU'
print("[turntable_render] WARNING: GPU not found — falling back to CPU") print("[turntable_render] WARNING: GPU not found — falling back to CPU")
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
import os as _os
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
sys.exit(2)
# ── Render settings ────────────────────────────────────────────────────── # ── Render settings ──────────────────────────────────────────────────────
scene.render.resolution_x = width scene.render.resolution_x = width