feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode
- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
after GPU activation; exit 2 when CYCLES_DEVICE=gpu and the render falls back to CPU
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
(device_used, compute_type, gpu_fallback); handle exit code 2 as
explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
saves result to system_settings.gpu_probe_last_result; beat every 30min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ from app.models.order_line import OrderLine
|
|||||||
from app.models.product import Product
|
from app.models.product import Product
|
||||||
from app.models.user import User
|
from app.models.user import User
|
||||||
from app.models.worker_config import WorkerConfig
|
from app.models.worker_config import WorkerConfig
|
||||||
|
from app.models.system_setting import SystemSetting
|
||||||
from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
|
from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
|
||||||
|
|
||||||
router = APIRouter(prefix="/worker", tags=["worker"])
|
router = APIRouter(prefix="/worker", tags=["worker"])
|
||||||
@@ -456,6 +457,34 @@ async def scale_workers(
|
|||||||
return {"service": body.service, "count": body.count, "status": "scaling"}
|
return {"service": body.service, "count": body.count, "status": "scaling"}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# GPU probe
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@router.post("/probe/gpu", status_code=http_status.HTTP_202_ACCEPTED)
async def trigger_gpu_probe(current_user: User = Depends(require_admin)):
    """Enqueue the GPU probe Celery task and return its task id.

    Access is gated by the ``require_admin`` dependency. The route answers
    202 because the probe itself runs asynchronously in a Celery worker;
    this handler only queues it.
    """
    # Imported inside the handler, as the original did, so the task module is
    # only loaded when the endpoint is actually hit.
    from app.tasks.gpu_tasks import probe_gpu

    queued_task = probe_gpu.delay()
    task_id = str(queued_task.id)
    return {"task_id": task_id, "queued": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/probe/gpu/result")
async def get_gpu_probe_result(
    current_user: User = Depends(require_admin),
    db: AsyncSession = Depends(get_db),
):
    """Return the last GPU probe result stored in system_settings.

    Looks up the ``gpu_probe_last_result`` row and decodes its JSON value.

    Returns:
        The decoded probe result. When no row exists yet, or the stored
        value is missing or not valid JSON, a ``{"status": "unknown", ...}``
        placeholder is returned instead so the caller always gets a
        well-formed payload.
    """
    import json

    row = await db.execute(
        select(SystemSetting).where(SystemSetting.key == "gpu_probe_last_result")
    )
    setting = row.scalar_one_or_none()
    if not setting:
        return {"status": "unknown", "message": "No probe run yet. Click Run GPU Check."}

    try:
        return json.loads(setting.value)
    except (TypeError, ValueError):
        # TypeError: value is NULL/None; ValueError (incl. JSONDecodeError):
        # value is empty or corrupted. Neither should surface as a 500.
        return {
            "status": "unknown",
            "message": "Stored probe result is unreadable. Click Run GPU Check.",
        }
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Render health check
|
# Render health check
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -224,9 +224,30 @@ def render_still(
|
|||||||
|
|
||||||
log_lines = [l for l in stdout_lines if "[blender_render]" in l]
|
log_lines = [l for l in stdout_lines if "[blender_render]" in l]
|
||||||
|
|
||||||
|
# Parse RENDER_DEVICE_USED token from stdout
|
||||||
|
device_used = "unknown"
|
||||||
|
compute_type = "unknown"
|
||||||
|
gpu_fallback = False
|
||||||
|
for line in stdout_lines:
|
||||||
|
if line.startswith("RENDER_DEVICE_USED:"):
|
||||||
|
parts = line.split()
|
||||||
|
for part in parts:
|
||||||
|
if part.startswith("device="):
|
||||||
|
device_used = part.split("=", 1)[1]
|
||||||
|
elif part.startswith("compute_type="):
|
||||||
|
compute_type = part.split("=", 1)[1]
|
||||||
|
gpu_fallback = (device_used == "CPU")
|
||||||
|
break
|
||||||
|
|
||||||
# EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable.
|
# EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable.
|
||||||
# If EEVEE fails, it is a hard failure — no silent retry.
|
# If EEVEE fails, it is a hard failure — no silent retry.
|
||||||
|
|
||||||
|
if returncode == 2:
|
||||||
|
raise RuntimeError(
|
||||||
|
"GPU required but render used CPU — strict mode (CYCLES_DEVICE=gpu). "
|
||||||
|
"Check that the render-worker has a visible NVIDIA GPU."
|
||||||
|
)
|
||||||
|
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else ""
|
stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else ""
|
||||||
stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else ""
|
stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else ""
|
||||||
@@ -246,6 +267,9 @@ def render_still(
|
|||||||
"output_size_bytes": output_path.stat().st_size if output_path.exists() else 0,
|
"output_size_bytes": output_path.stat().st_size if output_path.exists() else 0,
|
||||||
"parts_count": 0,
|
"parts_count": 0,
|
||||||
"engine_used": engine_used,
|
"engine_used": engine_used,
|
||||||
|
"device_used": device_used,
|
||||||
|
"compute_type": compute_type,
|
||||||
|
"gpu_fallback": gpu_fallback,
|
||||||
"log_lines": log_lines,
|
"log_lines": log_lines,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ celery_app = Celery(
|
|||||||
"app.domains.products.tasks",
|
"app.domains.products.tasks",
|
||||||
"app.domains.imports.tasks",
|
"app.domains.imports.tasks",
|
||||||
"app.domains.materials.tasks",
|
"app.domains.materials.tasks",
|
||||||
|
"app.tasks.gpu_tasks",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -56,5 +57,9 @@ celery_app.conf.update(
|
|||||||
"task": "app.tasks.beat_tasks.apply_worker_concurrency",
|
"task": "app.tasks.beat_tasks.apply_worker_concurrency",
|
||||||
"schedule": 300.0, # every 5 minutes
|
"schedule": 300.0, # every 5 minutes
|
||||||
},
|
},
|
||||||
|
"probe-gpu-every-30m": {
|
||||||
|
"task": "app.tasks.gpu_tasks.probe_gpu",
|
||||||
|
"schedule": 1800.0, # every 30 minutes
|
||||||
|
},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -0,0 +1,88 @@
|
|||||||
|
"""Celery task for GPU health probe."""
|
||||||
|
import logging
|
||||||
|
from app.tasks.celery_app import celery_app
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_probe_output(stdout: str) -> dict:
    """Extract probe fields from gpu_probe.py stdout.

    Only the first ``GPU_PROBE_OK:`` or ``GPU_PROBE_FAIL:`` line is
    considered. Returns a dict holding just the keys that could be
    determined (``status`` plus ``device_type``/``devices`` or ``error``);
    an empty dict means no marker line was present.
    """
    import ast

    for line in stdout.splitlines():
        if "GPU_PROBE_OK:" in line:
            # Expected shape: GPU_PROBE_OK: device_type=OPTIX devices=[('name', 'TYPE'), ...]
            payload = line.split("GPU_PROBE_OK:", 1)[1].strip()
            parsed = {"status": "ok"}

            # Device names may contain spaces, so split off the devices list
            # before tokenising the remainder on whitespace.
            head, found, devices_repr = payload.partition("devices=")
            for token in head.split():
                if token.startswith("device_type="):
                    parsed["device_type"] = token.split("=", 1)[1]
                    break

            if found:
                try:
                    devices = ast.literal_eval(devices_repr.strip())
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    devices = []
                if isinstance(devices, list):
                    # Keep only well-formed (name, type) string pairs so the
                    # result always stays JSON-serialisable.
                    parsed["devices"] = [
                        list(entry)
                        for entry in devices
                        if isinstance(entry, (tuple, list))
                        and len(entry) == 2
                        and all(isinstance(v, str) for v in entry)
                    ]
            return parsed

        if "GPU_PROBE_FAIL:" in line:
            return {
                "status": "failed",
                "error": line.split("GPU_PROBE_FAIL:", 1)[1].strip(),
            }

    return {}


@celery_app.task(name="app.tasks.gpu_tasks.probe_gpu", queue="thumbnail_rendering")
def probe_gpu() -> dict:
    """Run the Blender GPU probe script and store the outcome in system_settings.

    Launches ``blender --background --python /render-scripts/gpu_probe.py``
    with a 60 s timeout and interprets its stdout markers.

    Returns:
        A dict with keys ``status`` ("ok", "failed", "error" or "unknown"),
        ``device_type``, ``devices`` (list of ``[name, type]`` pairs),
        ``error`` and ``probed_at`` (UTC ISO-8601 timestamp). The same dict
        is passed to ``_save_probe_result`` before returning.
    """
    import subprocess
    from datetime import datetime, timezone
    from pathlib import Path
    from app.services.render_blender import find_blender

    result = {
        "status": "unknown",
        "device_type": None,
        "devices": [],
        "error": None,
        "probed_at": datetime.now(timezone.utc).isoformat(),
    }

    try:
        blender_bin = find_blender()
        probe_script = Path("/render-scripts/gpu_probe.py")

        if not blender_bin:
            result["status"] = "error"
            result["error"] = "Blender binary not found — check BLENDER_BIN env or PATH"
        elif not probe_script.exists():
            result["status"] = "error"
            result["error"] = f"gpu_probe.py not found at {probe_script}"
        else:
            proc = subprocess.run(
                [blender_bin, "--background", "--python", str(probe_script)],
                capture_output=True, text=True, timeout=60,
            )
            result.update(_parse_probe_output(proc.stdout))

            if result["status"] == "unknown":
                # No marker line at all: fall back to the exit code and
                # whatever Blender wrote to stderr.
                result["status"] = "failed" if proc.returncode != 0 else "unknown"
                result["error"] = proc.stderr[:500] if proc.stderr else "No probe output"

    except subprocess.TimeoutExpired:
        result["status"] = "error"
        result["error"] = "GPU probe timed out after 60s"
    except Exception as exc:
        # Best-effort task: record the failure in the result instead of
        # raising, but keep the traceback in the worker log.
        logger.exception("GPU probe failed unexpectedly")
        result["status"] = "error"
        result["error"] = str(exc)

    # Save to system_settings
    _save_probe_result(result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _save_probe_result(result: dict) -> None:
    """Upsert *result* as JSON under the ``gpu_probe_last_result`` key.

    Builds a throwaway synchronous engine from the application's database
    URL with the ``+asyncpg`` driver suffix stripped, writes the row inside
    one connection, commits, and always disposes the engine afterwards.
    """
    import json
    from sqlalchemy import create_engine, text
    from app.config import settings as app_settings

    engine = create_engine(app_settings.database_url.replace("+asyncpg", ""))
    try:
        with engine.connect() as connection:
            upsert_stmt = text("""
                INSERT INTO system_settings (key, value) VALUES (:key, :value)
                ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
            """)
            bound = {"key": "gpu_probe_last_result", "value": json.dumps(result)}
            connection.execute(upsert_stmt, bound)
            connection.commit()
    finally:
        # Release pooled connections even when the write fails.
        engine.dispose()
|
||||||
@@ -133,6 +133,7 @@ services:
|
|||||||
- UPLOAD_DIR=/app/uploads
|
- UPLOAD_DIR=/app/uploads
|
||||||
- BLENDER_BIN=/opt/blender/blender
|
- BLENDER_BIN=/opt/blender/blender
|
||||||
- RENDER_SCRIPTS_DIR=/render-scripts
|
- RENDER_SCRIPTS_DIR=/render-scripts
|
||||||
|
- CYCLES_DEVICE=${CYCLES_DEVICE:-auto}
|
||||||
- MINIO_URL=${MINIO_URL:-http://minio:9000}
|
- MINIO_URL=${MINIO_URL:-http://minio:9000}
|
||||||
- MINIO_USER=${MINIO_USER:-minioadmin}
|
- MINIO_USER=${MINIO_USER:-minioadmin}
|
||||||
- MINIO_PASSWORD=${MINIO_PASSWORD:-minioadmin}
|
- MINIO_PASSWORD=${MINIO_PASSWORD:-minioadmin}
|
||||||
|
|||||||
@@ -64,5 +64,62 @@ def check_version():
|
|||||||
print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})")
|
print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})")
|
||||||
|
|
||||||
|
|
||||||
|
def _report_unconfirmed_gpu(msg, strict, hint=""):
    """Abort startup in strict mode; otherwise print a warning and return."""
    if strict:
        print(f"[check_version] ERROR: {msg}", flush=True)
        print(
            "[check_version] CYCLES_DEVICE=gpu requires GPU — aborting startup",
            flush=True,
        )
        sys.exit(1)
    print(f"[check_version] WARNING: {msg}{hint}", flush=True)


def check_gpu():
    """Run the Blender GPU probe script and report results.

    Respects CYCLES_DEVICE env var:
    - "cpu"  → skip probe entirely
    - "gpu"  → require GPU; abort startup (exit 1) whenever a GPU cannot be
               confirmed — probe reports failure, probe script missing,
               probe times out, probe cannot be launched, or probe exits 0
               without a GPU_PROBE_OK line
    - "auto" (default) → warn in those same situations, but continue
    """
    cycles_device = os.environ.get("CYCLES_DEVICE", "auto").lower()
    if cycles_device == "cpu":
        print("[check_version] GPU check skipped (CYCLES_DEVICE=cpu)", flush=True)
        return
    strict = cycles_device == "gpu"

    probe_script = Path("/render-scripts/gpu_probe.py")
    if not probe_script.exists():
        _report_unconfirmed_gpu(f"gpu_probe.py not found at {probe_script}", strict)
        return

    blender_bin = find_blender()
    try:
        result = subprocess.run(
            [blender_bin, "--background", "--python", str(probe_script)],
            capture_output=True, text=True, timeout=45,
        )
    except subprocess.TimeoutExpired:
        _report_unconfirmed_gpu("GPU probe timed out after 45s", strict)
        return
    except Exception as e:
        # Covers launch problems such as a missing/invalid Blender binary.
        _report_unconfirmed_gpu(f"GPU probe failed: {e}", strict)
        return

    stdout_lines = result.stdout.splitlines()

    if result.returncode == 0:
        ok_line = next((ln for ln in stdout_lines if "GPU_PROBE_OK" in ln), None)
        if ok_line is not None:
            print(f"[check_version] {ok_line}", flush=True)
        else:
            # Exit 0 without the marker means the probe never reported a
            # device, so the GPU is still unconfirmed.
            _report_unconfirmed_gpu(
                "GPU probe exited 0 but printed no GPU_PROBE_OK line", strict
            )
        return

    msg = next(
        (ln for ln in stdout_lines if "GPU_PROBE_FAIL" in ln),
        "No GPU detected — renders will use CPU",
    )
    _report_unconfirmed_gpu(msg, strict, " (set CYCLES_DEVICE=gpu to enforce)")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
check_version()
|
check_version()
|
||||||
|
check_gpu()
|
||||||
|
|||||||
@@ -718,9 +718,15 @@ if engine != "eevee": # covers both explicit Cycles and EEVEE-fallback
|
|||||||
# Re-ensure preferences are set (engine activation may have reset them)
|
# Re-ensure preferences are set (engine activation may have reset them)
|
||||||
_activate_gpu()
|
_activate_gpu()
|
||||||
print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True)
|
print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True)
|
||||||
|
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={gpu_type_found}", flush=True)
|
||||||
else:
|
else:
|
||||||
scene.cycles.device = 'CPU'
|
scene.cycles.device = 'CPU'
|
||||||
print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True)
|
print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True)
|
||||||
|
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
|
||||||
|
import os as _os
|
||||||
|
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
|
||||||
|
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
scene.cycles.samples = samples
|
scene.cycles.samples = samples
|
||||||
scene.cycles.use_denoising = True
|
scene.cycles.use_denoising = True
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
"""Blender Python script: GPU compute device probe.
|
||||||
|
|
||||||
|
Run via:
|
||||||
|
blender --background --python gpu_probe.py
|
||||||
|
|
||||||
|
Exit codes:
|
||||||
|
0 — GPU found, prints GPU_PROBE_OK line
|
||||||
|
1 — No GPU found or error, prints GPU_PROBE_FAIL line
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Probe Cycles compute backends and exit with the result.

    Tries OPTIX, CUDA, HIP and ONEAPI in that order. On the first backend
    that exposes at least one non-CPU device, prints a ``GPU_PROBE_OK:``
    line and exits 0. If none does, or anything unexpected is raised
    (including ``bpy`` being unavailable), prints a ``GPU_PROBE_FAIL:``
    line and exits 1.
    """
    try:
        import bpy

        cycles_prefs = bpy.context.preferences.addons['cycles'].preferences

        for backend in ('OPTIX', 'CUDA', 'HIP', 'ONEAPI'):
            try:
                cycles_prefs.compute_device_type = backend
                cycles_prefs.get_devices()
                non_cpu = [dev for dev in cycles_prefs.devices if dev.type != 'CPU']
                if not non_cpu:
                    continue
                described = [(dev.name, dev.type) for dev in non_cpu]
                print(
                    f"GPU_PROBE_OK: device_type={backend} devices={described}",
                    flush=True,
                )
                sys.exit(0)
            except Exception as err:
                # A backend that cannot be selected is reported and skipped;
                # SystemExit is not an Exception, so the exit above passes through.
                print(f"GPU_PROBE: {backend} not available: {err}", flush=True)

        print("GPU_PROBE_FAIL: no GPU compute device found", flush=True)
        sys.exit(1)

    except SystemExit:
        # Let the deliberate exit codes above propagate untouched.
        raise
    except Exception as err:
        print(f"GPU_PROBE_FAIL: exception during probe: {err}", flush=True)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
main()
|
||||||
@@ -728,9 +728,16 @@ def main():
|
|||||||
continue
|
continue
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if not gpu_found:
|
if gpu_found:
|
||||||
|
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
|
||||||
|
else:
|
||||||
scene.cycles.device = 'CPU'
|
scene.cycles.device = 'CPU'
|
||||||
print("[still_render] WARNING: GPU not found — falling back to CPU")
|
print("[still_render] WARNING: GPU not found — falling back to CPU")
|
||||||
|
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
|
||||||
|
import os as _os
|
||||||
|
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
|
||||||
|
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
# ── Render settings ──────────────────────────────────────────────────────
|
# ── Render settings ──────────────────────────────────────────────────────
|
||||||
scene.render.resolution_x = width
|
scene.render.resolution_x = width
|
||||||
|
|||||||
@@ -682,9 +682,16 @@ def main():
|
|||||||
continue
|
continue
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if not gpu_found:
|
if gpu_found:
|
||||||
|
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
|
||||||
|
else:
|
||||||
scene.cycles.device = 'CPU'
|
scene.cycles.device = 'CPU'
|
||||||
print("[turntable_render] WARNING: GPU not found — falling back to CPU")
|
print("[turntable_render] WARNING: GPU not found — falling back to CPU")
|
||||||
|
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
|
||||||
|
import os as _os
|
||||||
|
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
|
||||||
|
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
# ── Render settings ──────────────────────────────────────────────────────
|
# ── Render settings ──────────────────────────────────────────────────────
|
||||||
scene.render.resolution_x = width
|
scene.render.resolution_x = width
|
||||||
|
|||||||
Reference in New Issue
Block a user