diff --git a/backend/app/api/routers/worker.py b/backend/app/api/routers/worker.py
index b52e94f..5e74203 100644
--- a/backend/app/api/routers/worker.py
+++ b/backend/app/api/routers/worker.py
@@ -16,6 +16,7 @@ from app.models.order_line import OrderLine
 from app.models.product import Product
 from app.models.user import User
 from app.models.worker_config import WorkerConfig
+from app.models.system_setting import SystemSetting
 from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
 
 router = APIRouter(prefix="/worker", tags=["worker"])
@@ -456,6 +457,34 @@ async def scale_workers(
     return {"service": body.service, "count": body.count, "status": "scaling"}
 
 
+# ---------------------------------------------------------------------------
+# GPU probe
+# ---------------------------------------------------------------------------
+
+@router.post("/probe/gpu", status_code=http_status.HTTP_202_ACCEPTED)
+async def trigger_gpu_probe(current_user: User = Depends(require_admin)):
+    """Queue a GPU probe task on the render-worker."""
+    from app.tasks.gpu_tasks import probe_gpu
+    result = probe_gpu.delay()
+    return {"task_id": str(result.id), "queued": True}
+
+
+@router.get("/probe/gpu/result")
+async def get_gpu_probe_result(
+    current_user: User = Depends(require_admin),
+    db: AsyncSession = Depends(get_db),
+):
+    """Return the last GPU probe result from system_settings."""
+    import json
+    row = await db.execute(
+        select(SystemSetting).where(SystemSetting.key == "gpu_probe_last_result")
+    )
+    setting = row.scalar_one_or_none()
+    if not setting:
+        return {"status": "unknown", "message": "No probe run yet. Click Run GPU Check."}
+    return json.loads(setting.value)
+
+
 # ---------------------------------------------------------------------------
 # Render health check
 # ---------------------------------------------------------------------------
diff --git a/backend/app/services/render_blender.py b/backend/app/services/render_blender.py
index 905fb74..0f761c8 100644
--- a/backend/app/services/render_blender.py
+++ b/backend/app/services/render_blender.py
@@ -224,9 +224,30 @@ def render_still(
 
     log_lines = [l for l in stdout_lines if "[blender_render]" in l]
 
+    # Parse RENDER_DEVICE_USED token from stdout
+    device_used = "unknown"
+    compute_type = "unknown"
+    gpu_fallback = False
+    for line in stdout_lines:
+        if line.startswith("RENDER_DEVICE_USED:"):
+            parts = line.split()
+            for part in parts:
+                if part.startswith("device="):
+                    device_used = part.split("=", 1)[1]
+                elif part.startswith("compute_type="):
+                    compute_type = part.split("=", 1)[1]
+            gpu_fallback = (device_used == "CPU")
+            break
+
     # EEVEE fallback removed (Phase 5.2): EEVEE Next in Blender 5.0+ is stable.
     # If EEVEE fails, it is a hard failure — no silent retry.
+    if returncode == 2:
+        raise RuntimeError(
+            "GPU required but render used CPU — strict mode (CYCLES_DEVICE=gpu). "
+            "Check that the render-worker has a visible NVIDIA GPU."
+        )
+
     if returncode != 0:
         stdout_tail = "\n".join(stdout_lines[-50:]) if stdout_lines else ""
         stderr_tail = "\n".join(stderr_lines[-20:]) if stderr_lines else ""
 
@@ -246,6 +267,9 @@ def render_still(
         "output_size_bytes": output_path.stat().st_size if output_path.exists() else 0,
         "parts_count": 0,
         "engine_used": engine_used,
+        "device_used": device_used,
+        "compute_type": compute_type,
+        "gpu_fallback": gpu_fallback,
         "log_lines": log_lines,
     }
 
diff --git a/backend/app/tasks/celery_app.py b/backend/app/tasks/celery_app.py
index 62954ab..6707053 100644
--- a/backend/app/tasks/celery_app.py
+++ b/backend/app/tasks/celery_app.py
@@ -18,6 +18,7 @@ celery_app = Celery(
         "app.domains.products.tasks",
         "app.domains.imports.tasks",
         "app.domains.materials.tasks",
+        "app.tasks.gpu_tasks",
     ],
 )
 
@@ -56,5 +57,9 @@ celery_app.conf.update(
             "task": "app.tasks.beat_tasks.apply_worker_concurrency",
             "schedule": 300.0,  # every 5 minutes
         },
+        "probe-gpu-every-30m": {
+            "task": "app.tasks.gpu_tasks.probe_gpu",
+            "schedule": 1800.0,  # every 30 minutes
+        },
     },
 )
diff --git a/backend/app/tasks/gpu_tasks.py b/backend/app/tasks/gpu_tasks.py
new file mode 100644
index 0000000..53e4eaf
--- /dev/null
+++ b/backend/app/tasks/gpu_tasks.py
@@ -0,0 +1,88 @@
+"""Celery task for GPU health probe."""
+import logging
+from app.tasks.celery_app import celery_app
+
+logger = logging.getLogger(__name__)
+
+
+@celery_app.task(name="app.tasks.gpu_tasks.probe_gpu", queue="thumbnail_rendering")
+def probe_gpu() -> dict:
+    """Run Blender GPU probe on the render-worker. Stores result in system_settings."""
+    import subprocess
+    import json
+    from datetime import datetime, timezone
+    from pathlib import Path
+    from app.services.render_blender import find_blender
+
+    result = {
+        "status": "unknown",
+        "device_type": None,
+        "devices": [],
+        "error": None,
+        "probed_at": datetime.now(timezone.utc).isoformat(),
+    }
+
+    try:
+        blender_bin = find_blender()
+        if not blender_bin:
+            result["status"] = "error"
+            result["error"] = "Blender binary not found — check BLENDER_BIN env or PATH"
+        else:
+            probe_script = Path("/render-scripts/gpu_probe.py")
+
+            if not probe_script.exists():
+                result["status"] = "error"
+                result["error"] = f"gpu_probe.py not found at {probe_script}"
+            else:
+                proc = subprocess.run(
+                    [blender_bin, "--background", "--python", str(probe_script)],
+                    capture_output=True, text=True, timeout=60,
+                )
+
+                for line in proc.stdout.splitlines():
+                    if "GPU_PROBE_OK:" in line:
+                        result["status"] = "ok"
+                        # Parse device_type and devices from line:
+                        # GPU_PROBE_OK: device_type=OPTIX devices=[...]
+                        parts = line.split("GPU_PROBE_OK:", 1)[1].strip()
+                        for p in parts.split():
+                            if p.startswith("device_type="):
+                                result["device_type"] = p.split("=", 1)[1]
+                                break
+                    elif "GPU_PROBE_FAIL:" in line:
+                        result["status"] = "failed"
+                        result["error"] = line.split("GPU_PROBE_FAIL:", 1)[1].strip()
+                        break
+
+                if result["status"] == "unknown":
+                    result["status"] = "failed" if proc.returncode != 0 else "unknown"
+                    result["error"] = proc.stderr[:500] if proc.stderr else "No probe output"
+
+    except subprocess.TimeoutExpired:
+        result["status"] = "error"
+        result["error"] = "GPU probe timed out after 60s"
+    except Exception as exc:
+        result["status"] = "error"
+        result["error"] = str(exc)
+
+    # Save to system_settings
+    _save_probe_result(result)
+    return result
+
+
+def _save_probe_result(result: dict) -> None:
+    import json
+    from sqlalchemy import create_engine, text
+    from app.config import settings as app_settings
+
+    sync_url = app_settings.database_url.replace("+asyncpg", "")
+    eng = create_engine(sync_url)
+    try:
+        with eng.connect() as conn:
+            conn.execute(text("""
+                INSERT INTO system_settings (key, value) VALUES (:key, :value)
+                ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
+            """), {"key": "gpu_probe_last_result", "value": json.dumps(result)})
+            conn.commit()
+    finally:
+        eng.dispose()
diff --git a/docker-compose.yml b/docker-compose.yml
index 9603694..e2abe3d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -133,6 +133,7 @@ services:
       - UPLOAD_DIR=/app/uploads
       - BLENDER_BIN=/opt/blender/blender
       - RENDER_SCRIPTS_DIR=/render-scripts
+      - CYCLES_DEVICE=${CYCLES_DEVICE:-auto}
      - MINIO_URL=${MINIO_URL:-http://minio:9000}
       - MINIO_USER=${MINIO_USER:-minioadmin}
       - MINIO_PASSWORD=${MINIO_PASSWORD:-minioadmin}
diff --git a/render-worker/check_version.py b/render-worker/check_version.py
index 7cefdaf..b4dbca7 100644
--- a/render-worker/check_version.py
+++ b/render-worker/check_version.py
@@ -64,5 +64,62 @@ def check_version():
     print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})")
 
 
+def check_gpu():
+    """Run the Blender GPU probe script and report results.
+
+    Respects CYCLES_DEVICE env var:
+    - "cpu" → skip probe entirely
+    - "gpu" → require GPU; abort startup if none found
+    - "auto" (default) → warn if no GPU found, but continue
+    """
+    cycles_device = os.environ.get("CYCLES_DEVICE", "auto").lower()
+    if cycles_device == "cpu":
+        print("[check_version] GPU check skipped (CYCLES_DEVICE=cpu)", flush=True)
+        return
+
+    blender_bin = find_blender()
+    probe_script = Path("/render-scripts/gpu_probe.py")
+    if not probe_script.exists():
+        print(
+            f"[check_version] WARNING: gpu_probe.py not found at {probe_script}",
+            flush=True,
+        )
+        return
+
+    try:
+        result = subprocess.run(
+            [blender_bin, "--background", "--python", str(probe_script)],
+            capture_output=True, text=True, timeout=45,
+        )
+        if result.returncode == 0:
+            for line in result.stdout.splitlines():
+                if "GPU_PROBE_OK" in line:
+                    print(f"[check_version] {line}", flush=True)
+                    break
+        else:
+            msg = "No GPU detected — renders will use CPU"
+            for line in result.stdout.splitlines():
+                if "GPU_PROBE_FAIL" in line:
+                    msg = line
+                    break
+            if cycles_device == "gpu":
+                print(f"[check_version] ERROR: {msg}", flush=True)
+                print(
+                    "[check_version] CYCLES_DEVICE=gpu requires GPU — aborting startup",
+                    flush=True,
+                )
+                sys.exit(1)
+            else:
+                print(
+                    f"[check_version] WARNING: {msg} (set CYCLES_DEVICE=gpu to enforce)",
+                    flush=True,
+                )
+    except subprocess.TimeoutExpired:
+        print("[check_version] WARNING: GPU probe timed out after 45s", flush=True)
+    except Exception as e:
+        print(f"[check_version] WARNING: GPU probe failed: {e}", flush=True)
+
+
 if __name__ == "__main__":
     check_version()
+    check_gpu()
diff --git a/render-worker/scripts/blender_render.py b/render-worker/scripts/blender_render.py
index c598cf7..4fbab48 100644
--- a/render-worker/scripts/blender_render.py
+++ b/render-worker/scripts/blender_render.py
@@ -718,9 +718,15 @@ if engine != "eevee":  # covers both explicit Cycles and EEVEE-fallback
         # Re-ensure preferences are set (engine activation may have reset them)
         _activate_gpu()
         print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True)
+        print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={gpu_type_found}", flush=True)
     else:
         scene.cycles.device = 'CPU'
         print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True)
+        print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
+        import os as _os
+        if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
+            print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
+            sys.exit(2)
 
     scene.cycles.samples = samples
     scene.cycles.use_denoising = True
diff --git a/render-worker/scripts/gpu_probe.py b/render-worker/scripts/gpu_probe.py
new file mode 100644
index 0000000..813a44c
--- /dev/null
+++ b/render-worker/scripts/gpu_probe.py
@@ -0,0 +1,43 @@
+"""Blender Python script: GPU compute device probe.
+
+Run via:
+    blender --background --python gpu_probe.py
+
+Exit codes:
+    0 — GPU found, prints GPU_PROBE_OK line
+    1 — No GPU found or error, prints GPU_PROBE_FAIL line
+"""
+import sys
+
+
+def main():
+    try:
+        import bpy
+
+        cprefs = bpy.context.preferences.addons['cycles'].preferences
+        for device_type in ('OPTIX', 'CUDA', 'HIP', 'ONEAPI'):
+            try:
+                cprefs.compute_device_type = device_type
+                cprefs.get_devices()
+                gpu_devices = [d for d in cprefs.devices if d.type != 'CPU']
+                if gpu_devices:
+                    device_names = [(d.name, d.type) for d in gpu_devices]
+                    print(
+                        f"GPU_PROBE_OK: device_type={device_type} devices={device_names}",
+                        flush=True,
+                    )
+                    sys.exit(0)
+            except Exception as e:
+                print(f"GPU_PROBE: {device_type} not available: {e}", flush=True)
+
+        print("GPU_PROBE_FAIL: no GPU compute device found", flush=True)
+        sys.exit(1)
+
+    except SystemExit:
+        raise
+    except Exception as e:
+        print(f"GPU_PROBE_FAIL: exception during probe: {e}", flush=True)
+        sys.exit(1)
+
+
+main()
diff --git a/render-worker/scripts/still_render.py b/render-worker/scripts/still_render.py
index 990f41f..0627fd7 100644
--- a/render-worker/scripts/still_render.py
+++ b/render-worker/scripts/still_render.py
@@ -728,9 +728,16 @@ def main():
                 continue
             except Exception:
                 pass
-    if not gpu_found:
+    if gpu_found:
+        print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
+    else:
         scene.cycles.device = 'CPU'
         print("[still_render] WARNING: GPU not found — falling back to CPU")
+        print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
+        import os as _os
+        if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
+            print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
+            sys.exit(2)
 
     # ── Render settings ──────────────────────────────────────────────────────
     scene.render.resolution_x = width
diff --git a/render-worker/scripts/turntable_render.py b/render-worker/scripts/turntable_render.py
index bc6735f..2502e24 100644
--- a/render-worker/scripts/turntable_render.py
+++ b/render-worker/scripts/turntable_render.py
@@ -682,9 +682,16 @@ def main():
                 continue
             except Exception:
                 pass
-    if not gpu_found:
+    if gpu_found:
+        print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
+    else:
         scene.cycles.device = 'CPU'
         print("[turntable_render] WARNING: GPU not found — falling back to CPU")
+        print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
+        import os as _os
+        if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
+            print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
+            sys.exit(2)
 
     # ── Render settings ──────────────────────────────────────────────────────
     scene.render.resolution_x = width