feat: render health endpoint + test script + pipeline fixes

- GET /api/worker/health/render: checks render-worker (thumbnail_rendering
  queue), Blender availability via active_queues inspect, queue depth,
  last render recency — returns ok/degraded/down status
- scripts/test_render_pipeline.py: integration test for full pipeline
  (--health, --sample, --full modes)
- PLAN.md: appended Render Pipeline Fixes section with all B-Fixes
- LEARNINGS.md: documented 5 new learnings (queue mismatch, circular
  import, 307 redirect, worker capability detection)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 19:34:12 +01:00
parent 979b0082ec
commit 381f44bc8b
4 changed files with 703 additions and 1 deletions
+117
View File
@@ -352,3 +352,120 @@ async def cancel_task(task_id: str, user: User = Depends(require_admin_or_pm)):
from app.tasks.celery_app import celery_app
celery_app.control.revoke(task_id, terminate=True, signal="SIGTERM")
return {"revoked": task_id}
# ---------------------------------------------------------------------------
# Render health check
# ---------------------------------------------------------------------------
class RenderHealthStatus(BaseModel):
    """Response body of the render pipeline health check (GET /health/render)."""

    # Overall verdict derived from the fields below.
    status: str  # "ok" | "degraded" | "down"
    # True when a Celery worker consuming the "thumbnail_rendering" queue was
    # found (or when the endpoint's worker-detection fallback applied).
    render_worker_connected: bool
    # Not probed directly: set together with the detection of a worker that
    # consumes the "thumbnail_rendering" queue.
    blender_available: bool
    # Length of the "thumbnail_rendering" Redis list; stays 0 if Redis could
    # not be read (see details["redis_error"]).
    thumbnail_queue_depth: int
    # True while thumbnail_queue_depth is below the endpoint's threshold (10).
    thumbnail_queue_ok: bool
    # ISO-8601 render_completed_at of the most recently completed order line,
    # or None when there is none / the lookup failed.
    last_render_at: str | None
    # True when that order line's render_status is "completed"; None if unknown.
    last_render_success: bool | None
    # Minutes elapsed since last_render_at, rounded to one decimal; None if unknown.
    last_render_age_minutes: float | None
    # Free-form diagnostics: responding workers, detection mode, and the text
    # of any inspect / Redis / DB error encountered while probing.
    details: dict
# Celery queue consumed by the render worker; its backlog is read from the
# Redis list of the same name.
_RENDER_QUEUE_NAME = "thumbnail_rendering"
# Backlog size at or above which the queue is reported as not OK.
_RENDER_QUEUE_MAX_DEPTH = 10
# A failed last render only degrades the status while it is younger than this.
_RECENT_FAILURE_WINDOW_MINUTES = 30
# Upper bound (seconds) for each Celery inspect broadcast and Redis socket op.
_HEALTH_PROBE_TIMEOUT_SECONDS = 2.0


@router.get("/health/render", response_model=RenderHealthStatus)
async def render_health(
    user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RenderHealthStatus:
    """Check render pipeline health: worker connectivity, Blender, queue depth, last render.

    Every probe is best-effort: a failing probe never raises, its error text is
    recorded in ``details`` and the corresponding fields keep their defaults.

    Args:
        user: Authenticated caller; only used to enforce authentication.
        db: Async database session used to look up the latest finished render.

    Returns:
        A ``RenderHealthStatus`` whose ``status`` is

        * ``"down"`` when no render worker / Blender could be confirmed,
        * ``"degraded"`` when the queue backlog is too deep or the most recent
          render failed within the last 30 minutes,
        * ``"ok"`` otherwise.
    """
    # Imports stay function-local, as in the original implementation.
    import asyncio
    from datetime import datetime, timezone

    import redis as redis_lib
    from sqlalchemy import desc, select as sa_select

    from app.config import settings as app_settings
    from app.tasks.celery_app import celery_app
    from app.models.order_line import OrderLine

    details: dict = {}

    # ------------------------------------------------------------------
    # 1. Worker connectivity / Blender availability (Celery inspect)
    # ------------------------------------------------------------------
    render_worker_connected = False
    blender_available = False

    def _inspect_workers() -> dict:
        """Blocking Celery broadcast; executed in a worker thread."""
        try:
            insp = celery_app.control.inspect(timeout=_HEALTH_PROBE_TIMEOUT_SECONDS)
            return {
                "ping": insp.ping() or {},
                "active_queues": insp.active_queues() or {},
            }
        except Exception as exc:  # best-effort probe: report, never raise
            return {"error": str(exc)}

    inspect_result = await asyncio.to_thread(_inspect_workers)
    if "error" in inspect_result:
        details["inspect_error"] = inspect_result["error"]
    else:
        all_workers = list(inspect_result["ping"])
        active_queues = inspect_result["active_queues"]
        details["workers"] = all_workers

        for worker_name, queues in active_queues.items():
            if any(q.get("name") == _RENDER_QUEUE_NAME for q in queues or []):
                render_worker_connected = True
                # Blender is not probed directly: a worker consuming the render
                # queue is taken as proof that Blender is available.
                blender_available = True
                details["render_worker"] = worker_name

        # Fallback only when queue information is genuinely unavailable. If
        # active_queues did answer and nobody consumes the render queue, the
        # render worker really is missing and must not be reported as connected.
        if not render_worker_connected and all_workers and not active_queues:
            render_worker_connected = True
            details["worker_detection"] = "fallback"

    # ------------------------------------------------------------------
    # 2. Queue depth (Redis list length)
    # ------------------------------------------------------------------
    def _read_queue_depth() -> int:
        """Blocking Redis call; executed in a worker thread."""
        client = redis_lib.from_url(
            app_settings.redis_url,
            decode_responses=True,
            socket_timeout=_HEALTH_PROBE_TIMEOUT_SECONDS,
        )
        try:
            return client.llen(_RENDER_QUEUE_NAME) or 0
        finally:
            # Release the connection pool instead of leaking it per request.
            client.close()

    thumbnail_queue_depth = 0
    try:
        # Off-loaded so the synchronous client cannot stall the event loop.
        thumbnail_queue_depth = await asyncio.to_thread(_read_queue_depth)
    except Exception as exc:  # best-effort probe: report, never raise
        details["redis_error"] = str(exc)
    thumbnail_queue_ok = thumbnail_queue_depth < _RENDER_QUEUE_MAX_DEPTH

    # ------------------------------------------------------------------
    # 3. Most recent finished render
    # ------------------------------------------------------------------
    last_render_at = None
    last_render_success = None
    last_render_age_minutes = None
    try:
        result = await db.execute(
            sa_select(OrderLine.render_completed_at, OrderLine.render_status)
            .where(OrderLine.render_completed_at.isnot(None))
            .order_by(desc(OrderLine.render_completed_at))
            .limit(1)
        )
        row = result.first()
        if row is not None:
            completed_at, render_status = row[0], row[1]
            last_render_at = completed_at.isoformat()
            last_render_success = render_status == "completed"
            # Compare aware-to-aware so both naive and timezone-aware columns
            # work. Naive values are assumed to be UTC, matching the previous
            # utcnow()-based comparison.
            if completed_at.tzinfo is None:
                completed_at = completed_at.replace(tzinfo=timezone.utc)
            age_seconds = (datetime.now(timezone.utc) - completed_at).total_seconds()
            last_render_age_minutes = round(age_seconds / 60, 1)
    except Exception as exc:  # best-effort probe: report, never raise
        details["db_error"] = str(exc)

    # ------------------------------------------------------------------
    # 4. Overall verdict
    # ------------------------------------------------------------------
    recent_failure = (
        last_render_success is False
        and last_render_age_minutes is not None
        and last_render_age_minutes < _RECENT_FAILURE_WINDOW_MINUTES
    )
    if not render_worker_connected or not blender_available:
        status = "down"
    elif not thumbnail_queue_ok or recent_failure:
        status = "degraded"
    else:
        status = "ok"

    return RenderHealthStatus(
        status=status,
        render_worker_connected=render_worker_connected,
        blender_available=blender_available,
        thumbnail_queue_depth=thumbnail_queue_depth,
        thumbnail_queue_ok=thumbnail_queue_ok,
        last_render_at=last_render_at,
        last_render_success=last_render_success,
        last_render_age_minutes=last_render_age_minutes,
        details=details,
    )