HartOMat/backend/app/tasks/flamenco_tasks.py
"""Celery tasks for polling Flamenco job status and watchdog recovery."""
import logging
from datetime import datetime, timedelta
from app.tasks.celery_app import celery_app
logger = logging.getLogger(__name__)
# Flamenco status → our render_status mapping
FLAMENCO_STATUS_MAP = {
"queued": "processing",
"active": "processing",
"completed": "completed",
"failed": "failed",
"canceled": "failed",
"cancel-requested": "processing",
"paused": "processing",
}


@celery_app.task(name="app.tasks.flamenco_tasks.poll_flamenco_jobs", queue="step_processing")
def poll_flamenco_jobs():
    """Poll Flamenco Manager for active render jobs and update OrderLine status.

    Runs on a Celery Beat schedule (every 10 seconds).

    Uses a Redis lock (TTL=9s) to ensure at most one poll executes per 10-second
    window. When the queue backs up with many duplicates (e.g. all workers are
    busy with long STEP/render tasks), the duplicates try to acquire the lock,
    find it already held, and return immediately — draining the queue without
    doing redundant work.
    """
    import redis as redis_lib

    from app.config import settings as app_settings

    # Deduplicate: skip if a poll ran within the last 9 seconds
    try:
        r = redis_lib.from_url(app_settings.redis_url)
        acquired = r.set("flamenco_poll_lock", "1", nx=True, ex=9)
        if not acquired:
            return {"skipped": "deduplicated"}
    except Exception:
        pass  # Redis unavailable — proceed anyway
    from sqlalchemy import create_engine, select, update as sql_update
    from sqlalchemy.orm import Session

    from app.models.order_line import OrderLine
    from app.models.system_setting import SystemSetting
    from app.services.flamenco_client import get_flamenco_client

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)

    # Track orders whose lines transitioned to a terminal state
    completed_order_ids = set()

    with Session(engine) as session:
        # Load Flamenco Manager URL
        row = session.execute(
            select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
        ).scalar_one_or_none()
        manager_url = row.value if row else "http://flamenco-manager:8080"

        # Find all OrderLines dispatched to Flamenco that are still processing
        lines = session.execute(
            select(OrderLine).where(
                OrderLine.render_backend_used == "flamenco",
                OrderLine.render_status == "processing",
                OrderLine.flamenco_job_id.isnot(None),
            )
        ).scalars().all()

        if not lines:
            engine.dispose()
            return {"polled": 0}

        client = get_flamenco_client(manager_url)
        updated = 0
        for line in lines:
            try:
                job = client.get_job(line.flamenco_job_id)
                flamenco_status = job.get("status", "")
                our_status = FLAMENCO_STATUS_MAP.get(flamenco_status, "processing")

                if our_status == line.render_status:
                    continue  # No change

                updates = {"render_status": our_status}
                if our_status == "completed":
                    updates["render_completed_at"] = datetime.utcnow()
                    # Try to extract result path from job activity
                    activity = job.get("activity", "")
                    if activity:
                        updates["render_log"] = {
                            "flamenco_job_id": line.flamenco_job_id,
                            "flamenco_status": flamenco_status,
                            "activity": activity,
                        }
                    # Set result path based on job type
                    job_type = job.get("type", "")
                    metadata = job.get("metadata", {})
                    if job_type == "schaeffler-turntable":
                        output_dir = job.get("settings", {}).get("output_dir", "")
                        output_name = job.get("settings", {}).get("output_name", "turntable")
                        updates["result_path"] = f"{output_dir}/{output_name}.mp4"
                    elif job_type == "schaeffler-still":
                        updates["result_path"] = job.get("settings", {}).get("output_path", "")
                elif our_status == "failed":
                    updates["render_completed_at"] = datetime.utcnow()
                    updates["render_log"] = {
                        "flamenco_job_id": line.flamenco_job_id,
                        "flamenco_status": flamenco_status,
                        "error": job.get("activity", "Job failed"),
                    }

                session.execute(
                    sql_update(OrderLine)
                    .where(OrderLine.id == line.id)
                    .values(**updates)
                )
                updated += 1
                logger.info(
                    f"Flamenco job {line.flamenco_job_id}: "
                    f"{flamenco_status} → render_status={our_status}"
                )

                # Track orders with lines that reached a terminal state
                if our_status in ("completed", "failed"):
                    completed_order_ids.add(str(line.order_id))
            except Exception as exc:
                logger.warning(
                    f"Failed to poll Flamenco job {line.flamenco_job_id}: {exc}"
                )

        if updated:
            session.commit()
    engine.dispose()

    # Auto-advance orders if all renderable lines are done
    if completed_order_ids:
        from app.services.order_status_service import check_order_completion
        for oid in completed_order_ids:
            check_order_completion(oid)

    return {"polled": len(lines), "updated": updated}


# ---------------------------------------------------------------------------
# Stalled-render watchdog
# ---------------------------------------------------------------------------
@celery_app.task(name="app.tasks.flamenco_tasks.check_stalled_renders", queue="step_processing")
def check_stalled_renders():
    """Watchdog: detect and re-dispatch render jobs stuck in 'processing'.

    Runs on a Celery Beat schedule (every 5 minutes).

    After a Docker restart, Celery workers lose in-flight tasks — the DB still
    shows render_status='processing' indefinitely. This task:

    * For **Celery** lines: uses Celery inspect to check whether any worker is
      still actively executing the task. If not (e.g. after a restart), and
      the job has been stuck longer than ``render_stall_timeout_minutes``
      (default: 120 min), it is reset to 'pending' and re-dispatched.
    * For **Flamenco** lines: queries the Flamenco Manager. If the manager
      reports the job as still active, the line is left alone; if the job is
      gone or in a terminal/error state, it is re-dispatched.
    """
    from sqlalchemy import create_engine, select, update as sql_update
    from sqlalchemy.orm import Session

    from app.config import settings as app_settings
    from app.models.order_line import OrderLine
    from app.models.system_setting import SystemSetting

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)

    with Session(engine) as session:
        # ── Read timeout from system settings ────────────────────────────────
        row = session.execute(
            select(SystemSetting).where(SystemSetting.key == "render_stall_timeout_minutes")
        ).scalar_one_or_none()
        try:
            timeout_minutes = int(row.value) if row else 120
        except (ValueError, TypeError):
            timeout_minutes = 120

        cutoff = datetime.utcnow() - timedelta(minutes=timeout_minutes)
        stalled_lines = session.execute(
            select(OrderLine).where(
                OrderLine.render_status == "processing",
                OrderLine.render_started_at.isnot(None),
                OrderLine.render_started_at < cutoff,
            )
        ).scalars().all()

        if not stalled_lines:
            engine.dispose()
            return {"checked": 0, "restarted": 0, "timeout_minutes": timeout_minutes}

        logger.info(
            "[watchdog] Found %d stalled render(s) older than %d minutes",
            len(stalled_lines), timeout_minutes,
        )
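        # NOTE (assumption): the matching below treats the first positional task
        # argument as the OrderLine id. If the render task's signature differs,
        # args[0] will not correspond to OrderLine.id and actively running lines
        # could be re-dispatched.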
        # ── Build set of order_line_ids actively running on Celery workers ───
        active_celery_line_ids: set[str] = set()
        inspect_ok = False
        try:
            inspect = celery_app.control.inspect(timeout=2)
            active_tasks = inspect.active() or {}
            for worker_tasks in active_tasks.values():
                for task_info in (worker_tasks or []):
                    args = task_info.get("args", [])
                    if args:
                        active_celery_line_ids.add(str(args[0]))
            inspect_ok = True
        except Exception as exc:
            logger.warning(
                "[watchdog] Celery inspect failed (%s) — will re-dispatch all timed-out Celery jobs",
                exc,
            )

        # ── Load Flamenco Manager URL ─────────────────────────────────────────
        manager_url = "http://flamenco-manager:8080"
        try:
            url_row = session.execute(
                select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
            ).scalar_one_or_none()
            if url_row:
                manager_url = url_row.value
        except Exception:
            pass
        # ── Decide which lines to restart ────────────────────────────────────
        to_restart: list[OrderLine] = []
        for line in stalled_lines:
            line_id = str(line.id)

            if line.flamenco_job_id:
                # Flamenco job: verify with manager before re-dispatching
                try:
                    from app.services.flamenco_client import get_flamenco_client
                    client = get_flamenco_client(manager_url)
                    job = client.get_job(line.flamenco_job_id)
                    flamenco_status = job.get("status", "")
                    if flamenco_status in (
                        "active", "queued", "paused",
                        "pause-requested", "cancel-requested",
                    ):
                        logger.info(
                            "[watchdog] Flamenco job %s is still %s — skipping line %s",
                            line.flamenco_job_id, flamenco_status, line_id,
                        )
                        continue
                    logger.info(
                        "[watchdog] Flamenco job %s status=%r → re-dispatching line %s",
                        line.flamenco_job_id, flamenco_status, line_id,
                    )
                except Exception as exc:
                    # Manager unreachable — skip to avoid false restarts
                    logger.warning(
                        "[watchdog] Cannot reach Flamenco for job %s (%s) — skipping line %s",
                        line.flamenco_job_id, exc, line_id,
                    )
                    continue
            else:
                # Celery job: skip if still actively running on a worker
                if inspect_ok and line_id in active_celery_line_ids:
                    logger.info(
                        "[watchdog] Celery render for line %s still active — skipping", line_id
                    )
                    continue
                logger.info(
                    "[watchdog] Celery render for line %s not found in active tasks — re-dispatching",
                    line_id,
                )

            to_restart.append(line)

        if not to_restart:
            engine.dispose()
            return {
                "checked": len(stalled_lines),
                "restarted": 0,
                "timeout_minutes": timeout_minutes,
            }
        # ── Reset stalled lines to pending ───────────────────────────────────
        for line in to_restart:
            session.execute(
                sql_update(OrderLine)
                .where(OrderLine.id == line.id)
                .values(
                    render_status="pending",
                    render_started_at=None,
                    render_backend_used=None,
                    flamenco_job_id=None,
                    render_log={
                        "watchdog": (
                            f"Auto-restarted after {timeout_minutes} min stall "
                            f"(previous backend: {line.render_backend_used or 'unknown'})"
                        )
                    },
                )
            )

        # Capture the ids now: commit expires the ORM instances and closing the
        # session detaches them, so line.id would no longer be readable below.
        restart_ids = [str(line.id) for line in to_restart]
        session.commit()
    engine.dispose()

    # ── Re-dispatch outside DB session ───────────────────────────────────────
    from app.services.render_dispatcher import dispatch_render

    restarted = 0
    for line_id in restart_ids:
        try:
            dispatch_render(line_id)
            restarted += 1
            logger.info("[watchdog] Re-dispatched render for order line %s", line_id)
        except Exception as exc:
            logger.error(
                "[watchdog] Failed to re-dispatch line %s: %s — left as pending", line_id, exc
            )

    return {
        "checked": len(stalled_lines),
        "restarted": restarted,
        "timeout_minutes": timeout_minutes,
    }
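
# Both tasks can also be triggered ad hoc for debugging, e.g. from a backend
# shell (assumption: a worker is consuming the "step_processing" queue):
#
#     poll_flamenco_jobs.apply_async(queue="step_processing")
#     check_stalled_renders.apply_async(queue="step_processing")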