refactor(A1): remove Flamenco, simplify render pipeline to Celery-only

- Remove flamenco-manager and flamenco-worker from docker-compose.yml
- Delete flamenco_client.py, flamenco_tasks.py, docker_scaler.py
- Simplify render_dispatcher.py to Celery-only (removes ~300 lines)
- Remove Flamenco beat schedule from celery_app.py
- Clean admin.py: remove flamenco settings, endpoints, threejs validation
- Clean orders.py cancel-render: Celery revoke only
- Clean worker.py: remove flamenco_job_id from activity response
- Migration 032: cancel lingering flamenco jobs, remove flamenco settings
- PLAN.md: mark all decisions confirmed, status IN UMSETZUNG

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 15:38:37 +01:00
parent 552922eb8a
commit 1d6864fb64
13 changed files with 1524 additions and 1151 deletions
@@ -0,0 +1,49 @@
"""Remove Flamenco: cancel lingering flamenco jobs, clean up settings.
Revision ID: 032
Revises: 031
Create Date: 2026-03-06
"""
from alembic import op
import sqlalchemy as sa
revision = '032'
down_revision = '031'
branch_labels = None
depends_on = None
def upgrade():
    """Forward migration: retire Flamenco-backed renders and settings.

    Three idempotent data fixes, no schema changes:
      1. Mark order lines still queued/running on Flamenco as cancelled.
      2. Delete Flamenco-only keys from system_settings.
      3. Force the render_backend setting back to 'celery'.
    """
    # Cancel any order lines that were dispatched to Flamenco and never
    # completed.  COALESCE guards against a NULL render_log: in Postgres
    # `NULL || jsonb` evaluates to NULL, which would silently wipe the
    # column instead of recording the cancellation reason.
    op.execute("""
        UPDATE order_lines
        SET render_status = 'cancelled',
            render_completed_at = NOW(),
            render_log = COALESCE(render_log, '{}'::jsonb) || '{"cancelled_reason": "flamenco_removed_in_v2"}'::jsonb
        WHERE render_backend_used = 'flamenco'
          AND render_status IN ('processing', 'pending')
    """)
    # Remove Flamenco-specific system settings; 'render_backend' itself is
    # kept (still a valid key, just restricted to 'celery' from now on).
    op.execute("""
        DELETE FROM system_settings
        WHERE key IN ('flamenco_manager_url', 'flamenco_worker_count')
    """)
    # Reset render_backend to 'celery' if it was 'flamenco' or 'auto' —
    # both values are meaningless once the Flamenco backend is gone.
    op.execute("""
        UPDATE system_settings
        SET value = 'celery'
        WHERE key = 'render_backend' AND value IN ('flamenco', 'auto')
    """)
def downgrade():
    """Best-effort rollback: restore the default Flamenco settings rows.

    Order lines cancelled by :func:`upgrade` are NOT un-cancelled — that
    state transition is irreversible by design.  ``ON CONFLICT`` makes the
    re-insert safe to run even when the keys still exist.
    """
    # Re-insert default Flamenco settings
    op.execute("""
        INSERT INTO system_settings (key, value, updated_at)
        VALUES
            ('flamenco_manager_url', 'http://flamenco-manager:8080', NOW()),
            ('flamenco_worker_count', '1', NOW())
        ON CONFLICT (key) DO NOTHING
    """)
+6 -110
View File
@@ -1,4 +1,3 @@
import asyncio
import json
import uuid
from datetime import datetime
@@ -17,27 +16,21 @@ from app.utils.auth import require_admin, hash_password
router = APIRouter(prefix="/admin", tags=["admin"])
VALID_RENDERERS = {"pillow", "blender", "threejs"}
VALID_ENGINES = {"cycles", "eevee"}
VALID_THREEJS_SIZES = {512, 1024, 2048}
VALID_FORMATS = {"jpg", "png"}
VALID_STL_QUALITIES = {"low", "high"}
VALID_RENDERERS = {"pillow", "blender"}
VALID_ENGINES = {"cycles", "eevee"}
VALID_FORMATS = {"jpg", "png"}
VALID_STL_QUALITIES = {"low", "high"}
VALID_CYCLES_DEVICES = {"auto", "gpu", "cpu"}
VALID_RENDER_BACKENDS = {"celery", "flamenco", "auto"}
SETTINGS_DEFAULTS: dict[str, str] = {
"thumbnail_renderer": "pillow",
"thumbnail_renderer": "blender",
"blender_engine": "cycles",
"blender_cycles_samples": "256",
"blender_eevee_samples": "64",
"threejs_render_size": "1024",
"thumbnail_format": "jpg",
"stl_quality": "low",
"blender_smooth_angle": "30",
"cycles_device": "auto",
"render_backend": "celery",
"flamenco_manager_url": "http://flamenco-manager:8080",
"flamenco_worker_count": "1",
"blender_max_concurrent_renders": "3",
"product_thumbnail_priority": '["latest_render","cad_thumbnail"]',
"render_stall_timeout_minutes": "120",
@@ -45,18 +38,15 @@ SETTINGS_DEFAULTS: dict[str, str] = {
class SettingsOut(BaseModel):
thumbnail_renderer: str = "pillow"
thumbnail_renderer: str = "blender"
blender_engine: str = "cycles"
blender_cycles_samples: int = 256
blender_eevee_samples: int = 64
threejs_render_size: int = 1024
thumbnail_format: str = "jpg"
stl_quality: str = "low"
blender_smooth_angle: int = 30
cycles_device: str = "auto"
render_backend: str = "celery"
flamenco_manager_url: str = "http://flamenco-manager:8080"
flamenco_worker_count: int = 1
blender_max_concurrent_renders: int = 3
product_thumbnail_priority: str = '["latest_render","cad_thumbnail"]'
render_stall_timeout_minutes: int = 120
@@ -67,14 +57,11 @@ class SettingsUpdate(BaseModel):
blender_engine: str | None = None
blender_cycles_samples: int | None = None
blender_eevee_samples: int | None = None
threejs_render_size: int | None = None
thumbnail_format: str | None = None
stl_quality: str | None = None
blender_smooth_angle: int | None = None
cycles_device: str | None = None
render_backend: str | None = None
flamenco_manager_url: str | None = None
flamenco_worker_count: int | None = None
blender_max_concurrent_renders: int | None = None
product_thumbnail_priority: str | None = None
render_stall_timeout_minutes: int | None = None
@@ -171,14 +158,11 @@ def _settings_to_out(raw: dict[str, str]) -> SettingsOut:
blender_engine=raw["blender_engine"],
blender_cycles_samples=int(raw["blender_cycles_samples"]),
blender_eevee_samples=int(raw["blender_eevee_samples"]),
threejs_render_size=int(raw["threejs_render_size"]),
thumbnail_format=raw["thumbnail_format"],
stl_quality=raw["stl_quality"],
blender_smooth_angle=int(raw["blender_smooth_angle"]),
cycles_device=raw["cycles_device"],
render_backend=raw["render_backend"],
flamenco_manager_url=raw["flamenco_manager_url"],
flamenco_worker_count=int(raw["flamenco_worker_count"]),
blender_max_concurrent_renders=int(raw["blender_max_concurrent_renders"]),
product_thumbnail_priority=raw.get("product_thumbnail_priority", '["latest_render","cad_thumbnail"]'),
render_stall_timeout_minutes=int(raw.get("render_stall_timeout_minutes", "120")),
@@ -207,8 +191,6 @@ async def update_settings(
raise HTTPException(400, detail="blender_cycles_samples must be 14096")
if body.blender_eevee_samples is not None and not (1 <= body.blender_eevee_samples <= 1024):
raise HTTPException(400, detail="blender_eevee_samples must be 11024")
if body.threejs_render_size is not None and body.threejs_render_size not in VALID_THREEJS_SIZES:
raise HTTPException(400, detail=f"Invalid threejs_render_size. Choose: {', '.join(str(s) for s in sorted(VALID_THREEJS_SIZES))}")
if body.thumbnail_format is not None and body.thumbnail_format not in VALID_FORMATS:
raise HTTPException(400, detail=f"Invalid thumbnail_format. Choose: {', '.join(sorted(VALID_FORMATS))}")
if body.stl_quality is not None and body.stl_quality not in VALID_STL_QUALITIES:
@@ -217,10 +199,6 @@ async def update_settings(
raise HTTPException(400, detail="blender_smooth_angle must be 0180 degrees")
if body.cycles_device is not None and body.cycles_device not in VALID_CYCLES_DEVICES:
raise HTTPException(400, detail=f"Invalid cycles_device. Choose: {', '.join(sorted(VALID_CYCLES_DEVICES))}")
if body.render_backend is not None and body.render_backend not in VALID_RENDER_BACKENDS:
raise HTTPException(400, detail=f"Invalid render_backend. Choose: {', '.join(sorted(VALID_RENDER_BACKENDS))}")
if body.flamenco_worker_count is not None and not (1 <= body.flamenco_worker_count <= 16):
raise HTTPException(400, detail="flamenco_worker_count must be 116")
if body.blender_max_concurrent_renders is not None and not (1 <= body.blender_max_concurrent_renders <= 16):
raise HTTPException(400, detail="blender_max_concurrent_renders must be 116")
if body.render_stall_timeout_minutes is not None and not (10 <= body.render_stall_timeout_minutes <= 10080):
@@ -252,8 +230,6 @@ async def update_settings(
updates["blender_cycles_samples"] = str(body.blender_cycles_samples)
if body.blender_eevee_samples is not None:
updates["blender_eevee_samples"] = str(body.blender_eevee_samples)
if body.threejs_render_size is not None:
updates["threejs_render_size"] = str(body.threejs_render_size)
if body.thumbnail_format is not None:
updates["thumbnail_format"] = body.thumbnail_format
if body.stl_quality is not None:
@@ -264,10 +240,6 @@ async def update_settings(
updates["cycles_device"] = body.cycles_device
if body.render_backend is not None:
updates["render_backend"] = body.render_backend
if body.flamenco_manager_url is not None:
updates["flamenco_manager_url"] = body.flamenco_manager_url
if body.flamenco_worker_count is not None:
updates["flamenco_worker_count"] = str(body.flamenco_worker_count)
if body.blender_max_concurrent_renders is not None:
updates["blender_max_concurrent_renders"] = str(body.blender_max_concurrent_renders)
if body.render_stall_timeout_minutes is not None:
@@ -392,7 +364,6 @@ async def renderer_status(
services = {
"pillow": {"url": None, "available": True, "note": "Built-in (always available)"},
"blender": {"url": "http://blender-renderer:8100/health", "available": False, "note": ""},
"threejs": {"url": "http://threejs-renderer:8101/health", "available": False, "note": ""},
}
async with httpx.AsyncClient(timeout=3.0) as client:
for name, info in services.items():
@@ -409,78 +380,3 @@ async def renderer_status(
return services
@router.get("/settings/flamenco-status")
async def flamenco_status(
admin: User = Depends(require_admin),
db: AsyncSession = Depends(get_db),
):
"""Check Flamenco Manager health and list workers."""
raw = await _load_settings(db)
manager_url = raw.get("flamenco_manager_url", "http://flamenco-manager:8080")
from app.services.flamenco_client import get_flamenco_client
client = get_flamenco_client(manager_url)
health = client.health_check()
workers: list[dict] = []
if health["available"]:
try:
workers = client.list_workers()
except Exception as exc:
workers = [{"error": str(exc)[:200]}]
return {
"manager": health,
"workers": workers,
"manager_url": manager_url,
}
class WorkerCountBody(BaseModel):
count: int
@router.get("/settings/flamenco-worker-actual")
async def get_flamenco_worker_actual(admin: User = Depends(require_admin)):
"""Return the number of flamenco-worker containers currently running."""
from app.services.docker_scaler import get_running_worker_count
count = await asyncio.get_event_loop().run_in_executor(None, get_running_worker_count)
return {"running": count, "available": count >= 0}
@router.post("/settings/flamenco-worker-count")
async def set_flamenco_worker_count(
body: WorkerCountBody,
admin: User = Depends(require_admin),
db: AsyncSession = Depends(get_db),
):
"""Scale Flamenco worker containers to the requested count via Docker socket."""
if not (1 <= body.count <= 16):
raise HTTPException(400, detail="Worker count must be 116")
# Save desired count to settings first
await _save_setting(db, "flamenco_worker_count", str(body.count))
await db.commit()
# Perform actual Docker scaling in a thread (blocking SDK call)
from app.services.docker_scaler import scale_workers
try:
result = await asyncio.get_event_loop().run_in_executor(None, scale_workers, body.count)
return {
"count": body.count,
"previous": result["previous"],
"current": result["current"],
"delta": result["delta"],
"message": result["message"],
}
except Exception as exc:
# Scaling failed — return a warning but keep the saved setting
return {
"count": body.count,
"previous": -1,
"current": -1,
"delta": 0,
"message": f"Setting saved, but Docker scaling failed: {exc}. "
f"Run `docker compose up -d --scale flamenco-worker={body.count}` manually.",
}
+16 -69
View File
@@ -920,44 +920,17 @@ async def cancel_line_render(
if line.render_status not in ("processing", "pending"):
raise HTTPException(400, detail=f"Line render_status is '{line.render_status}', nothing to cancel")
cancelled_backend = line.render_backend_used or "unknown"
cancelled_backend = line.render_backend_used or "celery"
errors: list[str] = []
# Cancel Flamenco job if applicable
if line.render_backend_used == "flamenco" and line.flamenco_job_id:
try:
from app.services.flamenco_client import get_flamenco_client
from app.models.system_setting import SystemSetting
row = await db.execute(
select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
)
setting = row.scalar_one_or_none()
url = setting.value if setting else "http://flamenco-manager:8080"
client = get_flamenco_client(url)
client.cancel_job(line.flamenco_job_id)
except Exception as exc:
errors.append(f"Flamenco cancel failed: {str(exc)[:200]}")
# Revoke Celery task if applicable
if line.render_backend_used == "celery" or not line.render_backend_used:
try:
from app.tasks.celery_app import celery_app
celery_app.control.revoke(
f"render-{line_id}", terminate=True, signal="SIGTERM"
)
except Exception as exc:
errors.append(f"Celery revoke failed: {str(exc)[:200]}")
# Also kill the Blender subprocess in the renderer microservice.
# The job_id sent to blender-renderer equals the order_line_id.
try:
import httpx as _httpx
_httpx.post(
f"http://blender-renderer:8100/cancel/{line_id}",
timeout=5.0,
)
except Exception:
pass # best-effort; renderer may not be running a job for this line
# Revoke Celery task (best-effort)
try:
from app.tasks.celery_app import celery_app
celery_app.control.revoke(
f"render-{line_id}", terminate=True, signal="SIGTERM"
)
except Exception as exc:
errors.append(f"Celery revoke failed: {str(exc)[:200]}")
# Mark line as cancelled
from sqlalchemy import update as sql_update
@@ -1013,47 +986,21 @@ async def cancel_order_renders(
if not lines:
raise HTTPException(400, detail="No active renders to cancel")
from app.services.flamenco_client import get_flamenco_client
from app.models.system_setting import SystemSetting
from app.tasks.celery_app import celery_app
from sqlalchemy import update as sql_update
# Load Flamenco URL once
row = await db.execute(
select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
)
setting = row.scalar_one_or_none()
flamenco_url = setting.value if setting else "http://flamenco-manager:8080"
now = datetime.utcnow()
cancelled_count = 0
errors: list[str] = []
for line in lines:
# Cancel Flamenco job
if line.render_backend_used == "flamenco" and line.flamenco_job_id:
try:
client = get_flamenco_client(flamenco_url)
client.cancel_job(line.flamenco_job_id)
except Exception as exc:
errors.append(f"Line {line.id}: Flamenco cancel failed: {str(exc)[:100]}")
# Revoke Celery task + kill Blender subprocess in renderer service
if line.render_backend_used == "celery" or not line.render_backend_used:
try:
celery_app.control.revoke(
f"render-{line.id}", terminate=True, signal="SIGTERM"
)
except Exception:
pass # Celery revoke is best-effort
try:
import httpx as _httpx
_httpx.post(
f"http://blender-renderer:8100/cancel/{line.id}",
timeout=5.0,
)
except Exception:
pass # best-effort
# Revoke Celery task (best-effort)
try:
celery_app.control.revoke(
f"render-{line.id}", terminate=True, signal="SIGTERM"
)
except Exception:
pass
await db.execute(
sql_update(OrderLine)
-2
View File
@@ -38,7 +38,6 @@ class RenderJobEntry(BaseModel):
output_type_name: str | None
render_status: str
render_backend_used: str | None
flamenco_job_id: str | None
render_started_at: str | None
render_completed_at: str | None
updated_at: str
@@ -140,7 +139,6 @@ async def get_worker_activity(
output_type_name=rl.output_type.name if rl.output_type else None,
render_status=rl.render_status,
render_backend_used=rl.render_backend_used,
flamenco_job_id=rl.flamenco_job_id,
render_started_at=rl.render_started_at.isoformat() if rl.render_started_at else None,
render_completed_at=rl.render_completed_at.isoformat() if rl.render_completed_at else None,
updated_at=rl.updated_at.isoformat(),
+1 -1
View File
@@ -4,7 +4,7 @@ from sqlalchemy import String, DateTime, Boolean, Text, Integer, ForeignKey
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy.dialects.postgresql import UUID, JSONB
VALID_RENDER_BACKENDS = {"celery", "flamenco", "auto"}
VALID_RENDER_BACKENDS = {"celery"}
from app.database import Base
-177
View File
@@ -1,177 +0,0 @@
"""Scale Flamenco worker containers via the Docker socket.
Uses the Docker Python SDK (docker>=6.1.0) to list, start, and stop containers.
Requires /var/run/docker.sock to be mounted into the backend container.
"""
import os
import logging
log = logging.getLogger(__name__)
COMPOSE_PROJECT = os.getenv("COMPOSE_PROJECT_NAME", "schaefflerautomat")
SERVICE_NAME = "flamenco-worker"
def _get_client():
    """Build a Docker SDK client from the environment (DOCKER_HOST etc.)."""
    # Imported lazily so the module loads even where the SDK is absent.
    import docker

    sdk_client = docker.from_env()
    return sdk_client
def get_worker_containers(client=None):
    """Return all flamenco-worker containers (running + stopped) sorted by name."""
    if client is None:
        client = _get_client()
    # Select by the compose labels that identify this project's worker service.
    label_filters = {
        "label": [
            f"com.docker.compose.project={COMPOSE_PROJECT}",
            f"com.docker.compose.service={SERVICE_NAME}",
        ]
    }
    # all=True includes stopped containers as well as running ones.
    containers = client.containers.list(all=True, filters=label_filters)
    return sorted(containers, key=lambda container: container.name)
def get_running_worker_count(client=None) -> int:
    """Return how many flamenco-worker containers are currently running.

    Returns -1 when the Docker socket is unreachable (best-effort probe).
    """
    try:
        docker_client = client if client is not None else _get_client()
        running = [
            worker
            for worker in get_worker_containers(docker_client)
            if worker.status == "running"
        ]
        return len(running)
    except Exception as exc:
        log.warning("docker_scaler: could not read worker count: %s", exc)
        return -1
def scale_workers(target: int) -> dict:
    """Scale flamenco-worker containers to *target* count.

    Scaling down stops and removes the highest-numbered running containers;
    scaling up clones the configuration (image, env, mounts, GPU device
    requests, networks, restart policy) of an existing worker container.

    Returns a dict with keys:
    previous  containers running before
    current   containers running after
    delta     change (negative = stopped, positive = started)
    message   human-readable summary
    """
    import docker
    from docker.types import Mount

    client = _get_client()
    all_workers = get_worker_containers(client)
    running = [c for c in all_workers if c.status == "running"]
    previous = len(running)
    if target == previous:
        # Nothing to do — report the unchanged state.
        return {"previous": previous, "current": previous, "delta": 0,
                "message": f"Already at {previous} worker(s) — no change"}
    # ── Scale down ────────────────────────────────────────────────────────────
    if target < previous:
        # Stop highest-numbered containers first to minimise disruption
        to_stop = sorted(running, key=lambda c: c.name, reverse=True)[: previous - target]
        for c in to_stop:
            log.info("docker_scaler: stopping %s", c.name)
            c.stop(timeout=20)
            c.remove()
        # NOTE(review): 'current' assumes every stop/remove succeeded; a
        # failure raises out of the loop before reaching this return.
        return {
            "previous": previous,
            "current": target,
            "delta": target - previous,
            "message": f"Stopped {len(to_stop)} worker(s): {[c.name for c in to_stop]}",
        }
    # ── Scale up ──────────────────────────────────────────────────────────────
    # Prefer a running worker as the template; fall back to any stopped one.
    template = running[0] if running else (all_workers[0] if all_workers else None)
    if template is None:
        raise RuntimeError(
            "No existing flamenco-worker container found to clone configuration from. "
            "Ensure at least one worker container exists (even if stopped)."
        )
    attrs = template.attrs
    image = attrs["Config"]["Image"]
    env = attrs["Config"].get("Env") or []
    # Reconstruct mounts from the template container; volume mounts are
    # addressed by volume name, bind mounts by host source path.
    mounts = []
    for m in (attrs.get("Mounts") or []):
        mount_type = m.get("Type", "bind")
        source = m.get("Name", "") if mount_type == "volume" else m.get("Source", "")
        mounts.append(
            Mount(
                target=m["Destination"],
                source=source,
                type=mount_type,
                read_only=not m.get("RW", True),
            )
        )
    # Reconstruct GPU device requests (nvidia)
    device_requests = None
    raw_dr = (attrs.get("HostConfig") or {}).get("DeviceRequests") or []
    if raw_dr:
        device_requests = []
        for dr in raw_dr:
            device_requests.append(
                docker.types.DeviceRequest(
                    driver=dr.get("Driver", ""),
                    count=dr.get("Count", -1),
                    device_ids=dr.get("DeviceIDs") or [],
                    capabilities=dr.get("Capabilities") or [],
                    options=dr.get("Options") or {},
                )
            )
    # Network(s) the template is connected to
    network_names = list(
        (attrs.get("NetworkSettings") or {}).get("Networks", {}).keys()
    )
    restart_policy_name = (
        (attrs.get("HostConfig") or {})
        .get("RestartPolicy", {})
        .get("Name", "unless-stopped")
    ) or "unless-stopped"
    started = []
    # NOTE(review): names are numbered from previous+1, counting RUNNING
    # containers only — if stopped containers with those compose numbers
    # still exist, containers.create will fail with a name conflict.
    for i in range(previous + 1, target + 1):
        new_name = f"{COMPOSE_PROJECT}-{SERVICE_NAME}-{i}"
        # Compose-style labels so the new container is picked up by
        # get_worker_containers() and by `docker compose` tooling.
        labels = {
            "com.docker.compose.project": COMPOSE_PROJECT,
            "com.docker.compose.service": SERVICE_NAME,
            "com.docker.compose.container-number": str(i),
        }
        log.info("docker_scaler: creating %s from image %s", new_name, image)
        container = client.containers.create(
            image=image,
            name=new_name,
            environment=env,
            labels=labels,
            mounts=mounts,
            restart_policy={"Name": restart_policy_name},
            device_requests=device_requests,
        )
        # Attach to the template's networks before starting; failures are
        # logged but non-fatal so a partially-connected worker still starts.
        for net_name in network_names:
            try:
                net = client.networks.get(net_name)
                net.connect(container)
                log.info("docker_scaler: connected %s to network %s", new_name, net_name)
            except Exception as exc:
                log.warning("docker_scaler: could not connect to network %s: %s", net_name, exc)
        container.start()
        started.append(new_name)
        log.info("docker_scaler: started %s", new_name)
    return {
        "previous": previous,
        "current": target,
        "delta": target - previous,
        "message": f"Started {len(started)} new worker(s): {started}",
    }
-121
View File
@@ -1,121 +0,0 @@
"""Flamenco Manager REST API client.
Uses httpx (sync) for compatibility with Celery tasks and FastAPI endpoints.
"""
import logging
from typing import Any
import httpx
logger = logging.getLogger(__name__)
DEFAULT_TIMEOUT = 10.0
class FlamencoClient:
"""Thin wrapper around the Flamenco Manager v3 REST API."""
def __init__(self, manager_url: str):
self.base_url = manager_url.rstrip("/")
def _url(self, path: str) -> str:
return f"{self.base_url}{path}"
# ── Job management ──────────────────────────────────────────────────────
def submit_job(
self,
name: str,
job_type: str,
settings: dict[str, Any],
metadata: dict[str, str] | None = None,
priority: int = 50,
) -> dict:
"""Submit a new render job to Flamenco Manager.
Returns the created job dict (includes 'id').
"""
payload = {
"name": name,
"type": job_type,
"submitter_platform": "linux",
"settings": settings,
"metadata": metadata or {},
"priority": priority,
}
resp = httpx.post(
self._url("/api/v3/jobs"),
json=payload,
timeout=DEFAULT_TIMEOUT,
)
resp.raise_for_status()
return resp.json()
def get_job(self, job_id: str) -> dict:
"""Get job details by ID."""
resp = httpx.get(
self._url(f"/api/v3/jobs/{job_id}"),
timeout=DEFAULT_TIMEOUT,
)
resp.raise_for_status()
return resp.json()
def cancel_job(self, job_id: str) -> None:
"""Request cancellation of a job."""
resp = httpx.post(
self._url(f"/api/v3/jobs/{job_id}/setstatus"),
json={"status": "cancel-requested"},
timeout=DEFAULT_TIMEOUT,
)
resp.raise_for_status()
# ── Workers ─────────────────────────────────────────────────────────────
def list_workers(self) -> list[dict]:
"""List all registered workers."""
resp = httpx.get(
self._url("/api/v3/worker-mgt/workers"),
timeout=DEFAULT_TIMEOUT,
)
resp.raise_for_status()
data = resp.json()
return data.get("workers", data) if isinstance(data, dict) else data
# ── Farm status ─────────────────────────────────────────────────────────
def get_farm_status(self) -> dict:
"""Get overall farm status from the Manager."""
resp = httpx.get(
self._url("/api/v3/configuration"),
timeout=DEFAULT_TIMEOUT,
)
resp.raise_for_status()
return resp.json()
def health_check(self) -> dict:
"""Check if the Flamenco Manager is reachable and return version info."""
try:
resp = httpx.get(
self._url("/api/v3/version"),
timeout=5.0,
)
resp.raise_for_status()
data = resp.json()
return {
"available": True,
"version": data.get("version", "unknown"),
"name": data.get("name", "Flamenco"),
}
except Exception as exc:
logger.warning(f"Flamenco health check failed: {exc}")
return {
"available": False,
"version": None,
"name": None,
"error": str(exc)[:200],
}
def get_flamenco_client(manager_url: str) -> FlamencoClient:
    """Factory that creates a FlamencoClient from a manager URL."""
    client = FlamencoClient(manager_url)
    return client
+8 -286
View File
@@ -1,12 +1,7 @@
"""Render dispatcher — routes render jobs to Celery or Flamenco.
"""Render dispatcher — routes render jobs to Celery.
Backend selection priority:
1. OutputType.render_backend per-type override ("celery" / "flamenco")
2. OutputType.is_animation — animations default to Flamenco
3. System setting render_backend — global default ("celery" / "flamenco" / "auto")
4. "auto" mode: stills → Celery, animations → Flamenco
All renders run via Celery workers (Flamenco removed in v2 refactor).
"""
import json
import logging
from datetime import datetime
@@ -14,7 +9,6 @@ from sqlalchemy import select, update as sql_update
from sqlalchemy.orm import Session, joinedload
from app.models.order_line import OrderLine
from app.models.output_type import OutputType
from app.models.product import Product
from app.models.system_setting import SystemSetting
@@ -29,113 +23,11 @@ def _load_setting(session: Session, key: str, default: str = "") -> str:
return row.value if row else default
def resolve_backend(output_type: OutputType | None, system_backend: str) -> str:
    """Determine which backend to use for a given output type.
    Returns "celery" or "flamenco".
    """
    known_backends = ("celery", "flamenco")
    # No output type at all: nothing to inspect, use the safe default.
    if output_type is None:
        return "celery"
    # Priority 1: explicit per-type override
    per_type_backend = output_type.render_backend
    if per_type_backend in known_backends:
        return per_type_backend
    # Priority 2+3: a concrete system-wide setting wins next
    if system_backend in known_backends:
        return system_backend
    # Priority 4: "auto" mode — animations go to Flamenco, stills to Celery
    return "flamenco" if output_type.is_animation else "celery"
def build_flamenco_job_settings(
    output_type: OutputType,
    product: Product,
    step_path: str,
    output_dir: str,
    system_settings: dict[str, str],
    lighting_only: bool = False,
    shadow_catcher: bool = False,
    camera_orbit: bool = True,
    cycles_device: str = "auto",
    rotation_x: float = 0.0,
    rotation_y: float = 0.0,
    rotation_z: float = 0.0,
) -> dict:
    """Build Flamenco job settings from output type and product metadata.

    Each render parameter is resolved in priority order: the output type's
    own ``render_settings`` dict, then the system-wide setting from
    *system_settings*, then a hard-coded default.

    Args:
        step_path: path of the CAD file as seen by the Flamenco worker.
        output_dir: output directory for animation frames; stills instead
            get a complete ``output_path`` file name.
        lighting_only / shadow_catcher / camera_orbit: template-derived
            flags passed through verbatim.
        rotation_x/y/z: model rotation values — units (degrees vs radians)
            are not determinable here; confirm against the worker job script.

    Returns:
        A flat settings dict in the shape expected by the Flamenco job type
        (nested values are JSON-encoded strings).
    """
    render_settings = output_type.render_settings or {}
    # Engine: per-type override, else the global blender_engine, else Cycles.
    engine = render_settings.get("engine", system_settings.get("blender_engine", "cycles"))
    # Sample count lives under an engine-specific system-settings key.
    samples_key = f"blender_{engine}_samples"
    samples = render_settings.get("samples", int(system_settings.get(samples_key, "256")))
    stl_quality = render_settings.get("stl_quality", system_settings.get("stl_quality", "low"))
    # Animations default to full-HD frames; stills to a 1024px square.
    width = render_settings.get("width", 1920 if output_type.is_animation else 1024)
    height = render_settings.get("height", 1080 if output_type.is_animation else 1024)
    # Per-part colour map, derived from the parsed CAD object list plus the
    # product's material assignments (only when both are present).
    part_colors = {}
    part_names_ordered = []
    if product.cad_file and product.cad_file.parsed_objects:
        part_names_ordered = product.cad_file.parsed_objects.get("objects", [])
        materials_source = product.cad_part_materials
        if materials_source:
            # Local import — presumably avoids a circular import at module
            # load time; TODO confirm.
            from app.services.step_processor import build_part_colors
            part_colors = build_part_colors(part_names_ordered, materials_source)
    # hasattr guard: some OutputType variants may lack transparent_bg.
    transparent_bg = bool(output_type.transparent_bg) if hasattr(output_type, 'transparent_bg') else False
    settings = {
        "step_path": step_path,
        "engine": engine,
        "samples": samples,
        "stl_quality": stl_quality,
        "width": width,
        "height": height,
        # Nested structures are JSON-encoded because job settings are flat.
        "part_colors_json": json.dumps(part_colors),
        "transparent_bg": transparent_bg,
        # Template/material fields start empty; the caller fills them in
        # after resolving the render template and material library.
        "template_path": "",
        "target_collection": "Product",
        "material_library_path": "",
        "material_map_json": "{}",
        "part_names_ordered_json": json.dumps(part_names_ordered),
        "lighting_only": lighting_only,
        "shadow_catcher": shadow_catcher,
        "cycles_device": cycles_device,
        "rotation_x": rotation_x,
        "rotation_y": rotation_y,
        "rotation_z": rotation_z,
    }
    # Denoising knobs are forwarded as strings, "" when unset.
    for dk in ('noise_threshold', 'denoiser', 'denoising_input_passes',
               'denoising_prefilter', 'denoising_quality', 'denoising_use_gpu'):
        settings[dk] = str(render_settings.get(dk, ""))
    if output_type.is_animation:
        # Turntable-specific settings
        output_name = render_settings.get("output_name", "turntable")
        settings["output_dir"] = output_dir
        settings["output_name"] = output_name
        settings["frame_count"] = render_settings.get("frame_count", 120)
        settings["fps"] = render_settings.get("fps", 30)
        settings["turntable_degrees"] = render_settings.get("turntable_degrees", 360)
        settings["turntable_axis"] = render_settings.get("turntable_axis", "world_z")
        settings["bg_color"] = render_settings.get("bg_color", "")
        settings["camera_orbit"] = camera_orbit
    else:
        # Still-specific settings
        ext = output_type.output_format or "png"
        settings["output_path"] = f"{output_dir}/render.{ext}"
    return settings
def dispatch_render(order_line_id: str) -> dict:
"""Route a render job to Celery or Flamenco based on configuration.
"""Dispatch a render job to Celery.
Must be called from a sync context (Celery task or sync wrapper).
Returns {"backend": "celery"|"flamenco", "job_ref": str}.
Returns {"backend": "celery", "job_ref": str}.
"""
from app.config import settings as app_settings
from app.services.render_log import emit, clear
@@ -179,196 +71,26 @@ def dispatch_render(order_line_id: str) -> dict:
cad_name = line.product.cad_file.original_name if line.product.cad_file else "?"
emit(order_line_id, f"CAD file: {cad_name}")
emit(order_line_id, "Dispatching to Celery render worker")
# Load system settings
system_backend = _load_setting(session, "render_backend", "celery")
flamenco_url = _load_setting(session, "flamenco_manager_url", "http://flamenco-manager:8080")
backend = resolve_backend(line.output_type, system_backend)
emit(order_line_id, f"Resolved backend: {backend}")
# Mark as processing
now = datetime.utcnow()
session.execute(
sql_update(OrderLine)
.where(OrderLine.id == line.id)
.values(
render_status="processing",
render_backend_used=backend,
render_backend_used="celery",
render_started_at=now,
)
)
session.commit()
if backend == "flamenco":
emit(order_line_id, f"Submitting job to Flamenco Manager ({flamenco_url})")
result = _dispatch_flamenco(session, line, flamenco_url)
if result.get("error"):
emit(order_line_id, f"Flamenco submit failed: {result['error']}", "error")
else:
emit(order_line_id, f"Flamenco job submitted: {result.get('job_ref', '?')}")
return result
else:
emit(order_line_id, "Dispatching to Celery render worker")
return _dispatch_celery(order_line_id)
engine_db.dispose()
return _dispatch_celery(order_line_id)
def _dispatch_celery(order_line_id: str) -> dict:
"""Dispatch to the existing Celery render task."""
"""Dispatch to the Celery render task."""
from app.tasks.step_tasks import render_order_line_task
result = render_order_line_task.delay(order_line_id)
return {"backend": "celery", "job_ref": result.id}
def _dispatch_flamenco(session: Session, line: OrderLine, flamenco_url: str) -> dict:
    """Submit a render job for *line* to the Flamenco Manager.

    Builds the Flamenco job settings (engine/samples from system settings,
    render template, material map, rotation from the render position),
    submits the job via the Flamenco client, and persists the returned
    job id on the OrderLine. On submit failure the line is marked 'failed'.

    Args:
        session: Open synchronous SQLAlchemy session.
        line: OrderLine to render; its product/output_type relations are read.
        flamenco_url: Base URL of the Flamenco Manager.

    Returns:
        dict with "backend" ("flamenco"), "job_ref" (job id, "" on failure)
        and, on failure, an "error" message.
    """
    import re
    from app.services.flamenco_client import get_flamenco_client

    # Load all needed system settings in one pass.
    all_keys = [
        "blender_engine",
        "blender_cycles_samples",
        "blender_eevee_samples",
        "stl_quality",
        "cycles_device",
    ]
    sys_settings = {key: _load_setting(session, key, "") for key in all_keys}

    output_type = line.output_type
    product = line.product
    cad_file = product.cad_file

    # Load render_position for rotation values (defaults: no rotation).
    rotation_x = rotation_y = rotation_z = 0.0
    if line.render_position_id:
        from app.models.render_position import ProductRenderPosition
        rp = session.get(ProductRenderPosition, line.render_position_id)
        if rp:
            rotation_x, rotation_y, rotation_z = rp.rotation_x, rp.rotation_y, rp.rotation_z

    # Flamenco mounts the uploads volume at /shared, backend uses /app/uploads.
    raw_path = cad_file.stored_path if cad_file else ""
    step_path = raw_path.replace("/app/uploads/", "/shared/") if raw_path else ""
    output_dir = f"/shared/renders/{line.id}"
    job_type = "schaeffler-turntable" if (output_type and output_type.is_animation) else "schaeffler-still"

    # Resolve render template + material library BEFORE building job settings
    # (template.lighting_only is needed by build_flamenco_job_settings).
    from app.services.template_service import resolve_template, get_material_library_path
    category_key = product.category_key if product else None
    ot_id = str(line.output_type_id) if line.output_type_id else None
    template = resolve_template(category_key=category_key, output_type_id=ot_id)
    material_library = get_material_library_path()

    # Resolve cycles_device: per-output-type override wins, then system
    # setting, then the "gpu" default.
    ot_cycles_device = output_type.cycles_device if output_type else None
    effective_cycles_device = ot_cycles_device or sys_settings.get("cycles_device", "gpu") or "gpu"

    settings = build_flamenco_job_settings(
        output_type=output_type,
        product=product,
        step_path=step_path,
        output_dir=output_dir,
        system_settings=sys_settings,
        lighting_only=bool(template.lighting_only) if template else False,
        shadow_catcher=bool(template.shadow_catcher_enabled) if template else False,
        camera_orbit=bool(template.camera_orbit) if template else True,
        cycles_device=effective_cycles_device,
        rotation_x=rotation_x,
        rotation_y=rotation_y,
        rotation_z=rotation_z,
    )

    if template:
        # Remap path for Flamenco shared volume.
        tmpl_path = template.blend_file_path.replace("/app/uploads/", "/shared/")
        settings["template_path"] = tmpl_path
        settings["target_collection"] = template.target_collection
        logger.info(
            f"Flamenco job: using render template '{template.name}' "
            f"(id={template.id}, path={tmpl_path}, collection={template.target_collection})"
        )
    else:
        logger.info(
            f"Flamenco job: no render template found for "
            f"category_key={category_key!r}, output_type_id={ot_id!r} — using factory settings"
        )

    # Material library + material map: send whenever library exists and product
    # has material assignments — works with or without a render template.
    # When a template is present, only apply if material_replace_enabled is set.
    materials_source = product.cad_part_materials
    use_materials = bool(material_library and materials_source)
    if template and not template.material_replace_enabled:
        use_materials = False
    if use_materials:
        mat_lib_path = material_library.replace("/app/uploads/", "/shared/")
        settings["material_library_path"] = mat_lib_path
        mat_map = {
            m["part_name"]: m["material"]
            for m in materials_source
            if m.get("part_name") and m.get("material")
        }
        # Resolve raw material names to SCHAEFFLER library names via aliases.
        from app.services.material_service import resolve_material_map
        mat_map = resolve_material_map(mat_map)
        settings["material_map_json"] = json.dumps(mat_map)

    # Output naming: meaningful filename instead of generic render.ext.
    def _sanitize(s: str) -> str:
        # Keep word chars, dash and dot; cap at 100 chars for safe filenames.
        return re.sub(r'[^\w\-.]', '_', s.strip())[:100]

    product_name = product.name or product.pim_id or "product"
    ot_name = output_type.name if output_type else "render"
    if not (output_type and output_type.is_animation):
        # Still image: "<product>_<output-type>.<ext>" inside output_dir.
        # Parentheses make the or/ternary precedence explicit.
        ext = (output_type.output_format or "png") if output_type else "png"
        filename = f"{_sanitize(product_name)}_{_sanitize(ot_name)}.{ext}"
        # BUGFIX: the computed filename was previously dropped and a literal
        # placeholder written instead — use the sanitized filename.
        settings["output_path"] = f"{output_dir}/{filename}"

    metadata = {
        "order_line_id": str(line.id),
        "order_id": str(line.order_id),
        "product_name": product.name or "",
        "output_type": output_type.name if output_type else "",
        "category": product.category_key or "",
    }
    job_name = f"{product.name or product.pim_id} - {output_type.name if output_type else 'render'}"

    try:
        client = get_flamenco_client(flamenco_url)
        job = client.submit_job(
            name=job_name[:200],
            job_type=job_type,
            settings=settings,
            metadata=metadata,
        )
        job_id = job.get("id", "")
        # Persist flamenco_job_id so the status poller can track this line.
        session.execute(
            sql_update(OrderLine)
            .where(OrderLine.id == line.id)
            .values(flamenco_job_id=job_id)
        )
        session.commit()
        logger.info(f"Flamenco job submitted: {job_id} for OrderLine {line.id}")
        return {"backend": "flamenco", "job_ref": job_id}
    except Exception as exc:
        logger.error(f"Flamenco submit failed for OrderLine {line.id}: {exc}")
        session.execute(
            sql_update(OrderLine)
            .where(OrderLine.id == line.id)
            .values(
                render_status="failed",
                render_completed_at=datetime.utcnow(),
                render_log={"error": f"Flamenco submit failed: {str(exc)[:500]}"},
            )
        )
        session.commit()
        return {"backend": "flamenco", "job_ref": "", "error": str(exc)}
+2 -16
View File
@@ -5,7 +5,7 @@ celery_app = Celery(
"schaefflerautomat",
broker=settings.redis_url,
backend=settings.redis_url,
include=["app.tasks.step_tasks", "app.tasks.ai_tasks", "app.tasks.flamenco_tasks"],
include=["app.tasks.step_tasks", "app.tasks.ai_tasks"],
)
celery_app.conf.update(
@@ -17,20 +17,6 @@ celery_app.conf.update(
task_routes={
"app.tasks.step_tasks.*": {"queue": "step_processing"},
"app.tasks.ai_tasks.*": {"queue": "ai_validation"},
"app.tasks.flamenco_tasks.*": {"queue": "step_processing"},
},
beat_schedule={
"poll-flamenco-jobs": {
"task": "app.tasks.flamenco_tasks.poll_flamenco_jobs",
"schedule": 10.0, # every 10 seconds
# Discard if not consumed before the next run; prevents queue build-up
# when workers are busy with long-running STEP/render tasks.
"options": {"expires": 9},
},
"check-stalled-renders": {
"task": "app.tasks.flamenco_tasks.check_stalled_renders",
"schedule": 300.0, # every 5 minutes
"options": {"expires": 290},
},
},
beat_schedule={},
)
-335
View File
@@ -1,335 +0,0 @@
"""Celery tasks for polling Flamenco job status and watchdog recovery."""
import logging
from datetime import datetime, timedelta
from app.tasks.celery_app import celery_app
logger = logging.getLogger(__name__)
# Flamenco status → our render_status mapping.
# Terminal Flamenco states map to "completed"/"failed"; transient states
# ("cancel-requested", "paused", queue states) stay "processing" so the
# poller keeps tracking them until they settle.
FLAMENCO_STATUS_MAP = {
    "queued": "processing",
    "active": "processing",
    "completed": "completed",
    "failed": "failed",
    "canceled": "failed",
    "cancel-requested": "processing",
    "paused": "processing",
}
@celery_app.task(name="app.tasks.flamenco_tasks.poll_flamenco_jobs", queue="step_processing")
def poll_flamenco_jobs():
    """Poll Flamenco Manager for active render jobs and update OrderLine status.

    Runs on a Celery Beat schedule (every 10 seconds).

    Uses a Redis lock (TTL=9s) to ensure at most one poll executes per 10-second
    window. When the queue backs up with many duplicates (e.g. all workers are
    busy with long STEP/render tasks), duplicates acquire the lock, find it taken,
    and return immediately — draining the queue without doing redundant work.

    Returns:
        dict — either {"skipped": "deduplicated"}, {"polled": 0}, or
        {"polled": <lines checked>, "updated": <lines changed>}.
    """
    import redis as redis_lib
    from app.config import settings as app_settings

    # Deduplicate: skip if a poll ran within the last 9 seconds.
    # NX + EX makes the lock self-expiring; no explicit release is needed.
    try:
        r = redis_lib.from_url(app_settings.redis_url)
        acquired = r.set("flamenco_poll_lock", "1", nx=True, ex=9)
        if not acquired:
            return {"skipped": "deduplicated"}
    except Exception:
        pass  # Redis unavailable — proceed anyway (best-effort dedup only)

    from sqlalchemy import create_engine, select, update as sql_update
    from sqlalchemy.orm import Session
    from app.models.order_line import OrderLine
    from app.models.system_setting import SystemSetting
    from app.services.flamenco_client import get_flamenco_client

    # The app is async (asyncpg); strip the driver suffix for a sync engine.
    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)

    # Track orders whose lines transitioned to a terminal state
    # (used after the DB session closes to trigger order auto-advance).
    completed_order_ids = set()

    with Session(engine) as session:
        # Load Flamenco Manager URL (falls back to the docker-compose default).
        row = session.execute(
            select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
        ).scalar_one_or_none()
        manager_url = row.value if row else "http://flamenco-manager:8080"

        # Find all OrderLines dispatched to Flamenco that are still processing.
        lines = session.execute(
            select(OrderLine).where(
                OrderLine.render_backend_used == "flamenco",
                OrderLine.render_status == "processing",
                OrderLine.flamenco_job_id.isnot(None),
            )
        ).scalars().all()

        if not lines:
            engine.dispose()
            return {"polled": 0}

        client = get_flamenco_client(manager_url)
        updated = 0

        for line in lines:
            # Per-line try/except so one unreachable job does not abort the poll.
            try:
                job = client.get_job(line.flamenco_job_id)
                flamenco_status = job.get("status", "")
                our_status = FLAMENCO_STATUS_MAP.get(flamenco_status, "processing")

                if our_status == line.render_status:
                    continue  # No change

                updates = {"render_status": our_status}

                if our_status == "completed":
                    updates["render_completed_at"] = datetime.utcnow()
                    # Try to extract result path from job activity.
                    activity = job.get("activity", "")
                    if activity:
                        updates["render_log"] = {
                            "flamenco_job_id": line.flamenco_job_id,
                            "flamenco_status": flamenco_status,
                            "activity": activity,
                        }
                    # Set result path based on job type.
                    job_type = job.get("type", "")
                    # NOTE(review): metadata is read but never used below.
                    metadata = job.get("metadata", {})
                    if job_type == "schaeffler-turntable":
                        output_dir = job.get("settings", {}).get("output_dir", "")
                        output_name = job.get("settings", {}).get("output_name", "turntable")
                        updates["result_path"] = f"{output_dir}/{output_name}.mp4"
                    elif job_type == "schaeffler-still":
                        updates["result_path"] = job.get("settings", {}).get("output_path", "")
                elif our_status == "failed":
                    updates["render_completed_at"] = datetime.utcnow()
                    updates["render_log"] = {
                        "flamenco_job_id": line.flamenco_job_id,
                        "flamenco_status": flamenco_status,
                        "error": job.get("activity", "Job failed"),
                    }

                session.execute(
                    sql_update(OrderLine)
                    .where(OrderLine.id == line.id)
                    .values(**updates)
                )
                updated += 1
                logger.info(
                    f"Flamenco job {line.flamenco_job_id}: "
                    f"{flamenco_status} → render_status={our_status}"
                )

                # Track orders with lines that reached a terminal state.
                if our_status in ("completed", "failed"):
                    completed_order_ids.add(str(line.order_id))
            except Exception as exc:
                logger.warning(
                    f"Failed to poll Flamenco job {line.flamenco_job_id}: {exc}"
                )

        if updated:
            session.commit()

    engine.dispose()

    # Auto-advance orders if all renderable lines are done.
    # Imported here to avoid a circular import at module load time
    # — presumably; confirm against app.services.order_status_service.
    if completed_order_ids:
        from app.services.order_status_service import check_order_completion
        for oid in completed_order_ids:
            check_order_completion(oid)

    return {"polled": len(lines), "updated": updated}
# ---------------------------------------------------------------------------
# Stalled-render watchdog
# ---------------------------------------------------------------------------
@celery_app.task(name="app.tasks.flamenco_tasks.check_stalled_renders", queue="step_processing")
def check_stalled_renders():
    """Watchdog: detect and re-dispatch render jobs stuck in 'processing'.

    Runs on a Celery Beat schedule (every 5 minutes).

    After a docker restart, Celery workers lose in-flight tasks — the DB still
    shows render_status='processing' indefinitely. This task:

    * For **Celery** lines: uses Celery inspect to check whether any worker is
      still actively executing the task. If not (e.g. after a restart), and
      the job has been stuck longer than ``render_stall_timeout_minutes``
      (default: 120 min), it is reset to 'pending' and re-dispatched.
    * For **Flamenco** lines: queries the Flamenco Manager. If the manager
      reports the job as still active the line is left alone; if the job is
      gone or in a terminal/error state it is re-dispatched.

    Returns:
        dict with "checked", "restarted", and "timeout_minutes" counters.
    """
    from sqlalchemy import create_engine, select, update as sql_update
    from sqlalchemy.orm import Session
    from app.config import settings as app_settings
    from app.models.order_line import OrderLine
    from app.models.system_setting import SystemSetting

    # The app is async (asyncpg); strip the driver suffix for a sync engine.
    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)

    with Session(engine) as session:
        # ── Read timeout from system settings ────────────────────────────────
        row = session.execute(
            select(SystemSetting).where(SystemSetting.key == "render_stall_timeout_minutes")
        ).scalar_one_or_none()
        try:
            timeout_minutes = int(row.value) if row else 120
        except (ValueError, TypeError):
            # Malformed setting value — fall back to the 120-minute default.
            timeout_minutes = 120

        cutoff = datetime.utcnow() - timedelta(minutes=timeout_minutes)
        # Any backend: lines stuck in 'processing' longer than the timeout.
        stalled_lines = session.execute(
            select(OrderLine).where(
                OrderLine.render_status == "processing",
                OrderLine.render_started_at.isnot(None),
                OrderLine.render_started_at < cutoff,
            )
        ).scalars().all()

        if not stalled_lines:
            engine.dispose()
            return {"checked": 0, "restarted": 0, "timeout_minutes": timeout_minutes}

        logger.info(
            "[watchdog] Found %d stalled render(s) older than %d minutes",
            len(stalled_lines), timeout_minutes,
        )

        # ── Build set of order_line_ids actively running on Celery workers ───
        # Convention here: the render task's first positional arg is the
        # order_line_id (matches _dispatch_celery's .delay(order_line_id)).
        active_celery_line_ids: set[str] = set()
        inspect_ok = False
        try:
            inspect = celery_app.control.inspect(timeout=2)
            active_tasks = inspect.active() or {}
            for worker_tasks in active_tasks.values():
                for task_info in (worker_tasks or []):
                    args = task_info.get("args", [])
                    if args:
                        active_celery_line_ids.add(str(args[0]))
            inspect_ok = True
        except Exception as exc:
            # inspect_ok stays False → the "still active" skip below is bypassed.
            logger.warning(
                "[watchdog] Celery inspect failed (%s) — will re-dispatch all timed-out Celery jobs",
                exc,
            )

        # ── Load Flamenco Manager URL ─────────────────────────────────────────
        manager_url = "http://flamenco-manager:8080"
        try:
            url_row = session.execute(
                select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
            ).scalar_one_or_none()
            if url_row:
                manager_url = url_row.value
        except Exception:
            pass  # best-effort: keep the docker-compose default URL

        # ── Decide which lines to restart ────────────────────────────────────
        to_restart: list[OrderLine] = []
        for line in stalled_lines:
            line_id = str(line.id)

            if line.flamenco_job_id:
                # Flamenco job: verify with manager before re-dispatching.
                try:
                    from app.services.flamenco_client import get_flamenco_client
                    client = get_flamenco_client(manager_url)
                    job = client.get_job(line.flamenco_job_id)
                    flamenco_status = job.get("status", "")
                    if flamenco_status in (
                        "active", "queued", "paused",
                        "pause-requested", "cancel-requested",
                    ):
                        # Manager still owns the job — leave it alone.
                        logger.info(
                            "[watchdog] Flamenco job %s is still %s — skipping line %s",
                            line.flamenco_job_id, flamenco_status, line_id,
                        )
                        continue
                    logger.info(
                        "[watchdog] Flamenco job %s status=%r → re-dispatching line %s",
                        line.flamenco_job_id, flamenco_status, line_id,
                    )
                except Exception as exc:
                    # Manager unreachable — skip to avoid false restarts.
                    logger.warning(
                        "[watchdog] Cannot reach Flamenco for job %s (%s) — skipping line %s",
                        line.flamenco_job_id, exc, line_id,
                    )
                    continue
            else:
                # Celery job: skip if still actively running on a worker.
                if inspect_ok and line_id in active_celery_line_ids:
                    logger.info(
                        "[watchdog] Celery render for line %s still active — skipping", line_id
                    )
                    continue
                logger.info(
                    "[watchdog] Celery render for line %s not found in active tasks — re-dispatching",
                    line_id,
                )

            to_restart.append(line)

        if not to_restart:
            engine.dispose()
            return {
                "checked": len(stalled_lines),
                "restarted": 0,
                "timeout_minutes": timeout_minutes,
            }

        # ── Reset stalled lines to pending ───────────────────────────────────
        # Note: the f-string reads line.render_backend_used BEFORE the UPDATE
        # executes, so the log records the previous backend correctly.
        for line in to_restart:
            session.execute(
                sql_update(OrderLine)
                .where(OrderLine.id == line.id)
                .values(
                    render_status="pending",
                    render_started_at=None,
                    render_backend_used=None,
                    flamenco_job_id=None,
                    render_log={
                        "watchdog": (
                            f"Auto-restarted after {timeout_minutes} min stall "
                            f"(previous backend: {line.render_backend_used or 'unknown'})"
                        )
                    },
                )
            )
        session.commit()

    engine.dispose()

    # ── Re-dispatch outside DB session ───────────────────────────────────────
    # dispatch_render opens its own session; keeping it outside ours avoids
    # nested-session surprises. Failures leave the line as 'pending' for the
    # next watchdog pass.
    from app.services.render_dispatcher import dispatch_render
    restarted = 0
    for line in to_restart:
        try:
            dispatch_render(str(line.id))
            restarted += 1
            logger.info("[watchdog] Re-dispatched render for order line %s", line.id)
        except Exception as exc:
            logger.error(
                "[watchdog] Failed to re-dispatch line %s: %s — left as pending", line.id, exc
            )

    return {
        "checked": len(stalled_lines),
        "restarted": restarted,
        "timeout_minutes": timeout_minutes,
    }