refactor(A1): remove Flamenco, simplify render pipeline to Celery-only
- Remove flamenco-manager and flamenco-worker from docker-compose.yml
- Delete flamenco_client.py, flamenco_tasks.py, docker_scaler.py
- Simplify render_dispatcher.py to Celery-only (removes ~300 lines)
- Remove Flamenco beat schedule from celery_app.py
- Clean admin.py: remove flamenco settings, endpoints, threejs validation
- Clean orders.py cancel-render: Celery revoke only
- Clean worker.py: remove flamenco_job_id from activity response
- Migration 032: cancel lingering flamenco jobs, remove flamenco settings
- PLAN.md: mark all decisions confirmed, status IN UMSETZUNG

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+36
@@ -5,3 +5,39 @@ node_modules/
|
||||
*.log
|
||||
core
|
||||
/blender-renderer/core
|
||||
|
||||
# Python cache
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.pyo
|
||||
|
||||
# Node / Vite build output
|
||||
dist/
|
||||
node_modules/
|
||||
|
||||
# Celery beat schedule
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# Test cache
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
|
||||
# Excel lock files
|
||||
~$*
|
||||
.~lock.*#
|
||||
|
||||
# Kundendaten ausschließen
|
||||
*.stp
|
||||
*.step
|
||||
*.stl
|
||||
*.xls
|
||||
+.xslx
|
||||
*.csv
|
||||
*.xlsx
|
||||
|
||||
*.blend1
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Remove Flamenco: cancel lingering flamenco jobs, clean up settings.
|
||||
|
||||
Revision ID: 032
|
||||
Revises: 031
|
||||
Create Date: 2026-03-06
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision = '032'
|
||||
down_revision = '031'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
    """Retire Flamenco: cancel unfinished Flamenco renders and clean settings.

    Executes three statements in order:

    1. Order lines whose ``render_backend_used`` is 'flamenco' and whose
       ``render_status`` is still 'processing' or 'pending' are marked
       'cancelled', stamped with the current time, and tagged with a
       ``cancelled_reason`` entry in ``render_log``.
    2. The Flamenco-only keys are deleted from ``system_settings``.
    3. A ``render_backend`` setting of 'flamenco' or 'auto' is reset to
       'celery'.
    """
    # Cancel any order lines that were dispatched to Flamenco and never completed.
    # COALESCE is required: in PostgreSQL `NULL || jsonb` evaluates to NULL, so
    # rows with an empty render_log would otherwise lose the cancellation reason.
    op.execute("""
        UPDATE order_lines
        SET render_status = 'cancelled',
            render_completed_at = NOW(),
            render_log = COALESCE(render_log, '{}'::jsonb)
                         || '{"cancelled_reason": "flamenco_removed_in_v2"}'::jsonb
        WHERE render_backend_used = 'flamenco'
          AND render_status IN ('processing', 'pending')
    """)

    # Remove Flamenco-specific system settings
    op.execute("""
        DELETE FROM system_settings
        WHERE key IN ('flamenco_manager_url', 'flamenco_worker_count')
    """)

    # Reset render_backend setting to 'celery' if it was 'flamenco' or 'auto'
    op.execute("""
        UPDATE system_settings
        SET value = 'celery'
        WHERE key = 'render_backend' AND value IN ('flamenco', 'auto')
    """)
|
||||
|
||||
|
||||
def downgrade():
    """Re-create the default Flamenco rows in ``system_settings``.

    Only the two keys deleted by ``upgrade`` are inserted; rows that already
    exist are left untouched (``ON CONFLICT (key) DO NOTHING``). The order
    lines cancelled by ``upgrade`` and the ``render_backend`` value are not
    restored by this statement.
    """
    # Re-insert default Flamenco settings
    restore_defaults_sql = """
        INSERT INTO system_settings (key, value, updated_at)
        VALUES
            ('flamenco_manager_url', 'http://flamenco-manager:8080', NOW()),
            ('flamenco_worker_count', '1', NOW())
        ON CONFLICT (key) DO NOTHING
    """
    op.execute(restore_defaults_sql)
|
||||
@@ -1,4 +1,3 @@
|
||||
import asyncio
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
@@ -17,27 +16,21 @@ from app.utils.auth import require_admin, hash_password
|
||||
|
||||
router = APIRouter(prefix="/admin", tags=["admin"])
|
||||
|
||||
VALID_RENDERERS = {"pillow", "blender", "threejs"}
|
||||
VALID_ENGINES = {"cycles", "eevee"}
|
||||
VALID_THREEJS_SIZES = {512, 1024, 2048}
|
||||
VALID_FORMATS = {"jpg", "png"}
|
||||
VALID_STL_QUALITIES = {"low", "high"}
|
||||
VALID_RENDERERS = {"pillow", "blender"}
|
||||
VALID_ENGINES = {"cycles", "eevee"}
|
||||
VALID_FORMATS = {"jpg", "png"}
|
||||
VALID_STL_QUALITIES = {"low", "high"}
|
||||
VALID_CYCLES_DEVICES = {"auto", "gpu", "cpu"}
|
||||
VALID_RENDER_BACKENDS = {"celery", "flamenco", "auto"}
|
||||
|
||||
SETTINGS_DEFAULTS: dict[str, str] = {
|
||||
"thumbnail_renderer": "pillow",
|
||||
"thumbnail_renderer": "blender",
|
||||
"blender_engine": "cycles",
|
||||
"blender_cycles_samples": "256",
|
||||
"blender_eevee_samples": "64",
|
||||
"threejs_render_size": "1024",
|
||||
"thumbnail_format": "jpg",
|
||||
"stl_quality": "low",
|
||||
"blender_smooth_angle": "30",
|
||||
"cycles_device": "auto",
|
||||
"render_backend": "celery",
|
||||
"flamenco_manager_url": "http://flamenco-manager:8080",
|
||||
"flamenco_worker_count": "1",
|
||||
"blender_max_concurrent_renders": "3",
|
||||
"product_thumbnail_priority": '["latest_render","cad_thumbnail"]',
|
||||
"render_stall_timeout_minutes": "120",
|
||||
@@ -45,18 +38,15 @@ SETTINGS_DEFAULTS: dict[str, str] = {
|
||||
|
||||
|
||||
class SettingsOut(BaseModel):
|
||||
thumbnail_renderer: str = "pillow"
|
||||
thumbnail_renderer: str = "blender"
|
||||
blender_engine: str = "cycles"
|
||||
blender_cycles_samples: int = 256
|
||||
blender_eevee_samples: int = 64
|
||||
threejs_render_size: int = 1024
|
||||
thumbnail_format: str = "jpg"
|
||||
stl_quality: str = "low"
|
||||
blender_smooth_angle: int = 30
|
||||
cycles_device: str = "auto"
|
||||
render_backend: str = "celery"
|
||||
flamenco_manager_url: str = "http://flamenco-manager:8080"
|
||||
flamenco_worker_count: int = 1
|
||||
blender_max_concurrent_renders: int = 3
|
||||
product_thumbnail_priority: str = '["latest_render","cad_thumbnail"]'
|
||||
render_stall_timeout_minutes: int = 120
|
||||
@@ -67,14 +57,11 @@ class SettingsUpdate(BaseModel):
|
||||
blender_engine: str | None = None
|
||||
blender_cycles_samples: int | None = None
|
||||
blender_eevee_samples: int | None = None
|
||||
threejs_render_size: int | None = None
|
||||
thumbnail_format: str | None = None
|
||||
stl_quality: str | None = None
|
||||
blender_smooth_angle: int | None = None
|
||||
cycles_device: str | None = None
|
||||
render_backend: str | None = None
|
||||
flamenco_manager_url: str | None = None
|
||||
flamenco_worker_count: int | None = None
|
||||
blender_max_concurrent_renders: int | None = None
|
||||
product_thumbnail_priority: str | None = None
|
||||
render_stall_timeout_minutes: int | None = None
|
||||
@@ -171,14 +158,11 @@ def _settings_to_out(raw: dict[str, str]) -> SettingsOut:
|
||||
blender_engine=raw["blender_engine"],
|
||||
blender_cycles_samples=int(raw["blender_cycles_samples"]),
|
||||
blender_eevee_samples=int(raw["blender_eevee_samples"]),
|
||||
threejs_render_size=int(raw["threejs_render_size"]),
|
||||
thumbnail_format=raw["thumbnail_format"],
|
||||
stl_quality=raw["stl_quality"],
|
||||
blender_smooth_angle=int(raw["blender_smooth_angle"]),
|
||||
cycles_device=raw["cycles_device"],
|
||||
render_backend=raw["render_backend"],
|
||||
flamenco_manager_url=raw["flamenco_manager_url"],
|
||||
flamenco_worker_count=int(raw["flamenco_worker_count"]),
|
||||
blender_max_concurrent_renders=int(raw["blender_max_concurrent_renders"]),
|
||||
product_thumbnail_priority=raw.get("product_thumbnail_priority", '["latest_render","cad_thumbnail"]'),
|
||||
render_stall_timeout_minutes=int(raw.get("render_stall_timeout_minutes", "120")),
|
||||
@@ -207,8 +191,6 @@ async def update_settings(
|
||||
raise HTTPException(400, detail="blender_cycles_samples must be 1–4096")
|
||||
if body.blender_eevee_samples is not None and not (1 <= body.blender_eevee_samples <= 1024):
|
||||
raise HTTPException(400, detail="blender_eevee_samples must be 1–1024")
|
||||
if body.threejs_render_size is not None and body.threejs_render_size not in VALID_THREEJS_SIZES:
|
||||
raise HTTPException(400, detail=f"Invalid threejs_render_size. Choose: {', '.join(str(s) for s in sorted(VALID_THREEJS_SIZES))}")
|
||||
if body.thumbnail_format is not None and body.thumbnail_format not in VALID_FORMATS:
|
||||
raise HTTPException(400, detail=f"Invalid thumbnail_format. Choose: {', '.join(sorted(VALID_FORMATS))}")
|
||||
if body.stl_quality is not None and body.stl_quality not in VALID_STL_QUALITIES:
|
||||
@@ -217,10 +199,6 @@ async def update_settings(
|
||||
raise HTTPException(400, detail="blender_smooth_angle must be 0–180 degrees")
|
||||
if body.cycles_device is not None and body.cycles_device not in VALID_CYCLES_DEVICES:
|
||||
raise HTTPException(400, detail=f"Invalid cycles_device. Choose: {', '.join(sorted(VALID_CYCLES_DEVICES))}")
|
||||
if body.render_backend is not None and body.render_backend not in VALID_RENDER_BACKENDS:
|
||||
raise HTTPException(400, detail=f"Invalid render_backend. Choose: {', '.join(sorted(VALID_RENDER_BACKENDS))}")
|
||||
if body.flamenco_worker_count is not None and not (1 <= body.flamenco_worker_count <= 16):
|
||||
raise HTTPException(400, detail="flamenco_worker_count must be 1–16")
|
||||
if body.blender_max_concurrent_renders is not None and not (1 <= body.blender_max_concurrent_renders <= 16):
|
||||
raise HTTPException(400, detail="blender_max_concurrent_renders must be 1–16")
|
||||
if body.render_stall_timeout_minutes is not None and not (10 <= body.render_stall_timeout_minutes <= 10080):
|
||||
@@ -252,8 +230,6 @@ async def update_settings(
|
||||
updates["blender_cycles_samples"] = str(body.blender_cycles_samples)
|
||||
if body.blender_eevee_samples is not None:
|
||||
updates["blender_eevee_samples"] = str(body.blender_eevee_samples)
|
||||
if body.threejs_render_size is not None:
|
||||
updates["threejs_render_size"] = str(body.threejs_render_size)
|
||||
if body.thumbnail_format is not None:
|
||||
updates["thumbnail_format"] = body.thumbnail_format
|
||||
if body.stl_quality is not None:
|
||||
@@ -264,10 +240,6 @@ async def update_settings(
|
||||
updates["cycles_device"] = body.cycles_device
|
||||
if body.render_backend is not None:
|
||||
updates["render_backend"] = body.render_backend
|
||||
if body.flamenco_manager_url is not None:
|
||||
updates["flamenco_manager_url"] = body.flamenco_manager_url
|
||||
if body.flamenco_worker_count is not None:
|
||||
updates["flamenco_worker_count"] = str(body.flamenco_worker_count)
|
||||
if body.blender_max_concurrent_renders is not None:
|
||||
updates["blender_max_concurrent_renders"] = str(body.blender_max_concurrent_renders)
|
||||
if body.render_stall_timeout_minutes is not None:
|
||||
@@ -392,7 +364,6 @@ async def renderer_status(
|
||||
services = {
|
||||
"pillow": {"url": None, "available": True, "note": "Built-in (always available)"},
|
||||
"blender": {"url": "http://blender-renderer:8100/health", "available": False, "note": ""},
|
||||
"threejs": {"url": "http://threejs-renderer:8101/health", "available": False, "note": ""},
|
||||
}
|
||||
async with httpx.AsyncClient(timeout=3.0) as client:
|
||||
for name, info in services.items():
|
||||
@@ -409,78 +380,3 @@ async def renderer_status(
|
||||
return services
|
||||
|
||||
|
||||
@router.get("/settings/flamenco-status")
async def flamenco_status(
    admin: User = Depends(require_admin),
    db: AsyncSession = Depends(get_db),
):
    """Check Flamenco Manager health and list workers.

    Returns a dict with:
        manager     – result of ``FlamencoClient.health_check()``
        workers     – worker list, or a single ``{"error": ...}`` entry when
                      listing fails; empty when the manager is unavailable
        manager_url – the URL that was probed
    """
    raw = await _load_settings(db)
    manager_url = raw.get("flamenco_manager_url", "http://flamenco-manager:8080")

    from app.services.flamenco_client import get_flamenco_client
    client = get_flamenco_client(manager_url)

    # The Flamenco client is synchronous (blocking httpx calls with multi-second
    # timeouts), so run it in a worker thread instead of stalling the event loop.
    health = await asyncio.to_thread(client.health_check)
    workers: list[dict] = []

    if health["available"]:
        try:
            workers = await asyncio.to_thread(client.list_workers)
        except Exception as exc:
            # Best-effort: report the failure in-band, truncated to keep the
            # response small.
            workers = [{"error": str(exc)[:200]}]

    return {
        "manager": health,
        "workers": workers,
        "manager_url": manager_url,
    }
|
||||
|
||||
|
||||
class WorkerCountBody(BaseModel):
    """Request body for the Flamenco worker-count endpoint."""

    # Desired number of worker containers; the endpoint accepts 1–16.
    count: int
|
||||
|
||||
|
||||
@router.get("/settings/flamenco-worker-actual")
async def get_flamenco_worker_actual(admin: User = Depends(require_admin)):
    """Report how many flamenco-worker containers are running right now.

    The count is obtained from ``get_running_worker_count`` in the default
    executor. A negative count is reported as ``available: False``.
    """
    from app.services.docker_scaler import get_running_worker_count

    loop = asyncio.get_event_loop()
    running = await loop.run_in_executor(None, get_running_worker_count)
    docker_reachable = running >= 0
    return {"running": running, "available": docker_reachable}
|
||||
|
||||
|
||||
@router.post("/settings/flamenco-worker-count")
async def set_flamenco_worker_count(
    body: WorkerCountBody,
    admin: User = Depends(require_admin),
    db: AsyncSession = Depends(get_db),
):
    """Persist the requested worker count, then scale the containers to match.

    Raises:
        HTTPException(400): when ``body.count`` is outside 1–16.

    Returns a dict with ``count``, ``previous``, ``current``, ``delta`` and
    ``message``. If scaling raises, the saved setting is kept and the dict
    carries ``-1`` for ``previous``/``current`` plus a manual-recovery hint.
    """
    requested = body.count
    if requested < 1 or requested > 16:
        raise HTTPException(400, detail="Worker count must be 1–16")

    # The desired count is stored and committed before any scaling is attempted.
    await _save_setting(db, "flamenco_worker_count", str(requested))
    await db.commit()

    # scale_workers is a blocking SDK call, hence the executor.
    from app.services.docker_scaler import scale_workers
    try:
        loop = asyncio.get_event_loop()
        outcome = await loop.run_in_executor(None, scale_workers, requested)
        response = {"count": requested}
        for field in ("previous", "current", "delta", "message"):
            response[field] = outcome[field]
        return response
    except Exception as exc:
        # Scaling failed — answer with a warning but keep the saved setting.
        manual_hint = f"Run `docker compose up -d --scale flamenco-worker={requested}` manually."
        failure_note = f"Setting saved, but Docker scaling failed: {exc}. "
        return {
            "count": requested,
            "previous": -1,
            "current": -1,
            "delta": 0,
            "message": failure_note + manual_hint,
        }
|
||||
|
||||
@@ -920,44 +920,17 @@ async def cancel_line_render(
|
||||
if line.render_status not in ("processing", "pending"):
|
||||
raise HTTPException(400, detail=f"Line render_status is '{line.render_status}', nothing to cancel")
|
||||
|
||||
cancelled_backend = line.render_backend_used or "unknown"
|
||||
cancelled_backend = line.render_backend_used or "celery"
|
||||
errors: list[str] = []
|
||||
|
||||
# Cancel Flamenco job if applicable
|
||||
if line.render_backend_used == "flamenco" and line.flamenco_job_id:
|
||||
try:
|
||||
from app.services.flamenco_client import get_flamenco_client
|
||||
from app.models.system_setting import SystemSetting
|
||||
row = await db.execute(
|
||||
select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
|
||||
)
|
||||
setting = row.scalar_one_or_none()
|
||||
url = setting.value if setting else "http://flamenco-manager:8080"
|
||||
client = get_flamenco_client(url)
|
||||
client.cancel_job(line.flamenco_job_id)
|
||||
except Exception as exc:
|
||||
errors.append(f"Flamenco cancel failed: {str(exc)[:200]}")
|
||||
|
||||
# Revoke Celery task if applicable
|
||||
if line.render_backend_used == "celery" or not line.render_backend_used:
|
||||
try:
|
||||
from app.tasks.celery_app import celery_app
|
||||
celery_app.control.revoke(
|
||||
f"render-{line_id}", terminate=True, signal="SIGTERM"
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"Celery revoke failed: {str(exc)[:200]}")
|
||||
|
||||
# Also kill the Blender subprocess in the renderer microservice.
|
||||
# The job_id sent to blender-renderer equals the order_line_id.
|
||||
try:
|
||||
import httpx as _httpx
|
||||
_httpx.post(
|
||||
f"http://blender-renderer:8100/cancel/{line_id}",
|
||||
timeout=5.0,
|
||||
)
|
||||
except Exception:
|
||||
pass # best-effort; renderer may not be running a job for this line
|
||||
# Revoke Celery task (best-effort)
|
||||
try:
|
||||
from app.tasks.celery_app import celery_app
|
||||
celery_app.control.revoke(
|
||||
f"render-{line_id}", terminate=True, signal="SIGTERM"
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"Celery revoke failed: {str(exc)[:200]}")
|
||||
|
||||
# Mark line as cancelled
|
||||
from sqlalchemy import update as sql_update
|
||||
@@ -1013,47 +986,21 @@ async def cancel_order_renders(
|
||||
if not lines:
|
||||
raise HTTPException(400, detail="No active renders to cancel")
|
||||
|
||||
from app.services.flamenco_client import get_flamenco_client
|
||||
from app.models.system_setting import SystemSetting
|
||||
from app.tasks.celery_app import celery_app
|
||||
from sqlalchemy import update as sql_update
|
||||
|
||||
# Load Flamenco URL once
|
||||
row = await db.execute(
|
||||
select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
|
||||
)
|
||||
setting = row.scalar_one_or_none()
|
||||
flamenco_url = setting.value if setting else "http://flamenco-manager:8080"
|
||||
|
||||
now = datetime.utcnow()
|
||||
cancelled_count = 0
|
||||
errors: list[str] = []
|
||||
|
||||
for line in lines:
|
||||
# Cancel Flamenco job
|
||||
if line.render_backend_used == "flamenco" and line.flamenco_job_id:
|
||||
try:
|
||||
client = get_flamenco_client(flamenco_url)
|
||||
client.cancel_job(line.flamenco_job_id)
|
||||
except Exception as exc:
|
||||
errors.append(f"Line {line.id}: Flamenco cancel failed: {str(exc)[:100]}")
|
||||
|
||||
# Revoke Celery task + kill Blender subprocess in renderer service
|
||||
if line.render_backend_used == "celery" or not line.render_backend_used:
|
||||
try:
|
||||
celery_app.control.revoke(
|
||||
f"render-{line.id}", terminate=True, signal="SIGTERM"
|
||||
)
|
||||
except Exception:
|
||||
pass # Celery revoke is best-effort
|
||||
try:
|
||||
import httpx as _httpx
|
||||
_httpx.post(
|
||||
f"http://blender-renderer:8100/cancel/{line.id}",
|
||||
timeout=5.0,
|
||||
)
|
||||
except Exception:
|
||||
pass # best-effort
|
||||
# Revoke Celery task (best-effort)
|
||||
try:
|
||||
celery_app.control.revoke(
|
||||
f"render-{line.id}", terminate=True, signal="SIGTERM"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
await db.execute(
|
||||
sql_update(OrderLine)
|
||||
|
||||
@@ -38,7 +38,6 @@ class RenderJobEntry(BaseModel):
|
||||
output_type_name: str | None
|
||||
render_status: str
|
||||
render_backend_used: str | None
|
||||
flamenco_job_id: str | None
|
||||
render_started_at: str | None
|
||||
render_completed_at: str | None
|
||||
updated_at: str
|
||||
@@ -140,7 +139,6 @@ async def get_worker_activity(
|
||||
output_type_name=rl.output_type.name if rl.output_type else None,
|
||||
render_status=rl.render_status,
|
||||
render_backend_used=rl.render_backend_used,
|
||||
flamenco_job_id=rl.flamenco_job_id,
|
||||
render_started_at=rl.render_started_at.isoformat() if rl.render_started_at else None,
|
||||
render_completed_at=rl.render_completed_at.isoformat() if rl.render_completed_at else None,
|
||||
updated_at=rl.updated_at.isoformat(),
|
||||
|
||||
@@ -4,7 +4,7 @@ from sqlalchemy import String, DateTime, Boolean, Text, Integer, ForeignKey
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
from sqlalchemy.dialects.postgresql import UUID, JSONB
|
||||
|
||||
VALID_RENDER_BACKENDS = {"celery", "flamenco", "auto"}
|
||||
VALID_RENDER_BACKENDS = {"celery"}
|
||||
from app.database import Base
|
||||
|
||||
|
||||
|
||||
@@ -1,177 +0,0 @@
|
||||
"""Scale Flamenco worker containers via the Docker socket.
|
||||
|
||||
Uses the Docker Python SDK (docker>=6.1.0) to list, start, and stop containers.
|
||||
Requires /var/run/docker.sock to be mounted into the backend container.
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
COMPOSE_PROJECT = os.getenv("COMPOSE_PROJECT_NAME", "schaefflerautomat")
|
||||
SERVICE_NAME = "flamenco-worker"
|
||||
|
||||
|
||||
def _get_client():
    """Create a Docker SDK client from the process environment.

    The ``docker`` package is imported lazily so that importing this module
    does not require the SDK.
    """
    from docker import from_env

    return from_env()
|
||||
|
||||
|
||||
def get_worker_containers(client=None):
    """List every flamenco-worker container, running or stopped, ordered by name.

    Containers are selected by their compose project and service labels.
    A Docker client is created when *client* is not supplied.
    """
    docker_client = client if client is not None else _get_client()
    label_filter = [
        f"com.docker.compose.project={COMPOSE_PROJECT}",
        f"com.docker.compose.service={SERVICE_NAME}",
    ]
    found = list(
        docker_client.containers.list(all=True, filters={"label": label_filter})
    )
    found.sort(key=lambda container: container.name)
    return found
|
||||
|
||||
|
||||
def get_running_worker_count(client=None) -> int:
    """Count the flamenco-worker containers whose status is "running".

    Returns -1 (after logging a warning) when the containers cannot be
    listed for any reason.
    """
    try:
        docker_client = _get_client() if client is None else client
        running = [
            container
            for container in get_worker_containers(docker_client)
            if container.status == "running"
        ]
        return len(running)
    except Exception as exc:
        log.warning("docker_scaler: could not read worker count: %s", exc)
        return -1
|
||||
|
||||
|
||||
def _worker_number(container) -> int:
    """Return the compose container number of *container* (0 when unknown)."""
    labels = getattr(container, "labels", None) or {}
    number = str(labels.get("com.docker.compose.container-number", ""))
    if number.isdecimal():
        return int(number)
    # Fall back to the trailing digits of the container name.
    name = container.name or ""
    digits = name[len(name.rstrip("0123456789")):]
    return int(digits) if digits else 0


def _clone_mounts(attrs: dict) -> list:
    """Rebuild the mount list of a template container from its inspect data."""
    from docker.types import Mount

    mounts = []
    for m in attrs.get("Mounts") or []:
        mount_type = m.get("Type", "bind")
        # Named volumes are identified by "Name", bind mounts by "Source".
        source = m.get("Name", "") if mount_type == "volume" else m.get("Source", "")
        mounts.append(
            Mount(
                target=m["Destination"],
                source=source,
                type=mount_type,
                read_only=not m.get("RW", True),
            )
        )
    return mounts


def _clone_device_requests(attrs: dict):
    """Rebuild device requests (e.g. nvidia GPUs) of a template container, or None."""
    from docker.types import DeviceRequest

    raw_requests = (attrs.get("HostConfig") or {}).get("DeviceRequests") or []
    if not raw_requests:
        return None
    return [
        DeviceRequest(
            driver=dr.get("Driver", ""),
            count=dr.get("Count", -1),
            device_ids=dr.get("DeviceIDs") or [],
            capabilities=dr.get("Capabilities") or [],
            options=dr.get("Options") or {},
        )
        for dr in raw_requests
    ]


def scale_workers(target: int) -> dict:
    """Scale flamenco-worker containers to *target* count.

    Scaling down stops and removes the highest-numbered running workers.
    Scaling up clones image, environment, mounts, GPU device requests,
    networks and restart policy from an existing worker container.

    Returns a dict with keys:
        previous – containers running before
        current  – containers running after
        delta    – change (negative = stopped, positive = started)
        message  – human-readable summary

    Raises:
        RuntimeError: when scaling up and no worker container (running or
            stopped) exists to copy the configuration from.
    """
    client = _get_client()

    all_workers = get_worker_containers(client)
    running = [c for c in all_workers if c.status == "running"]
    previous = len(running)

    if target == previous:
        return {"previous": previous, "current": previous, "delta": 0,
                "message": f"Already at {previous} worker(s) — no change"}

    # ── Scale down ────────────────────────────────────────────────────────────
    if target < previous:
        # Stop highest-numbered containers first to minimise disruption.
        # Order numerically: a plain name sort would rank "-10" below "-2".
        to_stop = sorted(
            running, key=lambda c: (_worker_number(c), c.name), reverse=True
        )[: previous - target]
        for c in to_stop:
            log.info("docker_scaler: stopping %s", c.name)
            c.stop(timeout=20)
            c.remove()
        return {
            "previous": previous,
            "current": target,
            "delta": target - previous,
            "message": f"Stopped {len(to_stop)} worker(s): {[c.name for c in to_stop]}",
        }

    # ── Scale up ──────────────────────────────────────────────────────────────
    template = running[0] if running else (all_workers[0] if all_workers else None)
    if template is None:
        raise RuntimeError(
            "No existing flamenco-worker container found to clone configuration from. "
            "Ensure at least one worker container exists (even if stopped)."
        )

    attrs = template.attrs
    host_config = attrs.get("HostConfig") or {}
    image = attrs["Config"]["Image"]
    env = attrs["Config"].get("Env") or []
    mounts = _clone_mounts(attrs)
    device_requests = _clone_device_requests(attrs)

    # Network(s) the template is connected to
    network_names = list(
        ((attrs.get("NetworkSettings") or {}).get("Networks") or {}).keys()
    )

    restart_policy_name = (
        (host_config.get("RestartPolicy") or {}).get("Name") or "unless-stopped"
    )

    # Numbers and names already owned by any worker container, including stopped
    # ones. Reusing them would make containers.create() fail with a name conflict
    # when numbering is non-contiguous or stopped workers are still present.
    taken_numbers = {_worker_number(c) for c in all_workers}
    taken_names = {c.name for c in all_workers}

    started = []
    number = 0
    while len(started) < target - previous:
        number += 1
        new_name = f"{COMPOSE_PROJECT}-{SERVICE_NAME}-{number}"
        if number in taken_numbers or new_name in taken_names:
            continue

        labels = {
            "com.docker.compose.project": COMPOSE_PROJECT,
            "com.docker.compose.service": SERVICE_NAME,
            "com.docker.compose.container-number": str(number),
        }

        log.info("docker_scaler: creating %s from image %s", new_name, image)
        container = client.containers.create(
            image=image,
            name=new_name,
            environment=env,
            labels=labels,
            mounts=mounts,
            restart_policy={"Name": restart_policy_name},
            device_requests=device_requests,
        )

        for net_name in network_names:
            try:
                net = client.networks.get(net_name)
                net.connect(container)
                log.info("docker_scaler: connected %s to network %s", new_name, net_name)
            except Exception as exc:
                # Best-effort: a missing network should not abort the scale-up.
                log.warning("docker_scaler: could not connect to network %s: %s", net_name, exc)

        container.start()
        started.append(new_name)
        log.info("docker_scaler: started %s", new_name)

    return {
        "previous": previous,
        "current": target,
        "delta": target - previous,
        "message": f"Started {len(started)} new worker(s): {started}",
    }
|
||||
@@ -1,121 +0,0 @@
|
||||
"""Flamenco Manager REST API client.
|
||||
|
||||
Uses httpx (sync) for compatibility with Celery tasks and FastAPI endpoints.
|
||||
"""
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_TIMEOUT = 10.0
|
||||
|
||||
|
||||
class FlamencoClient:
|
||||
"""Thin wrapper around the Flamenco Manager v3 REST API."""
|
||||
|
||||
def __init__(self, manager_url: str):
|
||||
self.base_url = manager_url.rstrip("/")
|
||||
|
||||
def _url(self, path: str) -> str:
|
||||
return f"{self.base_url}{path}"
|
||||
|
||||
# ── Job management ──────────────────────────────────────────────────────
|
||||
|
||||
def submit_job(
|
||||
self,
|
||||
name: str,
|
||||
job_type: str,
|
||||
settings: dict[str, Any],
|
||||
metadata: dict[str, str] | None = None,
|
||||
priority: int = 50,
|
||||
) -> dict:
|
||||
"""Submit a new render job to Flamenco Manager.
|
||||
|
||||
Returns the created job dict (includes 'id').
|
||||
"""
|
||||
payload = {
|
||||
"name": name,
|
||||
"type": job_type,
|
||||
"submitter_platform": "linux",
|
||||
"settings": settings,
|
||||
"metadata": metadata or {},
|
||||
"priority": priority,
|
||||
}
|
||||
resp = httpx.post(
|
||||
self._url("/api/v3/jobs"),
|
||||
json=payload,
|
||||
timeout=DEFAULT_TIMEOUT,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
def get_job(self, job_id: str) -> dict:
|
||||
"""Get job details by ID."""
|
||||
resp = httpx.get(
|
||||
self._url(f"/api/v3/jobs/{job_id}"),
|
||||
timeout=DEFAULT_TIMEOUT,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
def cancel_job(self, job_id: str) -> None:
|
||||
"""Request cancellation of a job."""
|
||||
resp = httpx.post(
|
||||
self._url(f"/api/v3/jobs/{job_id}/setstatus"),
|
||||
json={"status": "cancel-requested"},
|
||||
timeout=DEFAULT_TIMEOUT,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
|
||||
# ── Workers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def list_workers(self) -> list[dict]:
|
||||
"""List all registered workers."""
|
||||
resp = httpx.get(
|
||||
self._url("/api/v3/worker-mgt/workers"),
|
||||
timeout=DEFAULT_TIMEOUT,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
return data.get("workers", data) if isinstance(data, dict) else data
|
||||
|
||||
# ── Farm status ─────────────────────────────────────────────────────────
|
||||
|
||||
def get_farm_status(self) -> dict:
|
||||
"""Get overall farm status from the Manager."""
|
||||
resp = httpx.get(
|
||||
self._url("/api/v3/configuration"),
|
||||
timeout=DEFAULT_TIMEOUT,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
def health_check(self) -> dict:
|
||||
"""Check if the Flamenco Manager is reachable and return version info."""
|
||||
try:
|
||||
resp = httpx.get(
|
||||
self._url("/api/v3/version"),
|
||||
timeout=5.0,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
return {
|
||||
"available": True,
|
||||
"version": data.get("version", "unknown"),
|
||||
"name": data.get("name", "Flamenco"),
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.warning(f"Flamenco health check failed: {exc}")
|
||||
return {
|
||||
"available": False,
|
||||
"version": None,
|
||||
"name": None,
|
||||
"error": str(exc)[:200],
|
||||
}
|
||||
|
||||
|
||||
def get_flamenco_client(manager_url: str) -> FlamencoClient:
    """Build a new FlamencoClient bound to *manager_url*."""
    client = FlamencoClient(manager_url)
    return client
|
||||
@@ -1,12 +1,7 @@
|
||||
"""Render dispatcher — routes render jobs to Celery or Flamenco.
|
||||
"""Render dispatcher — routes render jobs to Celery.
|
||||
|
||||
Backend selection priority:
|
||||
1. OutputType.render_backend per-type override ("celery" / "flamenco")
|
||||
2. OutputType.is_animation — animations default to Flamenco
|
||||
3. System setting render_backend — global default ("celery" / "flamenco" / "auto")
|
||||
4. "auto" mode: stills → Celery, animations → Flamenco
|
||||
All renders run via Celery workers (Flamenco removed in v2 refactor).
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
@@ -14,7 +9,6 @@ from sqlalchemy import select, update as sql_update
|
||||
from sqlalchemy.orm import Session, joinedload
|
||||
|
||||
from app.models.order_line import OrderLine
|
||||
from app.models.output_type import OutputType
|
||||
from app.models.product import Product
|
||||
from app.models.system_setting import SystemSetting
|
||||
|
||||
@@ -29,113 +23,11 @@ def _load_setting(session: Session, key: str, default: str = "") -> str:
|
||||
return row.value if row else default
|
||||
|
||||
|
||||
def resolve_backend(output_type: OutputType | None, system_backend: str) -> str:
    """Pick the render backend ("celery" or "flamenco") for an output type.

    Resolution order, first match wins:
      1. No output type → "celery".
      2. ``output_type.render_backend`` when it is "celery" or "flamenco".
      3. ``system_backend`` when it is "celery" or "flamenco".
      4. Anything else (e.g. "auto"): "flamenco" for animations,
         "celery" for stills.
    """
    if output_type is None:
        return "celery"

    concrete_backends = ("celery", "flamenco")
    # The per-type override is consulted before the global system setting.
    for candidate in (output_type.render_backend, system_backend):
        if candidate in concrete_backends:
            return candidate

    # Auto mode: decide by output kind.
    return "flamenco" if output_type.is_animation else "celery"
|
||||
|
||||
|
||||
def build_flamenco_job_settings(
    output_type: OutputType,
    product: Product,
    step_path: str,
    output_dir: str,
    system_settings: dict[str, str],
    lighting_only: bool = False,
    shadow_catcher: bool = False,
    camera_orbit: bool = True,
    cycles_device: str = "auto",
    rotation_x: float = 0.0,
    rotation_y: float = 0.0,
    rotation_z: float = 0.0,
) -> dict:
    """Build the Flamenco job settings dict from output type and product metadata.

    Per-output-type ``render_settings`` win over ``system_settings``; built-in
    defaults apply when neither provides a value.

    Args:
        output_type: Object exposing ``render_settings``, ``is_animation``,
            ``output_format`` and (optionally) ``transparent_bg``.
        product: Object exposing ``cad_file`` and ``cad_part_materials``.
        step_path: Path of the STEP file, passed through as ``step_path``.
        output_dir: Directory the render output is written to.
        system_settings: System-wide settings keyed by name. Empty-string
            values are treated the same as missing keys.
        lighting_only, shadow_catcher, camera_orbit, cycles_device,
        rotation_x, rotation_y, rotation_z: Passed through into the result
            (``camera_orbit`` only for animations).

    Returns:
        A dict of job settings. Animations additionally carry the turntable
        keys (``output_dir``, ``output_name``, ``frame_count``, ...); stills
        carry ``output_path`` instead.

    Raises:
        ValueError: If the system-wide samples setting is non-empty but not
            an integer.
    """
    render_settings = output_type.render_settings or {}

    # Callers may load missing system settings as "" rather than omitting the
    # key, so an empty value has to fall through to the built-in default.
    if "engine" in render_settings:
        engine = render_settings["engine"]
    else:
        engine = system_settings.get("blender_engine") or "cycles"

    samples_key = f"blender_{engine}_samples"
    if "samples" in render_settings:
        samples = render_settings["samples"]
    else:
        # Only parse the system value when it is actually needed; int("")
        # would otherwise raise even though an override exists.
        samples = int(system_settings.get(samples_key) or "256")

    if "stl_quality" in render_settings:
        stl_quality = render_settings["stl_quality"]
    else:
        stl_quality = system_settings.get("stl_quality") or "low"

    # Animations default to 1920x1080, stills to 1024x1024.
    width = render_settings.get("width", 1920 if output_type.is_animation else 1024)
    height = render_settings.get("height", 1080 if output_type.is_animation else 1024)

    part_colors = {}
    part_names_ordered = []
    if product.cad_file and product.cad_file.parsed_objects:
        part_names_ordered = product.cad_file.parsed_objects.get("objects", [])
    materials_source = product.cad_part_materials
    if materials_source:
        from app.services.step_processor import build_part_colors
        part_colors = build_part_colors(part_names_ordered, materials_source)

    transparent_bg = bool(getattr(output_type, "transparent_bg", False))

    settings = {
        "step_path": step_path,
        "engine": engine,
        "samples": samples,
        "stl_quality": stl_quality,
        "width": width,
        "height": height,
        "part_colors_json": json.dumps(part_colors),
        "transparent_bg": transparent_bg,
        "template_path": "",
        "target_collection": "Product",
        "material_library_path": "",
        "material_map_json": "{}",
        "part_names_ordered_json": json.dumps(part_names_ordered),
        "lighting_only": lighting_only,
        "shadow_catcher": shadow_catcher,
        "cycles_device": cycles_device,
        "rotation_x": rotation_x,
        "rotation_y": rotation_y,
        "rotation_z": rotation_z,
    }

    # Denoising options are always sent as strings; "" stands for "not set".
    for dk in ('noise_threshold', 'denoiser', 'denoising_input_passes',
               'denoising_prefilter', 'denoising_quality', 'denoising_use_gpu'):
        settings[dk] = str(render_settings.get(dk, ""))

    if output_type.is_animation:
        # Turntable-specific settings
        output_name = render_settings.get("output_name", "turntable")
        settings["output_dir"] = output_dir
        settings["output_name"] = output_name
        settings["frame_count"] = render_settings.get("frame_count", 120)
        settings["fps"] = render_settings.get("fps", 30)
        settings["turntable_degrees"] = render_settings.get("turntable_degrees", 360)
        settings["turntable_axis"] = render_settings.get("turntable_axis", "world_z")
        settings["bg_color"] = render_settings.get("bg_color", "")
        settings["camera_orbit"] = camera_orbit
    else:
        # Still-specific settings
        ext = output_type.output_format or "png"
        settings["output_path"] = f"{output_dir}/render.{ext}"

    return settings
|
||||
|
||||
|
||||
def dispatch_render(order_line_id: str) -> dict:
|
||||
"""Route a render job to Celery or Flamenco based on configuration.
|
||||
"""Dispatch a render job to Celery.
|
||||
|
||||
Must be called from a sync context (Celery task or sync wrapper).
|
||||
Returns {"backend": "celery"|"flamenco", "job_ref": str}.
|
||||
Returns {"backend": "celery", "job_ref": str}.
|
||||
"""
|
||||
from app.config import settings as app_settings
|
||||
from app.services.render_log import emit, clear
|
||||
@@ -179,196 +71,26 @@ def dispatch_render(order_line_id: str) -> dict:
|
||||
|
||||
cad_name = line.product.cad_file.original_name if line.product.cad_file else "?"
|
||||
emit(order_line_id, f"CAD file: {cad_name}")
|
||||
emit(order_line_id, "Dispatching to Celery render worker")
|
||||
|
||||
# Load system settings
|
||||
system_backend = _load_setting(session, "render_backend", "celery")
|
||||
flamenco_url = _load_setting(session, "flamenco_manager_url", "http://flamenco-manager:8080")
|
||||
|
||||
backend = resolve_backend(line.output_type, system_backend)
|
||||
emit(order_line_id, f"Resolved backend: {backend}")
|
||||
|
||||
# Mark as processing
|
||||
now = datetime.utcnow()
|
||||
session.execute(
|
||||
sql_update(OrderLine)
|
||||
.where(OrderLine.id == line.id)
|
||||
.values(
|
||||
render_status="processing",
|
||||
render_backend_used=backend,
|
||||
render_backend_used="celery",
|
||||
render_started_at=now,
|
||||
)
|
||||
)
|
||||
session.commit()
|
||||
|
||||
if backend == "flamenco":
|
||||
emit(order_line_id, f"Submitting job to Flamenco Manager ({flamenco_url})")
|
||||
result = _dispatch_flamenco(session, line, flamenco_url)
|
||||
if result.get("error"):
|
||||
emit(order_line_id, f"Flamenco submit failed: {result['error']}", "error")
|
||||
else:
|
||||
emit(order_line_id, f"Flamenco job submitted: {result.get('job_ref', '?')}")
|
||||
return result
|
||||
else:
|
||||
emit(order_line_id, "Dispatching to Celery render worker")
|
||||
return _dispatch_celery(order_line_id)
|
||||
|
||||
engine_db.dispose()
|
||||
return _dispatch_celery(order_line_id)
|
||||
|
||||
|
||||
def _dispatch_celery(order_line_id: str) -> dict:
    """Enqueue the Celery render task for one order line.

    Returns a dict with ``backend`` set to ``"celery"`` and ``job_ref`` set to
    the id of the queued task.
    """
    # Imported here rather than at module level, as in the original.
    from app.tasks.step_tasks import render_order_line_task

    queued = render_order_line_task.delay(order_line_id)
    task_id = queued.id
    return dict(backend="celery", job_ref=task_id)
|
||||
|
||||
|
||||
def _dispatch_flamenco(session: Session, line: OrderLine, flamenco_url: str) -> dict:
    """Submit a render job for ``line`` to the Flamenco Manager.

    Builds the job settings from the line's output type, product, render
    position and render template, submits the job and stores the returned job
    id on the order line.

    Args:
        session: Open SQLAlchemy session used for settings lookups and for
            updating the order line.
        line: The order line to render.
        flamenco_url: Base URL of the Flamenco Manager.

    Returns:
        ``{"backend": "flamenco", "job_ref": <job id>}`` on success. On
        failure the order line is marked ``failed`` and the dict carries an
        empty ``job_ref`` plus an ``error`` message.
    """
    import re
    from app.services.flamenco_client import get_flamenco_client

    # Snapshot the primary key: a rollback in the error path expires ORM
    # attributes, and the failure handler must not depend on reloading them.
    line_id = line.id

    # Load all needed system settings (missing keys come back as "").
    all_keys = ["blender_engine", "blender_cycles_samples", "blender_eevee_samples", "stl_quality", "cycles_device"]
    sys_settings = {key: _load_setting(session, key, "") for key in all_keys}

    output_type = line.output_type
    product = line.product
    cad_file = product.cad_file

    # Load render_position for rotation values
    rotation_x = rotation_y = rotation_z = 0.0
    if line.render_position_id:
        from app.models.render_position import ProductRenderPosition
        rp = session.get(ProductRenderPosition, line.render_position_id)
        if rp:
            rotation_x, rotation_y, rotation_z = rp.rotation_x, rp.rotation_y, rp.rotation_z

    # Flamenco mounts the uploads volume at /shared, backend uses /app/uploads
    raw_path = cad_file.stored_path if cad_file else ""
    step_path = raw_path.replace("/app/uploads/", "/shared/") if raw_path else ""
    output_dir = f"/shared/renders/{line_id}"

    job_type = "schaeffler-turntable" if (output_type and output_type.is_animation) else "schaeffler-still"

    # Resolve render template + material library BEFORE building job settings
    # (template.lighting_only is needed by build_flamenco_job_settings)
    from app.services.template_service import resolve_template, get_material_library_path

    category_key = product.category_key if product else None
    ot_id = str(line.output_type_id) if line.output_type_id else None
    template = resolve_template(category_key=category_key, output_type_id=ot_id)
    material_library = get_material_library_path()

    # Resolve cycles_device: per-output-type override wins, fall back to system setting
    ot_cycles_device = output_type.cycles_device if output_type else None
    effective_cycles_device = ot_cycles_device or sys_settings.get("cycles_device", "gpu") or "gpu"

    settings = build_flamenco_job_settings(
        output_type=output_type,
        product=product,
        step_path=step_path,
        output_dir=output_dir,
        system_settings=sys_settings,
        lighting_only=bool(template.lighting_only) if template else False,
        shadow_catcher=bool(template.shadow_catcher_enabled) if template else False,
        camera_orbit=bool(template.camera_orbit) if template else True,
        cycles_device=effective_cycles_device,
        rotation_x=rotation_x,
        rotation_y=rotation_y,
        rotation_z=rotation_z,
    )

    if template:
        # Remap path for Flamenco shared volume
        tmpl_path = template.blend_file_path.replace("/app/uploads/", "/shared/")
        settings["template_path"] = tmpl_path
        settings["target_collection"] = template.target_collection
        logger.info(
            f"Flamenco job: using render template '{template.name}' "
            f"(id={template.id}, path={tmpl_path}, collection={template.target_collection})"
        )
    else:
        logger.info(
            f"Flamenco job: no render template found for "
            f"category_key={category_key!r}, output_type_id={ot_id!r} — using factory settings"
        )

    # Material library + material map: send whenever library exists and product
    # has material assignments — works with or without a render template.
    # When a template is present, only apply if material_replace_enabled is set.
    materials_source = product.cad_part_materials
    use_materials = bool(material_library and materials_source)
    if template and not template.material_replace_enabled:
        use_materials = False

    if use_materials:
        mat_lib_path = material_library.replace("/app/uploads/", "/shared/")
        settings["material_library_path"] = mat_lib_path
        mat_map = {
            m["part_name"]: m["material"]
            for m in materials_source
            if m.get("part_name") and m.get("material")
        }
        # Resolve raw material names to SCHAEFFLER library names via aliases
        from app.services.material_service import resolve_material_map
        mat_map = resolve_material_map(mat_map)
        settings["material_map_json"] = json.dumps(mat_map)

    # Output naming: meaningful filename instead of generic render.ext
    def _sanitize(s: str) -> str:
        # Replace anything that is not a word char, dash or dot; cap the length.
        return re.sub(r'[^\w\-.]', '_', s.strip())[:100]

    product_name = product.name or product.pim_id or "product"
    ot_name = output_type.name if output_type else "render"

    if not (output_type and output_type.is_animation):
        # Parenthesised to make the precedence explicit (same result as before).
        ext = (output_type.output_format or "png") if output_type else "png"
        filename = f"{_sanitize(product_name)}_{_sanitize(ot_name)}.{ext}"
        settings["output_path"] = f"{output_dir}/{filename}"

    metadata = {
        "order_line_id": str(line_id),
        "order_id": str(line.order_id),
        "product_name": product.name or "",
        "output_type": output_type.name if output_type else "",
        "category": product.category_key or "",
    }

    job_name = f"{product.name or product.pim_id} - {output_type.name if output_type else 'render'}"

    try:
        client = get_flamenco_client(flamenco_url)
        job = client.submit_job(
            name=job_name[:200],
            job_type=job_type,
            settings=settings,
            metadata=metadata,
        )
        job_id = job.get("id", "")

        # Save flamenco_job_id
        session.execute(
            sql_update(OrderLine)
            .where(OrderLine.id == line_id)
            .values(flamenco_job_id=job_id)
        )
        session.commit()

        logger.info(f"Flamenco job submitted: {job_id} for OrderLine {line_id}")
        return {"backend": "flamenco", "job_ref": job_id}

    except Exception as exc:
        logger.error(f"Flamenco submit failed for OrderLine {line_id}: {exc}")
        # The failure may have come from the DB write above, which leaves the
        # session's transaction unusable until it is rolled back. Without this
        # the status update below would fail as well and the line would stay
        # in "processing".
        session.rollback()
        session.execute(
            sql_update(OrderLine)
            .where(OrderLine.id == line_id)
            .values(
                render_status="failed",
                render_completed_at=datetime.utcnow(),
                render_log={"error": f"Flamenco submit failed: {str(exc)[:500]}"},
            )
        )
        session.commit()
        return {"backend": "flamenco", "job_ref": "", "error": str(exc)}
|
||||
|
||||
@@ -5,7 +5,7 @@ celery_app = Celery(
|
||||
"schaefflerautomat",
|
||||
broker=settings.redis_url,
|
||||
backend=settings.redis_url,
|
||||
include=["app.tasks.step_tasks", "app.tasks.ai_tasks", "app.tasks.flamenco_tasks"],
|
||||
include=["app.tasks.step_tasks", "app.tasks.ai_tasks"],
|
||||
)
|
||||
|
||||
celery_app.conf.update(
|
||||
@@ -17,20 +17,6 @@ celery_app.conf.update(
|
||||
task_routes={
|
||||
"app.tasks.step_tasks.*": {"queue": "step_processing"},
|
||||
"app.tasks.ai_tasks.*": {"queue": "ai_validation"},
|
||||
"app.tasks.flamenco_tasks.*": {"queue": "step_processing"},
|
||||
},
|
||||
beat_schedule={
|
||||
"poll-flamenco-jobs": {
|
||||
"task": "app.tasks.flamenco_tasks.poll_flamenco_jobs",
|
||||
"schedule": 10.0, # every 10 seconds
|
||||
# Discard if not consumed before the next run; prevents queue build-up
|
||||
# when workers are busy with long-running STEP/render tasks.
|
||||
"options": {"expires": 9},
|
||||
},
|
||||
"check-stalled-renders": {
|
||||
"task": "app.tasks.flamenco_tasks.check_stalled_renders",
|
||||
"schedule": 300.0, # every 5 minutes
|
||||
"options": {"expires": 290},
|
||||
},
|
||||
},
|
||||
beat_schedule={},
|
||||
)
|
||||
|
||||
@@ -1,335 +0,0 @@
|
||||
"""Celery tasks for polling Flamenco job status and watchdog recovery."""
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from app.tasks.celery_app import celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Flamenco status → our render_status mapping
|
||||
# Maps each Flamenco job status to the render_status stored on an OrderLine.
# Statuses not listed here are treated as "processing" by the poller's lookup.
FLAMENCO_STATUS_MAP = {
    "queued": "processing",
    "active": "processing",
    "completed": "completed",
    "failed": "failed",
    "canceled": "failed",
    "cancel-requested": "processing",
    "paused": "processing",
    # Listed explicitly to match the stalled-render watchdog, which also
    # treats "pause-requested" as an in-flight status.
    "pause-requested": "processing",
}
|
||||
|
||||
|
||||
@celery_app.task(name="app.tasks.flamenco_tasks.poll_flamenco_jobs", queue="step_processing")
def poll_flamenco_jobs():
    """Poll Flamenco Manager for active render jobs and update OrderLine status.

    Runs on a Celery Beat schedule (every 10 seconds).

    Uses a Redis lock (TTL=9s) to ensure at most one poll executes per 10-second
    window. When the queue backs up with many duplicates (e.g. all workers are
    busy with long STEP/render tasks), duplicates fail to acquire the lock and
    return immediately — draining the queue without doing redundant work.

    Returns:
        ``{"skipped": "deduplicated"}`` when another poll holds the lock,
        ``{"polled": 0}`` when no Flamenco lines are processing, otherwise
        ``{"polled": <lines checked>, "updated": <lines changed>}``.
    """
    import redis as redis_lib
    from app.config import settings as app_settings

    # Deduplicate: skip if a poll ran within the last 9 seconds
    try:
        r = redis_lib.from_url(app_settings.redis_url)
        acquired = r.set("flamenco_poll_lock", "1", nx=True, ex=9)
        if not acquired:
            return {"skipped": "deduplicated"}
    except Exception as exc:
        # Best effort: without Redis we still poll, just without deduplication.
        logger.debug("Flamenco poll lock unavailable (%s) — polling anyway", exc)

    from sqlalchemy import create_engine, select, update as sql_update
    from sqlalchemy.orm import Session
    from app.models.order_line import OrderLine
    from app.models.system_setting import SystemSetting
    from app.services.flamenco_client import get_flamenco_client

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)

    # Track orders whose lines transitioned to a terminal state
    completed_order_ids = set()

    # try/finally so the engine's connections are released on every exit
    # path, including an unexpected error while polling.
    try:
        with Session(engine) as session:
            # Load Flamenco Manager URL
            row = session.execute(
                select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
            ).scalar_one_or_none()
            manager_url = row.value if row else "http://flamenco-manager:8080"

            # Find all OrderLines dispatched to Flamenco that are still processing
            lines = session.execute(
                select(OrderLine).where(
                    OrderLine.render_backend_used == "flamenco",
                    OrderLine.render_status == "processing",
                    OrderLine.flamenco_job_id.isnot(None),
                )
            ).scalars().all()

            if not lines:
                return {"polled": 0}

            client = get_flamenco_client(manager_url)
            updated = 0

            for line in lines:
                try:
                    job = client.get_job(line.flamenco_job_id)
                    flamenco_status = job.get("status", "")
                    our_status = FLAMENCO_STATUS_MAP.get(flamenco_status, "processing")

                    if our_status == line.render_status:
                        continue  # No change

                    updates = {"render_status": our_status}

                    if our_status == "completed":
                        updates["render_completed_at"] = datetime.utcnow()
                        # Record the job activity, if any, in the render log
                        activity = job.get("activity", "")
                        if activity:
                            updates["render_log"] = {
                                "flamenco_job_id": line.flamenco_job_id,
                                "flamenco_status": flamenco_status,
                                "activity": activity,
                            }
                        # Set result path based on job type
                        job_type = job.get("type", "")
                        if job_type == "schaeffler-turntable":
                            output_dir = job.get("settings", {}).get("output_dir", "")
                            output_name = job.get("settings", {}).get("output_name", "turntable")
                            updates["result_path"] = f"{output_dir}/{output_name}.mp4"
                        elif job_type == "schaeffler-still":
                            updates["result_path"] = job.get("settings", {}).get("output_path", "")

                    elif our_status == "failed":
                        updates["render_completed_at"] = datetime.utcnow()
                        updates["render_log"] = {
                            "flamenco_job_id": line.flamenco_job_id,
                            "flamenco_status": flamenco_status,
                            "error": job.get("activity", "Job failed"),
                        }

                    session.execute(
                        sql_update(OrderLine)
                        .where(OrderLine.id == line.id)
                        .values(**updates)
                    )
                    updated += 1
                    logger.info(
                        f"Flamenco job {line.flamenco_job_id}: "
                        f"{flamenco_status} → render_status={our_status}"
                    )

                    # Track orders with lines that reached a terminal state
                    if our_status in ("completed", "failed"):
                        completed_order_ids.add(str(line.order_id))

                except Exception as exc:
                    # One unreachable or broken job must not stop the others.
                    logger.warning(
                        f"Failed to poll Flamenco job {line.flamenco_job_id}: {exc}"
                    )

            if updated:
                session.commit()
    finally:
        engine.dispose()

    # Auto-advance orders if all renderable lines are done
    if completed_order_ids:
        from app.services.order_status_service import check_order_completion
        for oid in completed_order_ids:
            check_order_completion(oid)

    return {"polled": len(lines), "updated": updated}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stalled-render watchdog
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@celery_app.task(name="app.tasks.flamenco_tasks.check_stalled_renders", queue="step_processing")
def check_stalled_renders():
    """Watchdog: detect and re-dispatch render jobs stuck in 'processing'.

    Runs on a Celery Beat schedule (every 5 minutes).

    After a docker restart, Celery workers lose in-flight tasks — the DB still
    shows render_status='processing' indefinitely. This task:

    * For **Celery** lines: uses Celery inspect to check whether any worker is
      still actively executing the task. If not (e.g. after a restart), and
      the job has been stuck longer than ``render_stall_timeout_minutes``
      (default: 120 min), it is reset to 'pending' and re-dispatched.

    * For **Flamenco** lines: queries the Flamenco Manager. If the manager
      reports the job as still active the line is left alone; if the job is
      in any other state it is re-dispatched. If the manager cannot be
      reached the line is skipped.

    Returns:
        A dict with ``checked`` (stalled lines found), ``restarted`` (lines
        successfully re-dispatched) and ``timeout_minutes``.
    """
    from sqlalchemy import create_engine, select, update as sql_update
    from sqlalchemy.orm import Session
    from app.config import settings as app_settings
    from app.models.order_line import OrderLine
    from app.models.system_setting import SystemSetting

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    engine = create_engine(sync_url)

    with Session(engine) as session:
        # ── Read timeout from system settings ────────────────────────────────
        row = session.execute(
            select(SystemSetting).where(SystemSetting.key == "render_stall_timeout_minutes")
        ).scalar_one_or_none()
        try:
            timeout_minutes = int(row.value) if row else 120
        except (ValueError, TypeError):
            timeout_minutes = 120

        cutoff = datetime.utcnow() - timedelta(minutes=timeout_minutes)

        stalled_lines = session.execute(
            select(OrderLine).where(
                OrderLine.render_status == "processing",
                OrderLine.render_started_at.isnot(None),
                OrderLine.render_started_at < cutoff,
            )
        ).scalars().all()

        if not stalled_lines:
            engine.dispose()
            return {"checked": 0, "restarted": 0, "timeout_minutes": timeout_minutes}

        logger.info(
            "[watchdog] Found %d stalled render(s) older than %d minutes",
            len(stalled_lines), timeout_minutes,
        )

        # ── Build set of order_line_ids actively running on Celery workers ───
        active_celery_line_ids: set[str] = set()
        inspect_ok = False
        try:
            inspect = celery_app.control.inspect(timeout=2)
            active_tasks = inspect.active() or {}
            for worker_tasks in active_tasks.values():
                for task_info in (worker_tasks or []):
                    args = task_info.get("args", [])
                    if args:
                        # The first positional arg is taken as the order line id.
                        active_celery_line_ids.add(str(args[0]))
            inspect_ok = True
        except Exception as exc:
            logger.warning(
                "[watchdog] Celery inspect failed (%s) — will re-dispatch all timed-out Celery jobs",
                exc,
            )

        # ── Load Flamenco Manager URL ─────────────────────────────────────────
        manager_url = "http://flamenco-manager:8080"
        try:
            url_row = session.execute(
                select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url")
            ).scalar_one_or_none()
            if url_row:
                manager_url = url_row.value
        except Exception:
            # Best effort: keep the default URL if the lookup fails.
            pass

        # ── Decide which lines to restart ────────────────────────────────────
        to_restart: list[OrderLine] = []

        for line in stalled_lines:
            line_id = str(line.id)

            if line.flamenco_job_id:
                # Flamenco job: verify with manager before re-dispatching
                try:
                    from app.services.flamenco_client import get_flamenco_client
                    client = get_flamenco_client(manager_url)
                    job = client.get_job(line.flamenco_job_id)
                    flamenco_status = job.get("status", "")
                    if flamenco_status in (
                        "active", "queued", "paused",
                        "pause-requested", "cancel-requested",
                    ):
                        logger.info(
                            "[watchdog] Flamenco job %s is still %s — skipping line %s",
                            line.flamenco_job_id, flamenco_status, line_id,
                        )
                        continue
                    logger.info(
                        "[watchdog] Flamenco job %s status=%r → re-dispatching line %s",
                        line.flamenco_job_id, flamenco_status, line_id,
                    )
                except Exception as exc:
                    # Manager unreachable — skip to avoid false restarts
                    logger.warning(
                        "[watchdog] Cannot reach Flamenco for job %s (%s) — skipping line %s",
                        line.flamenco_job_id, exc, line_id,
                    )
                    continue
            else:
                # Celery job: skip if still actively running on a worker
                if inspect_ok and line_id in active_celery_line_ids:
                    logger.info(
                        "[watchdog] Celery render for line %s still active — skipping", line_id
                    )
                    continue
                logger.info(
                    "[watchdog] Celery render for line %s not found in active tasks — re-dispatching",
                    line_id,
                )

            to_restart.append(line)

        if not to_restart:
            engine.dispose()
            return {
                "checked": len(stalled_lines),
                "restarted": 0,
                "timeout_minutes": timeout_minutes,
            }

        # Snapshot the ids as plain strings while the instances are still
        # attached. commit() expires their attributes and leaving the session
        # detaches them, so reading line.id later would raise
        # DetachedInstanceError.
        restart_ids = [str(line.id) for line in to_restart]

        # ── Reset stalled lines to pending ───────────────────────────────────
        for line in to_restart:
            session.execute(
                sql_update(OrderLine)
                .where(OrderLine.id == line.id)
                .values(
                    render_status="pending",
                    render_started_at=None,
                    render_backend_used=None,
                    flamenco_job_id=None,
                    render_log={
                        "watchdog": (
                            f"Auto-restarted after {timeout_minutes} min stall "
                            f"(previous backend: {line.render_backend_used or 'unknown'})"
                        )
                    },
                )
            )
        session.commit()

    engine.dispose()

    # ── Re-dispatch outside DB session ───────────────────────────────────────
    from app.services.render_dispatcher import dispatch_render
    restarted = 0
    for line_id in restart_ids:
        try:
            dispatch_render(line_id)
            restarted += 1
            logger.info("[watchdog] Re-dispatched render for order line %s", line_id)
        except Exception as exc:
            logger.error(
                "[watchdog] Failed to re-dispatch line %s: %s — left as pending", line_id, exc
            )

    return {
        "checked": len(stalled_lines),
        "restarted": restarted,
        "timeout_minutes": timeout_minutes,
    }
|
||||
@@ -170,39 +170,6 @@ services:
|
||||
- ./threejs-renderer:/app
|
||||
restart: unless-stopped
|
||||
|
||||
flamenco-manager:
|
||||
build: ./flamenco
|
||||
environment:
|
||||
- FLAMENCO_MODE=manager
|
||||
ports:
|
||||
- "8080:8080"
|
||||
volumes:
|
||||
- uploads:/shared
|
||||
- flamenco-data:/data
|
||||
- ./flamenco/scripts:/opt/flamenco/scripts
|
||||
restart: unless-stopped
|
||||
|
||||
flamenco-worker:
|
||||
build: ./flamenco
|
||||
environment:
|
||||
- FLAMENCO_MODE=worker
|
||||
- FLAMENCO_MANAGER_URL=http://flamenco-manager:8080
|
||||
volumes:
|
||||
- uploads:/shared
|
||||
- /opt/blender:/opt/blender:ro
|
||||
- ./flamenco/scripts:/opt/flamenco/scripts
|
||||
depends_on:
|
||||
- flamenco-manager
|
||||
deploy:
|
||||
replicas: 1
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu, compute, utility, graphics]
|
||||
restart: unless-stopped
|
||||
|
||||
frontend:
|
||||
build:
|
||||
context: ./frontend
|
||||
@@ -220,4 +187,3 @@ services:
|
||||
volumes:
|
||||
pgdata:
|
||||
uploads:
|
||||
flamenco-data:
|
||||
|
||||
Reference in New Issue
Block a user