diff --git a/.gitignore b/.gitignore
index e9efd5c..ef978ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,39 @@ node_modules/
 *.log
 core
 /blender-renderer/core
+
+# Python cache
+__pycache__/
+*.py[cod]
+*.pyo
+
+# Node / Vite build output
+dist/
+node_modules/
+
+# Celery beat schedule
+celerybeat-schedule
+celerybeat.pid
+
+# Test cache
+.pytest_cache/
+.coverage
+
+# IDE
+.vscode/
+.idea/
+
+# Excel lock files
+~$*
+.~lock.*#
+
+# Exclude customer data
+*.stp
+*.step
+*.stl
+*.xls
+*.xslx
+*.csv
+*.xlsx
+
+*.blend1
diff --git a/PLAN.md b/PLAN.md
new file mode 100644
index 0000000..be7b4c3
--- /dev/null
+++ b/PLAN.md
@@ -0,0 +1,1406 @@
+# Refactor Plan: Schaeffler Automat v2
+
+**Created:** 2026-03-05
+**Updated:** 2026-03-06 (decisions confirmed, implementation started)
+**Status:** IN PROGRESS (Phase A active)
+**Branch:** `refactor/render-pipeline` → target: new branch `refactor/v2`
+
+---
+
+## Table of Contents
+
+1. [Goal Summary](#1-goal-summary)
+2. [Architecture Analysis: Current vs. Target](#2-architecture-analysis-current-vs-target)
+3. [Architecture Decisions (ADRs)](#3-architecture-decisions-adrs)
+4. [What Gets Removed / Replaced (with Risks)](#4-what-gets-removed--replaced-with-risks)
+5. [What Stays and Gets Extended](#5-what-stays-and-gets-extended)
+6. [New Components](#6-new-components)
+7. [Phase Plan with Tasks](#7-phase-plan-with-tasks)
+8. [Database Migration Overview](#8-database-migration-overview)
+9. [QC Gates and Test Checklist](#9-qc-gates-and-test-checklist)
+10. [Open Decisions](#10-open-decisions)
+
+---
+
+## 1. Goal Summary
+
+The system grows from a single-customer render tool into a **production-grade multi-tenant render platform**:
+
+| Goal | Implementation |
+|---|---|
+| Maintainable production pipeline | Remove Flamenco, simplified Docker architecture (8 services instead of 11) |
+| Multi-customer | Tenant model with PostgreSQL row-level security |
+| External workers | Celery render workers on arbitrary machines via Redis + MinIO |
+| Modular render configuration | Celery Canvas workflows, declarative WorkflowDefinition JSON config |
+| Template-based outputs | RenderTemplate with workflow integration, React Flow visualization |
+| Media management | MediaAsset catalog, filter/sort/zip download, audit log |
+| Modern design | Responsive, widget dashboard, WebSocket for live updates |
+| Scalable | Celery scales horizontally, hash-based conversion caching |
+| Product database | Excel import with sanity check and material validation |
+| Node-based workflow | React Flow editor (visualization), Celery Canvas (execution) |
+| No duplicate conversions | SHA256-hash-based central conversion cache |
+| Dynamic worker scaling | Docker API scaling + worker registration via Redis |
+| Cycles + EEVEE | Configurable per OutputType |
+| User management | Admin / ProjectManager / Client (tenant-bound, RLS-isolated) |
+| Pricing + billing | PricingTier, invoice module, WeasyPrint PDF export |
+| Modular dashboards | Widget-based, role-dependent, WebSocket live updates |
+| Reporting | Invoice report, production report, Excel/PDF export |
+| Blender asset library | Native Blender asset library for materials AND geometry-node modifiers, modular per OutputType |
+| Interactive 3D preview | Three.js browser viewer with production glTF (materials applied), OrbitControls |
+| Production exports | glTF/GLB + .blend with embedded production materials available for download |
+| Frontend logs | SSE stream for render task logs (1 stream per task) |
+| Real-time dashboard | WebSocket for queue status, worker status, render events |
+| Notifications | Configurable per event type and user |
+| Schaeffler workflow | Sanity check, material validation, order readiness |
+| OCC mesh attributes | Sharp edges, UV seams derived from STEP topology |
+| Blender version | >= 5.0.1 required, upgrade path to 5.1 prepared |
+
+---
+
+## 2. Architecture Analysis: Current vs. Target
+
+### Current architecture (11 services)
+
+```
+Internet
+  ↓
+frontend:5173 (React/Vite)
+  ↓ HTTP
+backend:8888 (FastAPI)
+  ↓ SQL            ↓ Celery tasks             ↓ HTTP
+postgres:5432    redis:6379                 blender-renderer:8100
+                   ↓                        ↑ (only 1 concurrent)
+                 worker (concurrency=8)     threejs-renderer:8101
+                 worker-thumbnail (c=1)     ↑
+                 beat                       flamenco-manager:8080
+                                              ↓
+                                            flamenco-worker (GPU)
+```
+
+**Problems (current):**
+- `blender-renderer` is a Flask HTTP service → at most 1 concurrent request, no real scaling
+- `threejs-renderer` is redundant with Blender for thumbnails (its own container, its own Playwright instance)
+- `flamenco` is a complex external system (job types in JS): extra effort with no benefit over distributed Celery workers
+- `worker-thumbnail` with concurrency=1 is a workaround for the blender-renderer limitation
+- STEP conversion happens multiple times (blender-renderer + threejs-renderer, independently of each other)
+- No tenant concept: all customers share the same DB namespace
+- No real pipeline configuration: the logic is hard-coded in step_tasks.py
+- No shared storage → external workers cannot read STEP files
+
+### Target architecture (8 core services + n render workers)
+
+```
+Internet
+  ↓
+frontend:5173 (React/Vite + React Flow + WebSocket)
+  ↓ HTTP / WebSocket / SSE
+backend:8888 (FastAPI, domain-driven, RLS-enabled)
+  ↓ SQL (RLS)     ↓ Celery Canvas     ↓ S3 API
+postgres:5432   redis:6379          minio:9000
+  (+ RLS)         ↓                 ↑ (shared object storage)
+                step-worker         render-worker ← local (machine A)
+                beat                render-worker ← network (machine B)
+                                    render-worker ← GPU (machine C)
+```
+
+**Advantages (target):**
+- Blender runs **directly in the Celery worker** as a subprocess → no HTTP overhead, no timeout problem
+- Workers run on **arbitrary machines**: they only need `REDIS_URL` + `MINIO_URL` + a Blender install
+- **MinIO** as an S3-compatible object store replaces NFS: no mount needed, works everywhere
+- **PostgreSQL RLS** enforces tenant isolation automatically: no manual WHERE filters needed
+- **Celery Canvas** for workflow execution: no custom workflow engine
+- **React Flow** only as a visualization layer: significantly reduced scope
+- No Flamenco, no threejs-renderer → 3 fewer services
+
+---
+
+## 3. Architecture Decisions (ADRs)
+
+### ADR-01: PostgreSQL row-level security instead of manual tenant_id filters
+
+**Problem:** Every new router query would need a manual `WHERE tenant_id = :x`. One forgotten filter means a data leak between customers.
+
+**Decision:** PostgreSQL row-level security (RLS)
+
+```sql
+-- Once per table (in migration 035)
+ALTER TABLE products ENABLE ROW LEVEL SECURITY;
+CREATE POLICY tenant_isolation ON products
+    USING (tenant_id = current_setting('app.current_tenant_id')::uuid);
+
+-- Admin bypass via a BYPASSRLS role
+ALTER ROLE schaeffler_admin BYPASSRLS;
+```
+
+```python
+# FastAPI dependency: set once per request
+async def get_db_for_tenant(
+    db: AsyncSession = Depends(get_db),
+    user: User = Depends(get_current_user)
+) -> AsyncSession:
+    await db.execute(
+        text("SET LOCAL app.current_tenant_id = :tid"),
+        {"tid": str(user.tenant_id)}
+    )
+    yield db
+```
+
+**Advantages:**
+- Cross-tenant leaks through forgotten filters become impossible
+- Applies automatically to all future queries, including new endpoints
+- Testable: RLS policies are SQL, independent of application code
+
+**Drawbacks / risks:**
+- The migration must enable RLS for every affected table
+- `BYPASSRLS` for the admin user must be set in DB migrations
+- Alembic autogenerate does not detect RLS policies → policies must be written into the migration by hand
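+
+Since the policies live in plain SQL, the isolation guarantee can be pinned down with one small integration test. A minimal pytest sketch; the fixture names `db_session` and `seeded_tenants` are hypothetical placeholders for the project's test setup:
+
+```python
+# tests/domains/tenants/test_rls.py — tenant-isolation check (sketch)
+import pytest
+from sqlalchemy import text
+
+@pytest.mark.asyncio
+async def test_tenant_cannot_read_foreign_rows(db_session, seeded_tenants):
+    tenant_a, _tenant_b = seeded_tenants  # two tenants, each with seeded products
+
+    # Act as tenant A: RLS must hide tenant B's rows entirely
+    await db_session.execute(
+        text("SET LOCAL app.current_tenant_id = :tid"),
+        {"tid": str(tenant_a.id)},
+    )
+    rows = (await db_session.execute(text("SELECT tenant_id FROM products"))).all()
+    assert rows, "tenant A should still see its own products"
+    assert all(str(r.tenant_id) == str(tenant_a.id) for r in rows)
+```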
+
+---
+
+### ADR-02: MinIO instead of NFS for shared storage
+
+**Problem:** External workers must read and write STEP files and render outputs. NFS is operationally complex, platform-dependent, and a single point of failure.
+
+**Decision:** MinIO (S3-compatible, Docker-native, self-hosted)
+
+```yaml
+# docker-compose.yml
+minio:
+  image: minio/minio:latest
+  command: server /data --console-address ":9001"
+  environment:
+    MINIO_ROOT_USER: ${MINIO_USER:-minioadmin}
+    MINIO_ROOT_PASSWORD: ${MINIO_PASSWORD:-minioadmin}
+  ports:
+    - "9000:9000"   # S3 API
+    - "9001:9001"   # web console
+  volumes:
+    - minio-data:/data
+  healthcheck:
+    test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+```
+
+```python
+# backend/core/storage.py
+import boto3
+from botocore.exceptions import ClientError
+from pathlib import Path
+
+class MinIOStorage:
+    def __init__(self):
+        self.client = boto3.client(
+            's3',
+            endpoint_url=settings.MINIO_URL,
+            aws_access_key_id=settings.MINIO_USER,
+            aws_secret_access_key=settings.MINIO_PASSWORD,
+        )
+        self.bucket = 'uploads'
+
+    def upload(self, local_path: Path, object_key: str) -> str:
+        self.client.upload_file(str(local_path), self.bucket, object_key)
+        return object_key
+
+    def download(self, object_key: str, local_path: Path) -> Path:
+        self.client.download_file(self.bucket, object_key, str(local_path))
+        return local_path
+
+    def exists(self, object_key: str) -> bool:
+        try:
+            self.client.head_object(Bucket=self.bucket, Key=object_key)
+            return True
+        except ClientError:
+            return False
+```
+
+**Render worker:** Downloads the STEP file from MinIO into a local tmpdir before rendering, then uploads the output back to MinIO.
+
+**External workers only need:**
+- `REDIS_URL=redis://server:6379/0`
+- `MINIO_URL=http://server:9000`
+- `MINIO_USER` + `MINIO_PASSWORD`
+
+No mount, no NFS; works the same on Windows, Linux, and macOS.
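+
+The download/render/upload round-trip is the same for every render task, so it is worth a shared helper. A sketch under the plan's assumptions (`MinIOStorage` from `core/storage.py` above; `render_fn` stands in for whatever actually invokes Blender):
+
+```python
+# domains/rendering/tasks.py — per-task storage round-trip (sketch)
+import tempfile
+from pathlib import Path
+from typing import Callable
+
+from app.core.storage import MinIOStorage
+
+storage = MinIOStorage()
+
+def run_with_local_files(step_key: str, output_key: str,
+                         render_fn: Callable[[Path, Path], Path]) -> str:
+    """Download the input from MinIO, render locally, upload the result."""
+    with tempfile.TemporaryDirectory() as tmp:
+        step_path = storage.download(step_key, Path(tmp) / "input.step")
+        output_path = render_fn(step_path, Path(tmp) / "out.png")
+        storage.upload(output_path, output_key)
+    return output_key
+```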
+
+---
+
+### ADR-03: Celery Canvas for workflow execution, React Flow for visualization only
+
+**Problem:** A custom workflow engine (graph traversal, dependency resolution, retry logic) is roughly 2-3 weeks of in-house development; Celery already ships all of it.
+
+**Decision:** Celery Canvas as the execution engine, a declarative JSON config as the definition, React Flow as the visualization.
+
+```python
+# domains/rendering/workflow_builder.py
+
+from celery import chain, group
+
+WORKFLOW_BUILDERS = {
+    "still": lambda order_line_id: chain(
+        convert_step.si(order_line_id),
+        extract_mesh_attributes.si(order_line_id),
+        render_still.si(order_line_id),
+        generate_thumbnail.si(order_line_id),
+        publish_asset.si(order_line_id),
+    ),
+    "turntable": lambda order_line_id: chain(
+        convert_step.si(order_line_id),
+        render_turntable_frames.si(order_line_id),
+        composite_ffmpeg.si(order_line_id),
+        publish_asset.si(order_line_id),
+    ),
+    "multi_angle": lambda order_line_id: chain(
+        convert_step.si(order_line_id),
+        group(  # parallel renders
+            render_still.si(order_line_id, angle=0),
+            render_still.si(order_line_id, angle=45),
+            render_still.si(order_line_id, angle=90),
+        ),
+        publish_asset.si(order_line_id),
+    ),
+}
+
+def dispatch_workflow(workflow_type: str, order_line_id: str):
+    canvas = WORKFLOW_BUILDERS[workflow_type](order_line_id)
+    return canvas.apply_async()
+```
+
+**WorkflowDefinition** stores the **declarative config** (which workflow_type, which parameters):
+```json
+{
+  "type": "still",
+  "params": {
+    "render_engine": "cycles",
+    "samples": 256,
+    "resolution": [2048, 2048],
+    "material_library_id": "uuid-..."
+  }
+}
+```
+
+The **React Flow editor** displays the workflow visually and edits this JSON config. It produces **no** execution logic of its own; it is purely a visualization of the Canvas workflow.
+
+**Advantages:**
+- Celery handles retries, error handling, status tracking, and parallelization
+- `workflow_node_results` is populated from Celery task results (not from a custom engine)
+- The scope of phase C shrinks by about 50%
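+
+To keep `workflow_runs` in sync with what Celery actually executes, the dispatch can persist a run row and attach an error callback. A sketch: the `mark_run_failed` task and the `update_workflow_run` helper are hypothetical, and the immutable `.si()` signature is used so Celery does not pass the failure context as positional arguments:
+
+```python
+# domains/rendering/workflow_builder.py — tracked dispatch (sketch)
+from celery import shared_task
+
+@shared_task
+def mark_run_failed(run_id: str):
+    # Fired by Celery if any task in the canvas fails
+    update_workflow_run(run_id, status="failed")  # assumed persistence helper
+
+def dispatch_workflow_tracked(workflow_type: str, order_line_id: str, run_id: str):
+    canvas = WORKFLOW_BUILDERS[workflow_type](order_line_id)
+    result = canvas.apply_async(link_error=mark_run_failed.si(run_id))
+    update_workflow_run(run_id, status="running", celery_task_id=result.id)
+    return result
+```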
+
+---
+
+### ADR-04: Domain-driven project structure
+
+**Problem:** A flat `routers/` + `services/` + `models/` structure with 15+ domains becomes unmanageable. Agents cannot work on isolated domains in parallel.
+
+**Decision:** Domain-driven structure
+
+```
+backend/app/
+├── core/                  # Shared: auth, config, database, storage, websocket
+│   ├── auth.py
+│   ├── config.py
+│   ├── database.py
+│   ├── storage.py         # MinIO StorageBackend
+│   └── websocket.py       # WebSocket broadcast
+├── domains/
+│   ├── tenants/           # Tenant CRUD, RLS setup
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   └── service.py
+│   ├── products/          # Product, CadFile, STEP processing
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   ├── service.py
+│   │   └── tasks.py       # extract_cad_metadata, convert_step_to_stl
+│   ├── rendering/         # OutputType, RenderTemplate, Workflow, render tasks
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   ├── service.py
+│   │   ├── workflow_builder.py  # Celery Canvas workflows
+│   │   └── tasks.py       # render_still, render_turntable
+│   ├── orders/            # Order, OrderItem, OrderLine
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   └── service.py
+│   ├── media/             # MediaAsset, download, zip
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   └── service.py
+│   ├── materials/         # Material, MaterialAlias, MaterialLibrary
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   └── service.py
+│   ├── billing/           # Invoice, PricingTier
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   └── service.py
+│   ├── notifications/     # AuditLog, NotificationConfig
+│   │   ├── models.py
+│   │   ├── schemas.py
+│   │   ├── router.py
+│   │   └── service.py
+│   └── imports/           # Excel parser, sanity check
+│       ├── schemas.py
+│       ├── router.py
+│       ├── excel_parser.py
+│       └── tasks.py       # validate_excel_import
+└── main.py                # Router registration only
+```
+
+**Advantages:**
+- A new domain is a new directory; no existing code is touched
+- Each domain is testable in isolation
+- Agents can implement domains in parallel without conflicts
+- Imports are self-documenting: `from app.domains.billing.service import create_invoice`
+
+**Migration:** Existing code moves into the new structure incrementally, phase by phase (not all at once).
+
+---
+
+### ADR-05: WebSocket for dashboard events, SSE only for task logs
+
+**Problem:** SSE is limited to at most 6 concurrent connections per browser and origin (HTTP/1.1). For a live dashboard with several data sources (queue status, worker status, render events) that is not enough.
+
+**Decision:** Two separate real-time channels:
+
+**WebSocket** (dashboard events; 1 connection, multiplexed):
+```python
+# core/websocket.py
+@router.websocket("/ws")
+async def websocket_endpoint(ws: WebSocket, user=Depends(get_ws_user)):
+    await ws.accept()
+    await subscribe_to_tenant_events(ws, user.tenant_id)
+    # Events: queue_update, render_complete, render_failed,
+    #         worker_online, worker_offline, order_status_change
+```
+
+**SSE** (render task logs; 1 short-lived stream per task):
+```python
+# domains/rendering/router.py
+@router.get("/tasks/{task_id}/logs")
+async def stream_task_logs(task_id: str, user=Depends(get_current_user)):
+    async def event_generator():
+        cursor = 0
+        while True:
+            # Read only the lines we have not sent yet
+            logs = await redis.lrange(f"task_logs:{task_id}", cursor, -1)
+            for line in logs:
+                yield f"data: {line}\n\n"
+            cursor += len(logs)
+            if await task_is_done(task_id):
+                break
+            await asyncio.sleep(0.5)
+    return EventSourceResponse(event_generator())
+```
+
+**Usage:**
+- Dashboard, worker status, queue lengths → **WebSocket**
+- Blender stdout during a render → **SSE**
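+
+The `subscribe_to_tenant_events` side needs a small connection manager that keeps sockets grouped per tenant, so events never cross tenant boundaries (see QC gate 9.10). A minimal sketch, assuming a module-level singleton:
+
+```python
+# core/websocket.py — tenant-scoped connection manager (sketch)
+import json
+from collections import defaultdict
+from fastapi import WebSocket
+
+class ConnectionManager:
+    def __init__(self):
+        self._by_tenant: dict[str, set[WebSocket]] = defaultdict(set)
+
+    def register(self, ws: WebSocket, tenant_id: str) -> None:
+        self._by_tenant[tenant_id].add(ws)
+
+    def unregister(self, ws: WebSocket, tenant_id: str) -> None:
+        self._by_tenant[tenant_id].discard(ws)
+
+    async def broadcast(self, tenant_id: str, event_type: str, payload: dict) -> None:
+        # Only the owning tenant's sockets receive the event
+        message = json.dumps({"type": event_type, "payload": payload})
+        for ws in list(self._by_tenant[tenant_id]):
+            await ws.send_text(message)
+
+manager = ConnectionManager()
+```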
+
+---
+
+### ADR-06: Blender version policy (>= 5.0.1, upgrade path to 5.1)
+
+**Decision:** Blender < 5.0.1 is not supported. Blender 5.1 will be released during development; once the switch happens, it is exclusively to >= 5.1.
+
+**Implementation:**
+```dockerfile
+# render-worker/Dockerfile
+ARG BLENDER_VERSION=5.0.1
+ARG BLENDER_MIN_VERSION=5.0.1
+
+# Note: the release directory on download.blender.org uses major.minor only
+RUN BLENDER_URL="https://download.blender.org/release/Blender${BLENDER_VERSION%.*}/blender-${BLENDER_VERSION}-linux-x64.tar.xz" \
+    && curl -L $BLENDER_URL | tar xJ -C /opt/blender --strip-components=1
+```
+
+```python
+# render-worker/scripts/check_version.py — checked at container start
+import bpy, sys
+major, minor, patch = bpy.app.version
+if (major, minor) < (5, 0):
+    print(f"ERROR: Blender {major}.{minor}.{patch} is not supported. Minimum: 5.0.1")
+    sys.exit(1)
+```
+
+**Upgrade strategy 5.0.1 → 5.1:**
+- Change the `BLENDER_VERSION` build arg in `.env`, rebuild
+- Check the render scripts against API changes (Blender 5.1 changelog)
+- Existing `.blend` templates: open in 5.1 and re-save (migrates automatically)
+- QC gate: test render with a sample STEP file after the upgrade
+
+**Note:** Blender 5.x uses the current asset library standard (introduced in 3.0, fully stable in 5.x); ADR-07 depends on it.
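+
+Because the worker launches Blender itself (see ADR-02 and phase A2), a thin subprocess wrapper can stream Blender's stdout into Redis line by line, which is exactly what the SSE endpoint from ADR-05 tails. A sketch; the `task_logs:{task_id}` key layout matches the SSE example, everything else is an assumption:
+
+```python
+# render-worker — run Blender headless and stream its stdout (sketch)
+import subprocess
+import redis
+
+r = redis.Redis.from_url("redis://redis:6379/0", decode_responses=True)
+
+def run_blender(script: str, args: list[str], task_id: str) -> int:
+    cmd = ["blender", "--background", "--python", script, "--", *args]
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+    )
+    for line in proc.stdout:  # forward output while Blender is still running
+        r.rpush(f"task_logs:{task_id}", line.rstrip())
+        r.expire(f"task_logs:{task_id}", 3600)  # logs are short-lived
+    return proc.wait()
+```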
+
+---
+
+### ADR-07: Blender asset library for materials AND modifiers
+
+**Problem:** The previous `material_libraries` concept only allows material linking. Modifiers (in particular geometry nodes) cannot be managed with it. Blender's native asset library system is broader and covers both.
+
+**Decision:** Blender asset library as the primary system for assets:
+
+```python
+# render-worker/scripts/asset_library.py
+
+def apply_asset_library(blend_path: str, material_map: dict, modifier_map: dict):
+    """
+    Loads assets from an asset-library .blend file:
+    1. Materials: linked/appended by name from material_map
+    2. Geometry-node modifiers: appended by name from modifier_map, applied to the mesh
+    """
+    with bpy.data.libraries.load(blend_path, link=False, assets_only=True) as (data_from, data_to):
+        # Load materials
+        data_to.materials = [
+            name for name in data_from.materials
+            if name in material_map.values()
+        ]
+        # Load geometry node groups (for modifiers)
+        data_to.node_groups = [
+            name for name in data_from.node_groups
+            if name in modifier_map.values()
+        ]
+
+    # Apply materials to parts
+    for obj in bpy.data.objects:
+        if obj.type == 'MESH':
+            for slot in obj.material_slots:
+                resolved = material_map.get(slot.material.name if slot.material else '')
+                if resolved and resolved in bpy.data.materials:
+                    slot.material = bpy.data.materials[resolved]
+
+    # Apply geometry node modifiers
+    for obj in bpy.data.objects:
+        if obj.type == 'MESH':
+            for part_name, modifier_name in modifier_map.items():
+                if part_name in obj.name and modifier_name in bpy.data.node_groups:
+                    mod = obj.modifiers.new(name=modifier_name, type='NODES')
+                    mod.node_group = bpy.data.node_groups[modifier_name]
+```
+
+Note: the sketch appends (`link=False`); per decision 12B in section 10, production code loads the library with `link=True`, and `.blend` exports use `pack_all()` to stay self-contained.
+
+**Data model:** `material_libraries` is renamed to `asset_libraries`:
+```
+asset_libraries (
+    id UUID PK, tenant_id FK,
+    name VARCHAR(200),
+    blend_file_key TEXT,   -- MinIO key of the .blend file
+    catalog JSONB,         -- asset catalog: {materials: [...], node_groups: [...]}
+    description TEXT,
+    is_active BOOL DEFAULT TRUE,
+    created_at TIMESTAMP
+)
+```
+
+**Workflow integration:** Two new node types:
+- `apply_asset_library_materials`: material substitution via the asset library
+- `apply_asset_library_modifiers`: geometry-node modifiers via the asset library
+
+**Catalog refresh:** After a new `.blend` file is uploaded, a Celery task analyzes its assets via Blender `--background --python` and writes the catalog into `asset_libraries.catalog JSONB`, so the UI knows which assets are available without opening the .blend.
+
+**Advantages over the previous approach:**
+- One `.blend` can contain materials *and* modifiers → fewer files to manage
+- Native Blender system → future-proof (Blender keeps developing the asset library)
+- Modifiers as assets: e.g. "Bevel Sharp Edges", "Add Chamfer", "Clean Geometry" as reusable node groups
+- The asset catalog is browsable in the browser without starting Blender
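+
+The catalog refresh needs a companion script that runs inside Blender and emits the asset names as JSON. A minimal sketch of what `render-worker/scripts/catalog_assets.py` could look like; it relies on `asset_data` being set only on datablocks that are marked as assets:
+
+```python
+# render-worker/scripts/catalog_assets.py — sketch, run via
+# `blender --background --python catalog_assets.py -- /path/to/library.blend`
+import json
+import sys
+import bpy
+
+blend_path = sys.argv[sys.argv.index("--") + 1]
+bpy.ops.wm.open_mainfile(filepath=blend_path)
+
+catalog = {
+    "materials": [m.name for m in bpy.data.materials if m.asset_data],
+    "node_groups": [g.name for g in bpy.data.node_groups if g.asset_data],
+}
+print(json.dumps(catalog))  # the Celery task parses this from stdout
+```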
+
+---
+
+## 4. What Gets Removed / Replaced (with Risks)
+
+### 4.1 Flamenco (manager + worker + job scripts)
+
+**Removed:**
+- the entire `flamenco/` directory
+- `flamenco-manager`, `flamenco-worker` services from docker-compose
+- `flamenco_client.py`, `flamenco_tasks.py`
+- the Celery beat task `poll_flamenco_jobs`
+- `flamenco_job_id`, `render_backend_used` columns (migration 032: nullable, removed later)
+- the `render_backend` system setting
+
+**Replaced by:** distributed Celery render workers + MinIO shared storage
+
+**Risks:**
+- Running Flamenco jobs → the migration sets their status to `cancelled`
+- render_dispatcher.py must be simplified (Celery path only)
+
+**Migration:**
+```sql
+UPDATE order_lines SET render_status = 'cancelled', flamenco_job_id = NULL
+WHERE render_status = 'processing' AND flamenco_job_id IS NOT NULL;
+```
+
+---
+
+### 4.2 blender-renderer (Flask HTTP service)
+
+**Removed:**
+- `blender-renderer/app.py` (the Flask wrapper)
+- the service from docker-compose
+- HTTP calls to `:8100` from step_processor.py
+
+**Replaced by:**
+- Blender as a **subprocess inside the render-worker Celery container**
+- `blender_render.py` moves to `render-worker/scripts/`
+- render logic: `domains/rendering/tasks.py`
+
+**Risks:**
+- The render-worker container needs Blender + cadquery → larger image (~3 GB)
+- Build time grows → build the base image ahead of time and push it to a local registry
+
+---
+
+### 4.3 threejs-renderer (Playwright HTTP service)
+
+**Removed:** the entire service + all server-side Three.js render paths
+
+**Three.js stays as:** the frontend 3D viewer (ThreeDViewer.tsx, runs in the browser with glTF)
+
+**Risks:**
+- All Three.js-generated thumbnails must be re-rendered with Blender
+- The admin batch regeneration runs at deploy time
+
+---
+
+### 4.4 system_settings key-value store
+
+**Removed:** the `system_settings` table + the `_save_setting()` direct-SQL hack
+
+**Replaced by:** an `app_config` model with JSONB columns per category (render, storage, notifications, worker, billing), fully ORM-native
+
+---
+
+### 4.5 Flat project structure (routers/ services/ models/)
+
+**Replaced by:** the domain-driven structure (ADR-04)
+
+**Migration:** incrementally per phase, not all at once.
+
+---
+
+## 5. What Stays and Gets Extended
+
+### 5.1 FastAPI backend
+
+- Structurally preserved, migrated into the domain-driven structure
+- An RLS-capable DB dependency replaces the plain `get_db`
+- New domains: `rendering`, `media`, `billing`, `tenants`, `imports`
+
+### 5.2 SQLAlchemy 2 + Alembic
+
+- All existing models stay (restructured into domains)
+- RLS policies as raw SQL in migrations
+- Migrations 032+ for the new tables
+
+### 5.3 Celery + Redis: extended queue structure
+
+| Queue | Worker | Concurrency | Tasks |
+|---|---|---|---|
+| `step_processing` | step-worker | 8 | `extract_cad_metadata`, `validate_excel_import` |
+| `convert` | step-worker | 4 | `convert_step_to_stl`, `extract_mesh_attributes` |
+| `render_default` | render-worker | **1 per container** | `render_still`, `render_turntable_frames` |
+| `notify` | step-worker | 4 | `send_notification` |
+
+**Scaling model:** Every render worker runs with concurrency=1 (one Blender process). More worker containers mean more parallel renders: `docker compose up -d --scale render-worker=4` → 4 parallel renders.
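+
+The queue table above maps onto plain Celery routing configuration. A sketch; the module paths follow the domain layout from ADR-04 and are assumptions until the code lands:
+
+```python
+# app/celery_app.py — queue routing behind the table above (sketch)
+from celery import Celery
+
+celery_app = Celery("schaeffler", broker="redis://redis:6379/0")
+celery_app.conf.task_routes = {
+    "app.domains.products.tasks.extract_cad_metadata": {"queue": "step_processing"},
+    "app.domains.imports.tasks.validate_excel_import": {"queue": "step_processing"},
+    "app.domains.products.tasks.convert_step_to_stl": {"queue": "convert"},
+    "app.domains.products.tasks.extract_mesh_attributes": {"queue": "convert"},
+    "app.domains.rendering.tasks.render_still": {"queue": "render_default"},
+    "app.domains.rendering.tasks.render_turntable_frames": {"queue": "render_default"},
+    "app.domains.notifications.tasks.send_notification": {"queue": "notify"},
+}
+# A render worker then starts as:
+#   celery -A app.celery_app worker -Q render_default --concurrency=1
+```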
+
+### 5.4 Material alias system
+
+- The lookup order (aliases first) stays
+- Extended: a `material_library_id` FK on `material_aliases`
+- Extended: an unknown-materials report during Excel import
+
+### 5.5 RenderTemplate + pricing + notifications (stay, integrated into domains)
+
+- `lighting_only`, `shadow_catcher` stay
+- PricingTier → extended with the invoice module
+- Notifications → extended with `notification_configs`
+
+---
+
+## 6. New Components
+
+### 6.1 MinIO object storage (ADR-02)
+
+Service in `docker-compose.yml`. All file operations go through the `StorageBackend` abstraction in `core/storage.py`. External workers only need the URL + credentials; no mount.
+
+Buckets:
+- `uploads`: STEP files, thumbnails, render outputs
+- `blend-templates`: .blend RenderTemplate files
+- `asset-libraries`: .blend asset-library files (materials + modifiers)
+- `production-exports`: glTF/GLB + .blend production downloads (TTL 7 d; .blend files are kept permanently per decision 13C)
+- `exports`: zip downloads, PDF invoices (short-lived, TTL 24 h)
+
+---
+
+### 6.2 Tenant model + PostgreSQL RLS (ADR-01)
+
+```
+tenants (id UUID PK, name VARCHAR, slug VARCHAR UNIQUE, is_active BOOL, created_at)
+```
+
+FK `tenant_id` on: `users`, `orders`, `products`, `cad_files`, `media_assets`, `invoices`, `material_libraries`, `render_templates`
+
+RLS policies land in migration 035; from then on data isolation is automatic.
+
+---
+
+### 6.3 Workflow system: Celery Canvas + React Flow (ADR-03)
+
+**Data model:**
+```
+workflow_definitions (id, name, output_type_id FK, config JSONB, is_active)
+    config = { "type": "still"|"turntable"|"multi_angle", "params": {...} }
+
+workflow_runs (id, workflow_def_id FK, order_line_id FK, celery_task_id, status, started_at, completed_at)
+
+workflow_node_results (id, run_id FK, node_name, status, output JSONB, log TEXT, duration_s FLOAT)
+```
+
+**Execution:** `workflow_builder.py` builds a Celery canvas from `config.type` + `config.params`. Every node task writes its result into `workflow_node_results`; a sketch of how follows after the node list below.
+
+**Node types (Celery tasks):**
+- `convert_step` → STEP→STL via cadquery, checks the SHA256 cache
+- `extract_mesh_attributes` → OCC topology → sharp_edges JSON
+- `apply_asset_library_materials` → loads materials from the asset-library .blend, applies them to mesh parts
+- `apply_asset_library_modifiers` → loads geometry node groups from the asset library, applies them as modifiers
+- `render_still` → Blender subprocess → PNG to MinIO
+- `render_turntable_frames` → Blender subprocess → frame folder to MinIO
+- `composite_ffmpeg` → frames + bg_color → MP4 to MinIO
+- `export_gltf` → Blender exports a GLB with the production materials applied → MinIO
+- `export_blend` → Blender saves a .blend with `pack_all()` → MinIO (all textures embedded)
+- `generate_thumbnail` → Pillow resize → thumbnail to MinIO
+- `publish_asset` → creates the MediaAsset record in the DB
+
+**React Flow frontend:** `WorkflowEditor.tsx` visualizes the canvas workflow and edits the `config JSONB`. No execution code of its own.
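+
+Writing into `workflow_node_results` can be centralized in a Celery base class instead of being repeated in every node task. A sketch; `record_node_result` is a hypothetical persistence helper, and the run id is assumed to travel as a `run_id` kwarg:
+
+```python
+# domains/rendering/tasks.py — mirror node outcomes into the DB (sketch)
+import time
+from celery import Task
+
+class NodeTask(Task):
+    def __call__(self, *args, **kwargs):
+        started = time.monotonic()
+        try:
+            output = super().__call__(*args, **kwargs)  # run the task body
+            record_node_result(run_id=kwargs.get("run_id"), node_name=self.name,
+                               status="completed", output=output,
+                               duration_s=time.monotonic() - started)
+            return output
+        except Exception as exc:
+            record_node_result(run_id=kwargs.get("run_id"), node_name=self.name,
+                               status="failed", output={"error": str(exc)},
+                               duration_s=time.monotonic() - started)
+            raise
+
+# usage: @shared_task(base=NodeTask) on convert_step, render_still, ...
+```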
+
+---
+
+### 6.4 MediaAsset catalog
+
+```
+media_assets (
+    id UUID PK, tenant_id FK, product_id FK, order_line_id FK,
+    workflow_run_id FK,
+    asset_type ENUM(thumbnail, still, turntable, stl_low, stl_high,
+                    gltf_geometry,     -- glTF without materials (from the STEP conversion)
+                    gltf_production,   -- GLB with production materials (from the export_gltf node)
+                    blend_production), -- .blend with embedded production materials
+    storage_key TEXT,                  -- MinIO object key
+    file_size_bytes BIGINT,
+    mime_type VARCHAR(100),
+    width INT, height INT, duration_s FLOAT,
+    render_config JSONB,
+    created_at TIMESTAMP,
+    is_archived BOOL DEFAULT FALSE
+)
+```
+
+**API:** filters, single download, zip download (StreamingResponse), soft delete
+
+---
+
+### 6.5 OCC mesh attribute extraction
+
+```python
+# domains/products/tasks.py
+def extract_mesh_attributes(step_path: str) -> dict:
+    """
+    Via pythonOCC BRep topology:
+    - sharp_edges: edge indices with a dihedral angle > threshold (default 30°)
+    - seam_candidates: edges between faces of different types
+    - face_groups: faces grouped by type (planar, cylindrical, toroidal, ...)
+    """
+```
+
+The output lands in `cad_files.mesh_attributes JSONB` and is passed to the render as a parameter.
+
+Blender integration in `render_still`:
+```python
+# render-worker/scripts/blender_render.py
+if mesh_attributes and mesh_attributes.get("sharp_edges"):
+    for edge_idx in mesh_attributes["sharp_edges"]:
+        mesh.edges[edge_idx].use_edge_sharp = True
+    bpy.ops.mesh.mark_seam(clear=False)
+    bpy.ops.uv.smart_project()
+```
+
+---
+
+### 6.6 Hash-based conversion caching
+
+```python
+# domains/products/tasks.py
+def get_stl_cache_key(step_object_key: str, quality: str) -> str:
+    content = storage.download_bytes(step_object_key)
+    sha256 = hashlib.sha256(content).hexdigest()
+    return f"conversion-cache/{sha256[:2]}/{sha256}/{quality}.stl"
+```
+
+Central cache in MinIO under `uploads/conversion-cache/`. The same STEP file is converted once, no matter how often it is uploaded or under which name.
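+
+The cache key plugs into the conversion task as a check-before-work guard. A sketch of the intended control flow; `run_cadquery_conversion` is a hypothetical helper for the actual cadquery call:
+
+```python
+# domains/products/tasks.py — cache check before converting (sketch)
+import logging
+import tempfile
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+def convert_step_to_stl(step_object_key: str, quality: str) -> str:
+    cache_key = get_stl_cache_key(step_object_key, quality)
+    if storage.exists(cache_key):  # same SHA256 + quality seen before
+        logger.info("conversion cache hit: %s", cache_key)
+        return cache_key           # reuse the cached STL as-is
+
+    with tempfile.TemporaryDirectory() as tmp:
+        step_path = storage.download(step_object_key, Path(tmp) / "in.step")
+        stl_path = run_cadquery_conversion(step_path, quality)  # assumed helper
+        storage.upload(stl_path, cache_key)
+    return cache_key
+```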
+
+---
+
+### 6.7 Billing / invoice module
+
+```
+invoices (id, tenant_id FK, period_start, period_end, status ENUM, total_amount, created_at)
+invoice_lines (id, invoice_id FK, order_line_id FK, product_name, asset_type, quantity, unit_price, total)
+```
+
+PDF export via WeasyPrint (HTML template → PDF). Excel export via openpyxl.
+
+---
+
+### 6.8 Excel sanity check
+
+**Task `validate_excel_import`:**
+1. Parse the Excel file
+2. Check every row: STEP present + completed? Materials covered by aliases? Product in the DB?
+3. Fuzzy-match suggestions for unknown materials (via `difflib.get_close_matches`)
+4. Report into the `import_validations` table + WebSocket event to the client
+
+**Frontend:** sanity-check dialog after upload, traffic-light display, material gaps closable in place.
+
+---
+
+### 6.9 WebSocket live events (ADR-05)
+
+```python
+# core/websocket.py
+EVENT_TYPES = [
+    "queue_update",          # queue length changed
+    "render_complete",       # render succeeded
+    "render_failed",         # render failed
+    "worker_online",         # new worker registered
+    "worker_offline",        # worker no longer reachable
+    "order_status_change",   # order status changed
+    "import_validated",      # Excel sanity check finished
+]
+```
+
+Dashboard, WorkerManagement, OrderDetail: they all subscribe to the same WebSocket and filter events by type.
+
+---
+
+### 6.10 Worker registration
+
+```python
+# render-worker entrypoint
+redis.hset('registered_workers', f'{hostname}:{pid}', json.dumps({
+    'hostname': hostname,
+    'queues': ['render_default'],
+    'blender_version': get_blender_version(),
+    'gpu': detect_gpu(),   # nvidia-smi or None
+    'started_at': utcnow().isoformat(),
+    'last_heartbeat': utcnow().isoformat(),
+}))
+# Heartbeat every 30 s; a beat task removes stale workers after 90 s
+```
+
+`GET /api/workers` reads the Redis hash and computes queue stats via Celery inspect.
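+
+The pruning side of that heartbeat scheme is a small beat task. A sketch that matches the registration payload above (the timestamps are naive UTC, exactly as `utcnow().isoformat()` writes them; the `worker_offline` broadcast is left as a comment):
+
+```python
+# domains/notifications/tasks.py — prune stale workers (sketch)
+import json
+from datetime import datetime, timedelta
+
+def prune_stale_workers(redis_client) -> list[str]:
+    removed = []
+    cutoff = datetime.utcnow() - timedelta(seconds=90)  # naive UTC, like the writer
+    for worker_id, raw in redis_client.hgetall("registered_workers").items():
+        info = json.loads(raw)
+        if datetime.fromisoformat(info["last_heartbeat"]) < cutoff:
+            redis_client.hdel("registered_workers", worker_id)
+            removed.append(worker_id)
+            # broadcast a `worker_offline` event here (section 6.9)
+    return removed
+```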
+
+---
+
+### 6.11 Blender asset library management (ADR-07)
+
+**Data model:** `asset_libraries` (replaces `material_libraries`)
+
+```
+asset_libraries (
+    id UUID PK, tenant_id FK,
+    name VARCHAR(200),
+    blend_file_key TEXT,   -- MinIO key: "asset-libraries/{id}.blend"
+    catalog JSONB,         -- {materials: ["SCHAEFFLER_010101_Steel-Bare", ...],
+                           --  node_groups: ["Bevel_Sharp_Edges", "Clean_Geometry", ...]}
+    description TEXT,
+    is_active BOOL DEFAULT TRUE,
+    created_at TIMESTAMP
+)
+```
+
+**Catalog refresh task:**
+```python
+# domains/materials/tasks.py
+def refresh_asset_library_catalog(asset_library_id: str):
+    """
+    Opens the .blend via Blender --background, reads all marked assets,
+    and writes the catalog into asset_libraries.catalog JSONB.
+    Runs automatically after every .blend upload.
+    """
+    script = "render-worker/scripts/catalog_assets.py"
+    result = subprocess.run(['blender', '--background', '--python', script,
+                             '--', blend_path, '--output', 'json'], ...)
+    catalog = json.loads(result.stdout)
+    db.execute(update(AssetLibrary).values(catalog=catalog))
+```
+
+**API:**
+- `POST /api/asset-libraries`: upload a .blend, the catalog is read automatically
+- `GET /api/asset-libraries/{id}/catalog`: browse the available assets
+- `PUT /api/asset-libraries/{id}`: update metadata
+- `DELETE /api/asset-libraries/{id}`: delete (only when not in use)
+
+**Frontend:** asset library manager in the admin area: upload, catalog display (materials + node groups as badges), assignment to OutputTypes.
+
+---
+
+### 6.12 Interactive 3D browser preview with production materials
+
+**Concept:** The existing `ThreeDViewer.tsx` (Three.js, OrbitControls) is extended with production-glTF support. Two view modes:
+
+| Mode | glTF source | Materials |
+|---|---|---|
+| Geometry preview | `gltf_geometry`, from the STEP conversion | Colored part groups (OCC extraction) |
+| Production preview | `gltf_production`, from the `export_gltf` workflow node | Real production materials (PBR) |
+
+**Blender → GLB pipeline:**
+```python
+# render-worker/scripts/export_gltf.py
+def export_gltf(stl_path, blend_key, material_map, modifier_map, output_key):
+    # 1. Import the STL (Blender >= 4.x ships only the wm.stl_import operator)
+    bpy.ops.wm.stl_import(filepath=stl_path)
+    # 2. Load the asset library (materials + modifiers)
+    apply_asset_library(blend_path, material_map, modifier_map)
+    # 3. Export as GLB
+    bpy.ops.export_scene.gltf(
+        filepath=output_path,
+        export_format='GLB',
+        export_materials='EXPORT',  # embed materials
+        export_apply=True,          # apply modifiers before export
+        export_draco_mesh_compression_enable=True,  # compression
+        export_texture_dir='',
+    )
+```
+
+**Note on material fidelity:** Blender's glTF exporter converts `Principled BSDF` → PBR (metallic/roughness). Complex shader nodes (e.g. procedural textures) are not carried over completely; for those cases, bake textures before export (optional workflow node `bake_textures`).
+
+**Frontend extensions to ThreeDViewer.tsx:**
+```tsx
+// New props/features:
+interface ThreeDViewerProps {
+  geometryGltfUrl?: string      // geometry preview (available immediately)
+  productionGltfUrl?: string    // production preview (after the workflow finishes)
+  showMaterialToggle?: boolean  // switch between modes
+  showWireframe?: boolean       // wireframe overlay
+  environmentPreset?: 'studio' | 'outdoor' | 'dark'
+}
+```
+
+Progressive loading: show the geometry preview immediately → load the production preview once available.
+
+**Download buttons directly in the viewer:**
+- "Download GLB" → `GET /api/media/{gltf_production_id}/download`
+- "Download .blend" → `GET /api/media/{blend_production_id}/download`
+
+---
+
+### 6.13 Production export: glTF + .blend download
+
+**Workflow node `export_gltf`:**
+- Input: STL path, asset library ID, material map, modifier map
+- Output: GLB file in MinIO `production-exports/{cad_file_id}/{run_id}.glb`
+- MediaAsset record: `asset_type = gltf_production`
+
+**Workflow node `export_blend`:**
+```python
+# render-worker/scripts/export_blend.py
+def export_blend(stl_path, blend_key, material_map, modifier_map, output_key):
+    # 1. Load the STL + asset library (as in export_gltf)
+    # ...
+    # 2. Embed all external data
+    bpy.ops.file.pack_all()
+    # 3. Save as .blend (compressed)
+    bpy.ops.wm.save_as_mainfile(
+        filepath=output_path,
+        compress=True,
+        copy=True   # leave the original session untouched
+    )
+```
+
+**Size warning:** A .blend with embedded textures can reach 50-500 MB. Therefore:
+- `production-exports` bucket TTL: 7 days (configurable in `app_config`); per decision 13C in section 10, `blend_production` files are exempt and kept permanently
+- Maximum file size: 1 GB (configurable)
+- Frontend warning before downloading files > 100 MB
+
+**Standard workflow "still with production exports":**
+```python
+chain(
+    convert_step.si(order_line_id),
+    extract_mesh_attributes.si(order_line_id),
+    apply_asset_library_materials.si(order_line_id),
+    apply_asset_library_modifiers.si(order_line_id),
+    group(
+        render_still.si(order_line_id),   # PNG for production
+        export_gltf.si(order_line_id),    # GLB for the 3D viewer + download
+        export_blend.si(order_line_id),   # .blend for archive/post-processing
+    ),
+    generate_thumbnail.si(order_line_id),
+    publish_asset.si(order_line_id),
+)
+```
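+
+The two configurable limits from the size warning can be enforced in one place before a MediaAsset is published. A small sketch; the `app_config` keys `export_max_bytes` and `export_warn_bytes` are assumed names for the storage config:
+
+```python
+# domains/media/service.py — enforce export size limits (sketch)
+def check_export_size(size_bytes: int, config: dict) -> str | None:
+    """Raise on the hard limit, return 'warn' above the warning threshold."""
+    hard_limit = config.get("export_max_bytes", 1024 ** 3)         # default 1 GB
+    warn_limit = config.get("export_warn_bytes", 100 * 1024 ** 2)  # default 100 MB
+    if size_bytes > hard_limit:
+        raise ValueError(f"export exceeds the configured limit: {size_bytes} bytes")
+    if size_bytes > warn_limit:
+        return "warn"   # the frontend shows a download warning
+    return None
+```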
+
+---
+
+## 7. Phase Plan with Tasks
+
+### Phase A: Infrastructure cleanup + MinIO (weeks 1-2)
+
+**A1: Remove Flamenco**
+- `docker-compose.yml` → remove flamenco-manager, flamenco-worker
+- Delete `flamenco_client.py`, `flamenco_tasks.py`
+- `render_dispatcher.py` → simplify (Celery path only)
+- Migration 032: set running Flamenco jobs to `cancelled`
+- Acceptance criterion: `docker compose up` starts without Flamenco, all existing renders run via Celery
+
+**A2: blender-renderer → render-worker Celery container (implements ADR-06)**
+- `render-worker/Dockerfile` (new): Ubuntu + Blender (>= 5.0.1, via the `BLENDER_VERSION` build arg) + cadquery + Python deps
+- `check_version.py` runs at container start: checks Blender >= 5.0.1, exits 1 otherwise
+- `blender-renderer/blender_render.py` → `render-worker/scripts/blender_render.py`
+- `domains/rendering/tasks.py` (new): `render_still_task`, `render_turntable_task`
+- Blender via `subprocess.run`, stdout into Redis for SSE
+- `docker-compose.yml`: remove `blender-renderer`, add `render-worker`
+- `.env.example`: document `BLENDER_VERSION=5.0.1`
+- Acceptance criterion: thumbnail via Celery task, no HTTP call to :8100, version check passes
+
+**A3: Remove threejs-renderer**
+- Remove the service, remove the threejs path in step_processor.py
+- Batch regeneration of all threejs thumbnails (admin function)
+- ThreeDViewer.tsx (frontend) stays
+- Acceptance criterion: all thumbnails are Blender-rendered
+
+**A4: Add MinIO + storage abstraction**
+- MinIO service in `docker-compose.yml`
+- `core/storage.py`: `MinIOStorage` + `LocalStorage` (dev fallback)
+- Existing upload endpoints: files go to MinIO instead of the local `/uploads`
+- Migration of existing files: a script that uploads `/uploads` to MinIO (sketched below)
+- `.env.example`: `MINIO_URL`, `MINIO_USER`, `MINIO_PASSWORD`
+- `docker-compose.worker.yml` (new): render-worker for external machines
+- Acceptance criterion: file upload → MinIO; a worker container runs on machine B and renders jobs
+
+**A5: system_settings → app_config**
+- Migration 033: `app_config` table (JSONB columns: render, storage, notifications, worker, billing)
+- `core/config_service.py` (new), `system_settings` table deprecated
+- Migrate the existing settings
+- Acceptance criterion: all settings persistable ORM-natively, no direct SQL
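+
+For the A4 file migration, a one-off script that mirrors the legacy `/uploads` tree into the `uploads` bucket is enough. A sketch, reusing `MinIOStorage` from ADR-02; it is idempotent, so it can be re-run after a partial failure:
+
+```python
+# scripts/migrate_uploads_to_minio.py — one-off helper for task A4 (sketch)
+from pathlib import Path
+
+from app.core.storage import MinIOStorage
+
+def migrate(root: Path = Path("/uploads")) -> int:
+    storage = MinIOStorage()
+    count = 0
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        key = path.relative_to(root).as_posix()  # keep the relative layout as the object key
+        if not storage.exists(key):              # skip files already migrated
+            storage.upload(path, key)
+            count += 1
+    return count
+
+if __name__ == "__main__":
+    print(f"migrated {migrate()} files")
+```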
+
+---
+
+### Phase B: Domain-driven restructuring + tenant model (weeks 2-3)
+
+**B1: Create the domain-driven structure**
+- Create the `backend/app/domains/` directory
+- Move existing models/services/routers into domains step by step (products, orders, materials, rendering, notifications first)
+- `main.py` registers only the domain routers
+- Acceptance criterion: all existing tests green, imports work, API endpoints unchanged
+
+**B2: Tenant data model + RLS**
+- Migration 034: `tenants` table
+- Migration 035: `tenant_id` FK on all relevant tables + RLS policies + backfill
+- `domains/tenants/` (new)
+- `core/database.py`: RLS-capable `get_db_for_tenant` dependency
+- The admin DB user gets the `BYPASSRLS` privilege
+- Acceptance criterion: client A cannot see client B's data (test with two tenants)
+
+**B3: Tenant management UI**
+- Admin page: tenant CRUD, user creation with tenant assignment
+- Tenant selector in the admin header
+- Acceptance criterion: the admin switches tenants and the data switches accordingly
+
+---
+
+### Phase C: Workflow system (weeks 3-4)
+
+**C1: WorkflowDefinition data model**
+- Migration 036: `workflow_definitions`, `workflow_runs`, `workflow_node_results`
+- Extend `domains/rendering/models.py`
+- `domains/rendering/workflow_builder.py` (new): Celery canvas builders for "still", "turntable", "multi_angle"
+- `output_types.workflow_definition_id` FK (migration 037)
+- Acceptance criterion: a render via `dispatch_workflow("still", order_line_id)` succeeds
+
+**C2: Seed the standard workflows**
+- Seeding: "still render" → convert→extract_attributes→render_still→generate_thumbnail→publish
+- Seeding: "turntable" → convert→render_turntable_frames→composite_ffmpeg→publish
+- Switch the existing OrderLine render logic over to workflow dispatch
+- Acceptance criterion: all existing render paths run as workflows
+
+**C3: React Flow workflow editor (frontend)**
+- Install `@xyflow/react`
+- `frontend/src/pages/WorkflowEditor.tsx` (new)
+- Node types: ConvertNode, RenderNode, MaterialNode, FFmpegNode, PublishNode
+- The editor edits `workflow_definitions.config JSONB`
+- Acceptance criterion: the workflow is visually editable, is saved, and dispatches correctly
+
+---
+
+### Phase D: OCC mesh attributes (week 4)
+
+**D1: Attribute extraction**
+- `domains/products/tasks.py`: `extract_mesh_attributes` Celery task
+- Migration 038: `cad_files.mesh_attributes JSONB`
+- Runs after `extract_cad_metadata` in the workflow chain
+- Acceptance criterion: STEP upload → mesh_attributes JSON in the DB with sharp_edges
+
+**D2: Blender integration**
+- `render-worker/scripts/blender_render.py`: takes the `mesh_attributes` JSON, sets sharp edges, smart UV
+- Acceptance criterion: rendered images show UV seams based on the OCC topology
+
+---
+
+### Phase E: MediaAsset catalog (week 5)
+
+**E1: Data model + backfill**
+- Migration 039: `media_assets` table
+- `domains/media/` (new)
+- Backfill: existing thumbnails + outputs as MediaAssets
+- The `publish_asset` Celery task creates the MediaAsset record + WebSocket event
+
+**E2: API + frontend**
+- `GET /api/media` with filters, `POST /api/media/zip` (StreamingResponse), `DELETE /api/media/{id}`
+- `frontend/src/pages/MediaBrowser.tsx` (new): grid/list, multi-select, zip download
+- Acceptance criterion: 100 assets loadable, zip < 30 s for 50 files
+
+---
+
+### Phase F: Hash-based conversion caching (week 5)
+
+**F1: Cache service**
+- `domains/products/tasks.py`: SHA256 check before every STL conversion
+- Migration 040: `cad_files.step_file_hash VARCHAR(64)`
+- Cache in MinIO `uploads/conversion-cache/`
+- Acceptance criterion: same STEP file → the log shows "cache hit" on the second upload
+
+---
+
+### Phase G: Billing & reporting (week 6)
+
+**G1: Invoice data model + API**
+- Migration 041: `invoices`, `invoice_lines`
+- `domains/billing/` (new)
+- `POST /api/billing/invoices`, `GET /api/billing/invoices/{id}/pdf` (WeasyPrint)
+- Acceptance criterion: a PDF invoice with correct line items is downloadable
+
+**G2: Billing dashboard (frontend)**
+- `frontend/src/pages/Billing.tsx` (new)
+- Cost overview per tenant/period, invoice list + download
+- Acceptance criterion: an invoice can be generated and downloaded
+
+---
+
+### Phase H: Excel sanity check (week 7)
+
+**H1: Sanity check task + fuzzy matching**
+- `domains/imports/tasks.py`: `validate_excel_import`
+- Migration 042: `import_validations` table
+- `difflib.get_close_matches` for material suggestions
+- WebSocket event on completion
+
+**H2: Sanity check UI**
+- Traffic-light dialog after the Excel upload
+- Material gaps closable directly in the dialog (new alias)
+- Acceptance criterion: it is clear which products are producible, and material aliases can be added
+
+---
+
+### Phase I: Configurable notifications (week 7)
+
+**I1: Notification config**
+- Migration 043: `notification_configs` table
+- `domains/notifications/service.py`: checks the config before emitting
+- Default seeding: all events enabled for admins
+
+**I2: Settings UI**
+- `frontend/src/pages/NotificationSettings.tsx` (new)
+- Toggle matrix: event × channel (in-app, e-mail optional)
+- Acceptance criterion: events can be switched off, and the settings take effect
+
+---
+
+### Phase J: WebSocket + SSE log streaming (week 8)
+
+**J1: WebSocket backend**
+- `core/websocket.py`: connection manager, tenant-based broadcasting
+- All relevant tasks/services broadcast WebSocket events
+- `GET /ws` endpoint
+
+**J2: SSE task logs**
+- `GET /api/tasks/{task_id}/logs`: SSE; the worker writes into a Redis list
+- Extend `LiveRenderLog.tsx`: `EventSource` API, auto-scroll
+
+**J3: Frontend WebSocket integration**
+- Dashboard, WorkerManagement, OrderDetail subscribe to `/ws`
+- Replaces polling-based `useQuery` intervals where sensible
+- Acceptance criterion: render start → the dashboard shows the status update without a reload
+
+---
+
+### Phase K: Blender asset library + production exports (weeks 8-9)
+
+**K1: Asset library data model + upload**
+- Migration 044: `asset_libraries` table (id, name, blend_file_key, catalog JSONB, tenant_id)
+- `render_templates.asset_library_id` FK, `output_types.asset_library_id` default library
+- Upload via the MinIO `asset-libraries/` bucket
+- After upload: the Celery task `refresh_asset_library_catalog` opens the .blend via Blender --background, reads the asset names, and writes them into the `catalog` JSONB
+- Acceptance criterion: upload a .blend → the catalog with materials + node groups is visible in the DB
+
+**K2: Asset library management UI**
+- `domains/materials/` → asset library manager (upload, catalog display as a badge grid)
+- Show materials + node groups from the catalog
+- Assignment selectable per OutputType + RenderTemplate
+- Acceptance criterion: 2 libraries configurable for different OutputTypes
+
+**K3: Workflow nodes apply_asset_library_materials + apply_asset_library_modifiers**
+- `render-worker/scripts/asset_library.py`: link/append materials and node groups from the .blend
+- Workflow builder: integrate the nodes into the standard workflow "still with production exports"
+- Acceptance criterion: a render with an asset library shows the correct production materials in the PNG
+
+**K4: export_gltf workflow node**
+- `render-worker/scripts/export_gltf.py`: Blender exports a GLB with the materials applied
+- Apply modifiers before export (`export_apply=True`), Draco compression enabled
+- MediaAsset entry: `asset_type = gltf_production`
+- Acceptance criterion: the GLB download loads in the browser, materials visible in the Three.js viewer
+
+**K5: export_blend workflow node**
+- `render-worker/scripts/export_blend.py`: `pack_all()` + `save_as_mainfile(compress=True)`
+- Size warning config in `app_config` (default: warn above 100 MB, limit 1 GB)
+- MediaAsset entry: `asset_type = blend_production`
+- Retention in MinIO `production-exports/`: permanent for .blend exports per decision 13C (other artifacts: 7-day TTL, configurable)
+- Acceptance criterion: the .blend download contains all textures and opens in Blender 5.x without missing links
+
+**K6: 3D viewer production mode (frontend)**
+- Extend `ThreeDViewer.tsx`: mode toggle geometry ↔ production glTF
+- Wireframe toggle, environment preset selection (studio/outdoor/dark)
+- Download buttons in the viewer for GLB + .blend
+- Progressive loading: geometry preview immediately, production glTF loaded afterwards
+- Acceptance criterion: the interactive viewer shows production materials; the download works
+
+---
+
+### Phase L: Dashboard & UX (weeks 9-10)
+
+**L1: Modular widget dashboard**
+- `Widget.tsx` as a generic container, widget config per user in the DB
+- Widget types: ProductionStats, QueueStatus, RecentRenders, CostOverview, WorkerStatus
+- WebSocket feed for live updates
+
+**L2: Responsive design**
+- Tailwind CSS variables in RGB channel format (fixes the 2026-02-18 learning)
+- 768 px minimum (iPad width)
+
+**L3: Worker management UI**
+- `WorkerManagement.tsx` (new): worker list from Redis, queue stats, scale button
+
+---
+
+### Phase M: QC tests (weeks 10-11)
+
+**M1: Pytest backend**
+- `tests/domains/`: per domain, API tests + service tests
+- Fixtures: test DB with RLS setup, mock MinIO (moto), mock Celery
+- Acceptance criterion: > 80% coverage on the service layer, all domains
+
+**M2: Frontend Vitest**
+- `frontend/src/__tests__/`: component tests with Testing Library
+- Acceptance criterion: `npm run test` → 0 failures
+
+**M3: Integration tests**
+- End-to-end: STEP upload → MinIO → Celery → render (mock Blender) → MediaAsset → download
+- Tenant isolation test: client A sees no client-B data
+- Acceptance criterion: the pipeline runs in CI without a real Blender
+
+---
+
+## 8. Database Migration Overview
+
+| Migration | Description | Phase |
+|---|---|---|
+| 032 | Clean up Flamenco fields, set jobs to cancelled | A |
+| 033 | app_config (structured config model, replaces system_settings) | A |
+| 034 | tenants table | B |
+| 035 | tenant_id FKs + **PostgreSQL RLS policies** + backfill | B |
+| 036 | workflow_definitions, workflow_runs, workflow_node_results | C |
+| 037 | output_types.workflow_definition_id FK | C |
+| 038 | cad_files.mesh_attributes JSONB | D |
+| 039 | media_assets table | E |
+| 040 | cad_files.step_file_hash VARCHAR(64) | F |
+| 041 | invoices, invoice_lines | G |
+| 042 | import_validations | H |
+| 043 | notification_configs | I |
+| 044 | **asset_libraries** (replaces material_libraries), FKs on render_templates/output_types | K |
+| 045 | media_assets.asset_type: extend the ENUM with gltf_production, blend_production | K |
+
+---
+
+## 9. QC Gates and Test Checklist
+
+This checklist is designed for agents: every task must pass these gates before committing.
+
+### 9.1 Backend QC gates
+
+```bash
+# Syntax check
+docker compose exec backend python -m py_compile app/domains/[domain]/[changed_file].py
+
+# Alembic
+docker compose exec backend alembic current   # → head
+
+# Pytest
+docker compose exec backend pytest tests/ -x --tb=short   # → 0 failures
+
+# Import + schema
+docker compose exec backend python -c "from app.main import app; print('OK')"
+```
+
+### 9.2 RLS QC gate (new, after phase B)
+
+```bash
+# Tenant isolation test
+docker compose exec backend pytest tests/domains/tenants/test_rls.py -v
+# → client A cannot read/write client-B data
+# → an admin with BYPASSRLS sees all data
+```
+
+### 9.3 Celery QC gates
+
+```bash
+docker compose exec step-worker celery -A app.celery_app inspect registered
+# → extract_cad_metadata, convert_step_to_stl, extract_mesh_attributes, validate_excel_import
+
+docker compose exec render-worker celery -A app.celery_app inspect registered
+# → render_still, render_turntable_frames, composite_ffmpeg, generate_thumbnail, publish_asset
+
+docker compose exec step-worker celery -A app.celery_app inspect active_queues
+# → step_processing, convert, notify
+```
+
+### 9.4 MinIO QC gate (new, after phase A4)
+
+```bash
+# MinIO reachable
+curl http://localhost:9000/minio/health/live   # → 200
+
+# Upload + download work
+docker compose exec backend python -c "
+from app.core.storage import storage
+storage.upload('/tmp/test.txt', 'test/test.txt')
+assert storage.exists('test/test.txt')
+print('MinIO OK')
+"
+
+# An external worker can reach MinIO
+# (run on machine B)
+docker compose -f docker-compose.worker.yml exec render-worker python -c "
+from app.core.storage import storage
+assert storage.exists('test/test.txt')
+print('External worker MinIO access OK')
+"
+```
+
+### 9.5 Frontend QC gates
+
+```bash
+cd frontend
+npm run type-check   # → 0 errors
+npm run lint         # → 0 errors
+npm run test         # → 0 failures
+npm run build        # → success
+```
+
+### 9.6 Database QC gates
+
+```bash
+# Review the migration (read it manually!), especially the RLS policies in 035
+cat backend/alembic/versions/035_*.py
+
+# Test up + down
+docker compose exec backend alembic upgrade head
+docker compose exec backend alembic downgrade -1
+docker compose exec backend alembic upgrade head
+```
+
+### 9.7 Docker QC gates
+
+```bash
+docker compose up -d
+docker compose ps   # → all "healthy" within 90 s (MinIO needs ~30 s)
+curl http://localhost:8888/health               # → 200
+curl http://localhost:9000/minio/health/live    # → 200
+```
+
+### 9.8 Render pipeline QC gate (end-to-end)
+
+```bash
+# Upload STEP → workflow → thumbnail
+curl -X POST http://localhost:8888/api/cad/upload -F "file=@step-sample-file/81113-l_cut.stp"
+# → cad_file_id
+
+sleep 30
+curl http://localhost:8888/api/cad/{id} | jq .processing_status   # → "completed"
+curl -I http://localhost:8888/api/cad/{id}/thumbnail              # → 200, image/png
+
+# MediaAsset created?
+curl http://localhost:8888/api/media?product_id={product_id} | jq length   # → > 0
+```
+
+### 9.9 Security QC gates
+
+- [ ] No endpoint without auth (except `/health`, `/ws`, `/api/cad/{id}/thumbnail`)
+- [ ] All file uploads: MIME type + size validated
+- [ ] Zip download: `assert asset.tenant_id == current_tenant.id` before adding a file
+- [ ] MinIO: buckets not public; presigned URLs with TTL for downloads
+- [ ] RLS active: `SELECT relrowsecurity FROM pg_class WHERE relname = 'products'` → `t`
+- [ ] JWT secret in `.env`, not in code
+
+### 9.10 Performance QC gates
+
+- [ ] No N+1 queries (`selectinload` / `joinedload` in list endpoints)
+- [ ] List endpoints paginated (max. 100 items/page)
+- [ ] Zip download streams (StreamingResponse)
+- [ ] WebSocket: no broadcasting across all tenants (own tenant only)
+- [ ] Thumbnails: `Cache-Control: max-age=3600` header
+
+---
+
+## 10. Open Decisions
+
+| # | Question | Options | Recommendation / status |
+|---|---|---|---|
+| 1 | **Blender version** | ~~4.x / 5.0.1~~ | **Decided: >= 5.0.1 required, upgrade to 5.1 as soon as available** |
+| 2 | React Flow license | MIT / Pro | MIT is enough for an internal system |
+| 3 | PDF generator | WeasyPrint / ReportLab | WeasyPrint (HTML→PDF) |
+| 4 | Mobile support scope | iPad (768 px) / fully mobile (375 px) | 768 px minimum |
+| 5 | OrderItem refactor | now / v3 | v3 (too much dependent code) |
+| 6 | Blender GPU config | per worker via `deploy.resources` | stays, NVIDIA support via ENV |
+| 7 | E-mail notifications | SMTP now / later | later; in-app only in v2 |
+| 8 | Three.js thumbnail batch regeneration | mandatory / on demand | mandatory at the refactor deploy |
+| 9 | MinIO backup strategy | MinIO replication / S3 sync | out of scope for v2; document in `.env` |
+| 10 | CI/CD pipeline | GitHub Actions / local | GitHub Actions for lint + tests |
+| 11 | **glTF material fidelity** | PBR export / texture baking | **✅ 11A: PBR export only**: Principled BSDF → GLB, no baking |
+| 12 | **Asset library: link vs. append** | `link=True` (reference stays) / `link=False` (copy) | **✅ 12B: link=True**: library as a reference; .blend exports use pack_all() for self-contained files |
+| 13 | **blend_production TTL** | 7 days / 30 days / permanent | **✅ 13C: permanent**: .blend files stay in MinIO indefinitely; size warnings via app_config |
+| 14 | **ThreeDViewer environment** | studio only / several presets | studio preset in the v2 scope; more presets in v3 |
+
+---
+
+## Approval
+
+**Confirm the architecture decisions:**
+
+- [x] ADR-01: PostgreSQL RLS for tenant isolation
+- [x] ADR-02: MinIO as shared object storage (replaces NFS)
+- [x] ADR-03: Celery Canvas as the workflow engine, React Flow for visualization only
+- [x] ADR-04: domain-driven project structure
+- [x] ADR-05: WebSocket for dashboard events, SSE only for task logs
+- [x] ADR-06: Blender >= 5.0.1 required, BLENDER_VERSION as a build arg, upgrade to 5.1
+- [x] ADR-07: Blender asset library (materials + modifiers), `asset_libraries` model
+
+**Confirmed decisions (section 10):**
+- [x] 11A: glTF PBR export only (no texture baking)
+- [x] 12B: asset library link=True + pack_all() for .blend exports
+- [x] 13C: blend_production permanent in MinIO
+- [x] Existing API endpoints stay intact during the refactor (17)
+- [x] Phased implementation with quality gates (18)
+
+**Planning:**
+
+- [x] Plan approved as a whole
+- [x] Open decisions from section 10 resolved
+- [x] Start of phase A confirmed
+- [x] Git tag `v1-stable` created on main
+- [x] Git branch `refactor/v2` created
diff --git a/backend/alembic/versions/032_remove_flamenco.py b/backend/alembic/versions/032_remove_flamenco.py
new file mode 100644
index 0000000..ad0f788
--- /dev/null
+++ b/backend/alembic/versions/032_remove_flamenco.py
@@ -0,0 +1,49 @@
+"""Remove Flamenco: cancel lingering flamenco jobs, clean up settings.
+
+Revision ID: 032
+Revises: 031
+Create Date: 2026-03-06
+"""
+from alembic import op
+import sqlalchemy as sa
+
+revision = '032'
+down_revision = '031'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # Cancel any order lines that were dispatched to Flamenco and never completed.
+    # COALESCE guards against a NULL render_log, which would otherwise swallow
+    # the cancelled_reason entirely.
+    op.execute("""
+        UPDATE order_lines
+        SET render_status = 'cancelled',
+            render_completed_at = NOW(),
+            render_log = COALESCE(render_log, '{}'::jsonb)
+                         || '{"cancelled_reason": "flamenco_removed_in_v2"}'::jsonb
+        WHERE render_backend_used = 'flamenco'
+          AND render_status IN ('processing', 'pending')
+    """)
+
+    # Remove Flamenco-specific system settings
+    op.execute("""
+        DELETE FROM system_settings
+        WHERE key IN ('flamenco_manager_url', 'flamenco_worker_count')
+    """)
+
+    # Reset render_backend setting to 'celery' if it was 'flamenco' or 'auto'
+    op.execute("""
+        UPDATE system_settings
+        SET value = 'celery'
+        WHERE key = 'render_backend' AND value IN ('flamenco', 'auto')
+    """)
+
+
+def downgrade():
+    # Re-insert default Flamenco settings
+    op.execute("""
+        INSERT INTO system_settings (key, value, updated_at)
+        VALUES
+            ('flamenco_manager_url', 'http://flamenco-manager:8080', NOW()),
+            ('flamenco_worker_count', '1', NOW())
+        ON CONFLICT (key) DO NOTHING
+    """)
diff --git a/backend/app/api/routers/admin.py b/backend/app/api/routers/admin.py
index d6bca04..2e4c61d 100644
--- a/backend/app/api/routers/admin.py
+++ b/backend/app/api/routers/admin.py
@@ -1,4 +1,3 @@
-import asyncio
 import json
 import uuid
 from datetime import datetime
@@ -17,27 +16,21 @@ from app.utils.auth import require_admin, hash_password
 
 router = APIRouter(prefix="/admin", tags=["admin"])
 
-VALID_RENDERERS = {"pillow", "blender", "threejs"}
-VALID_ENGINES = {"cycles", "eevee"}
-VALID_THREEJS_SIZES = {512, 1024, 2048}
-VALID_FORMATS = {"jpg", "png"}
-VALID_STL_QUALITIES = {"low", "high"}
+VALID_RENDERERS = {"pillow", "blender"}
+VALID_ENGINES = {"cycles", "eevee"}
+VALID_FORMATS = {"jpg", "png"}
+VALID_STL_QUALITIES = {"low", "high"}
 VALID_CYCLES_DEVICES = {"auto", "gpu", "cpu"}
-VALID_RENDER_BACKENDS = {"celery", "flamenco", "auto"}
-
 SETTINGS_DEFAULTS: dict[str, str] = {
-    "thumbnail_renderer": "pillow",
+    "thumbnail_renderer": "blender",
     "blender_engine": "cycles",
     "blender_cycles_samples": "256",
     "blender_eevee_samples": "64",
-    "threejs_render_size": "1024",
     "thumbnail_format": "jpg",
     "stl_quality": "low",
     "blender_smooth_angle": "30",
     "cycles_device": "auto",
     "render_backend": "celery",
-    "flamenco_manager_url": "http://flamenco-manager:8080",
-    "flamenco_worker_count": "1",
     "blender_max_concurrent_renders": "3",
     "product_thumbnail_priority": '["latest_render","cad_thumbnail"]',
     "render_stall_timeout_minutes": "120",
@@ -45,18 +38,15 @@ SETTINGS_DEFAULTS: dict[str, str] = {
 
 
 class SettingsOut(BaseModel):
-    thumbnail_renderer: str = "pillow"
+    thumbnail_renderer: str = "blender"
     blender_engine: str = "cycles"
     blender_cycles_samples: int = 256
     blender_eevee_samples: int = 64
-    threejs_render_size: int = 1024
     thumbnail_format: str = "jpg"
     stl_quality: str = "low"
     blender_smooth_angle: int = 30
    cycles_device: str = "auto"
     render_backend: str = "celery"
-    flamenco_manager_url: str = "http://flamenco-manager:8080"
-    flamenco_worker_count: int = 1
     blender_max_concurrent_renders: int = 3
     product_thumbnail_priority: str = '["latest_render","cad_thumbnail"]'
     render_stall_timeout_minutes: int = 120
@@ -67,14 +57,11 @@ class SettingsUpdate(BaseModel):
     blender_engine: str | None = None
     blender_cycles_samples: int | None = None
     blender_eevee_samples: int | None = None
= None - threejs_render_size: int | None = None thumbnail_format: str | None = None stl_quality: str | None = None blender_smooth_angle: int | None = None cycles_device: str | None = None render_backend: str | None = None - flamenco_manager_url: str | None = None - flamenco_worker_count: int | None = None blender_max_concurrent_renders: int | None = None product_thumbnail_priority: str | None = None render_stall_timeout_minutes: int | None = None @@ -171,14 +158,11 @@ def _settings_to_out(raw: dict[str, str]) -> SettingsOut: blender_engine=raw["blender_engine"], blender_cycles_samples=int(raw["blender_cycles_samples"]), blender_eevee_samples=int(raw["blender_eevee_samples"]), - threejs_render_size=int(raw["threejs_render_size"]), thumbnail_format=raw["thumbnail_format"], stl_quality=raw["stl_quality"], blender_smooth_angle=int(raw["blender_smooth_angle"]), cycles_device=raw["cycles_device"], render_backend=raw["render_backend"], - flamenco_manager_url=raw["flamenco_manager_url"], - flamenco_worker_count=int(raw["flamenco_worker_count"]), blender_max_concurrent_renders=int(raw["blender_max_concurrent_renders"]), product_thumbnail_priority=raw.get("product_thumbnail_priority", '["latest_render","cad_thumbnail"]'), render_stall_timeout_minutes=int(raw.get("render_stall_timeout_minutes", "120")), @@ -207,8 +191,6 @@ async def update_settings( raise HTTPException(400, detail="blender_cycles_samples must be 1–4096") if body.blender_eevee_samples is not None and not (1 <= body.blender_eevee_samples <= 1024): raise HTTPException(400, detail="blender_eevee_samples must be 1–1024") - if body.threejs_render_size is not None and body.threejs_render_size not in VALID_THREEJS_SIZES: - raise HTTPException(400, detail=f"Invalid threejs_render_size. Choose: {', '.join(str(s) for s in sorted(VALID_THREEJS_SIZES))}") if body.thumbnail_format is not None and body.thumbnail_format not in VALID_FORMATS: raise HTTPException(400, detail=f"Invalid thumbnail_format. Choose: {', '.join(sorted(VALID_FORMATS))}") if body.stl_quality is not None and body.stl_quality not in VALID_STL_QUALITIES: @@ -217,10 +199,6 @@ async def update_settings( raise HTTPException(400, detail="blender_smooth_angle must be 0–180 degrees") if body.cycles_device is not None and body.cycles_device not in VALID_CYCLES_DEVICES: raise HTTPException(400, detail=f"Invalid cycles_device. Choose: {', '.join(sorted(VALID_CYCLES_DEVICES))}") - if body.render_backend is not None and body.render_backend not in VALID_RENDER_BACKENDS: - raise HTTPException(400, detail=f"Invalid render_backend. 
Choose: {', '.join(sorted(VALID_RENDER_BACKENDS))}") - if body.flamenco_worker_count is not None and not (1 <= body.flamenco_worker_count <= 16): - raise HTTPException(400, detail="flamenco_worker_count must be 1–16") if body.blender_max_concurrent_renders is not None and not (1 <= body.blender_max_concurrent_renders <= 16): raise HTTPException(400, detail="blender_max_concurrent_renders must be 1–16") if body.render_stall_timeout_minutes is not None and not (10 <= body.render_stall_timeout_minutes <= 10080): @@ -252,8 +230,6 @@ async def update_settings( updates["blender_cycles_samples"] = str(body.blender_cycles_samples) if body.blender_eevee_samples is not None: updates["blender_eevee_samples"] = str(body.blender_eevee_samples) - if body.threejs_render_size is not None: - updates["threejs_render_size"] = str(body.threejs_render_size) if body.thumbnail_format is not None: updates["thumbnail_format"] = body.thumbnail_format if body.stl_quality is not None: @@ -264,10 +240,6 @@ async def update_settings( updates["cycles_device"] = body.cycles_device if body.render_backend is not None: updates["render_backend"] = body.render_backend - if body.flamenco_manager_url is not None: - updates["flamenco_manager_url"] = body.flamenco_manager_url - if body.flamenco_worker_count is not None: - updates["flamenco_worker_count"] = str(body.flamenco_worker_count) if body.blender_max_concurrent_renders is not None: updates["blender_max_concurrent_renders"] = str(body.blender_max_concurrent_renders) if body.render_stall_timeout_minutes is not None: @@ -392,7 +364,6 @@ async def renderer_status( services = { "pillow": {"url": None, "available": True, "note": "Built-in (always available)"}, "blender": {"url": "http://blender-renderer:8100/health", "available": False, "note": ""}, - "threejs": {"url": "http://threejs-renderer:8101/health", "available": False, "note": ""}, } async with httpx.AsyncClient(timeout=3.0) as client: for name, info in services.items(): @@ -409,78 +380,3 @@ async def renderer_status( return services -@router.get("/settings/flamenco-status") -async def flamenco_status( - admin: User = Depends(require_admin), - db: AsyncSession = Depends(get_db), -): - """Check Flamenco Manager health and list workers.""" - raw = await _load_settings(db) - manager_url = raw.get("flamenco_manager_url", "http://flamenco-manager:8080") - - from app.services.flamenco_client import get_flamenco_client - client = get_flamenco_client(manager_url) - - health = client.health_check() - workers: list[dict] = [] - - if health["available"]: - try: - workers = client.list_workers() - except Exception as exc: - workers = [{"error": str(exc)[:200]}] - - return { - "manager": health, - "workers": workers, - "manager_url": manager_url, - } - - -class WorkerCountBody(BaseModel): - count: int - - -@router.get("/settings/flamenco-worker-actual") -async def get_flamenco_worker_actual(admin: User = Depends(require_admin)): - """Return the number of flamenco-worker containers currently running.""" - from app.services.docker_scaler import get_running_worker_count - count = await asyncio.get_event_loop().run_in_executor(None, get_running_worker_count) - return {"running": count, "available": count >= 0} - - -@router.post("/settings/flamenco-worker-count") -async def set_flamenco_worker_count( - body: WorkerCountBody, - admin: User = Depends(require_admin), - db: AsyncSession = Depends(get_db), -): - """Scale Flamenco worker containers to the requested count via Docker socket.""" - if not (1 <= body.count <= 16): - raise 
HTTPException(400, detail="Worker count must be 1–16") - - # Save desired count to settings first - await _save_setting(db, "flamenco_worker_count", str(body.count)) - await db.commit() - - # Perform actual Docker scaling in a thread (blocking SDK call) - from app.services.docker_scaler import scale_workers - try: - result = await asyncio.get_event_loop().run_in_executor(None, scale_workers, body.count) - return { - "count": body.count, - "previous": result["previous"], - "current": result["current"], - "delta": result["delta"], - "message": result["message"], - } - except Exception as exc: - # Scaling failed — return a warning but keep the saved setting - return { - "count": body.count, - "previous": -1, - "current": -1, - "delta": 0, - "message": f"Setting saved, but Docker scaling failed: {exc}. " - f"Run `docker compose up -d --scale flamenco-worker={body.count}` manually.", - } diff --git a/backend/app/api/routers/orders.py b/backend/app/api/routers/orders.py index 78adb4f..c165fe9 100644 --- a/backend/app/api/routers/orders.py +++ b/backend/app/api/routers/orders.py @@ -920,44 +920,17 @@ async def cancel_line_render( if line.render_status not in ("processing", "pending"): raise HTTPException(400, detail=f"Line render_status is '{line.render_status}', nothing to cancel") - cancelled_backend = line.render_backend_used or "unknown" + cancelled_backend = line.render_backend_used or "celery" errors: list[str] = [] - # Cancel Flamenco job if applicable - if line.render_backend_used == "flamenco" and line.flamenco_job_id: - try: - from app.services.flamenco_client import get_flamenco_client - from app.models.system_setting import SystemSetting - row = await db.execute( - select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url") - ) - setting = row.scalar_one_or_none() - url = setting.value if setting else "http://flamenco-manager:8080" - client = get_flamenco_client(url) - client.cancel_job(line.flamenco_job_id) - except Exception as exc: - errors.append(f"Flamenco cancel failed: {str(exc)[:200]}") - - # Revoke Celery task if applicable - if line.render_backend_used == "celery" or not line.render_backend_used: - try: - from app.tasks.celery_app import celery_app - celery_app.control.revoke( - f"render-{line_id}", terminate=True, signal="SIGTERM" - ) - except Exception as exc: - errors.append(f"Celery revoke failed: {str(exc)[:200]}") - - # Also kill the Blender subprocess in the renderer microservice. - # The job_id sent to blender-renderer equals the order_line_id. 
- try: - import httpx as _httpx - _httpx.post( - f"http://blender-renderer:8100/cancel/{line_id}", - timeout=5.0, - ) - except Exception: - pass # best-effort; renderer may not be running a job for this line + # Revoke Celery task (best-effort) + try: + from app.tasks.celery_app import celery_app + celery_app.control.revoke( + f"render-{line_id}", terminate=True, signal="SIGTERM" + ) + except Exception as exc: + errors.append(f"Celery revoke failed: {str(exc)[:200]}") # Mark line as cancelled from sqlalchemy import update as sql_update @@ -1013,47 +986,21 @@ async def cancel_order_renders( if not lines: raise HTTPException(400, detail="No active renders to cancel") - from app.services.flamenco_client import get_flamenco_client - from app.models.system_setting import SystemSetting from app.tasks.celery_app import celery_app from sqlalchemy import update as sql_update - # Load Flamenco URL once - row = await db.execute( - select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url") - ) - setting = row.scalar_one_or_none() - flamenco_url = setting.value if setting else "http://flamenco-manager:8080" - now = datetime.utcnow() cancelled_count = 0 errors: list[str] = [] for line in lines: - # Cancel Flamenco job - if line.render_backend_used == "flamenco" and line.flamenco_job_id: - try: - client = get_flamenco_client(flamenco_url) - client.cancel_job(line.flamenco_job_id) - except Exception as exc: - errors.append(f"Line {line.id}: Flamenco cancel failed: {str(exc)[:100]}") - - # Revoke Celery task + kill Blender subprocess in renderer service - if line.render_backend_used == "celery" or not line.render_backend_used: - try: - celery_app.control.revoke( - f"render-{line.id}", terminate=True, signal="SIGTERM" - ) - except Exception: - pass # Celery revoke is best-effort - try: - import httpx as _httpx - _httpx.post( - f"http://blender-renderer:8100/cancel/{line.id}", - timeout=5.0, - ) - except Exception: - pass # best-effort + # Revoke Celery task (best-effort) + try: + celery_app.control.revoke( + f"render-{line.id}", terminate=True, signal="SIGTERM" + ) + except Exception: + pass await db.execute( sql_update(OrderLine) diff --git a/backend/app/api/routers/worker.py b/backend/app/api/routers/worker.py index 4e62656..90006ce 100644 --- a/backend/app/api/routers/worker.py +++ b/backend/app/api/routers/worker.py @@ -38,7 +38,6 @@ class RenderJobEntry(BaseModel): output_type_name: str | None render_status: str render_backend_used: str | None - flamenco_job_id: str | None render_started_at: str | None render_completed_at: str | None updated_at: str @@ -140,7 +139,6 @@ async def get_worker_activity( output_type_name=rl.output_type.name if rl.output_type else None, render_status=rl.render_status, render_backend_used=rl.render_backend_used, - flamenco_job_id=rl.flamenco_job_id, render_started_at=rl.render_started_at.isoformat() if rl.render_started_at else None, render_completed_at=rl.render_completed_at.isoformat() if rl.render_completed_at else None, updated_at=rl.updated_at.isoformat(), diff --git a/backend/app/models/output_type.py b/backend/app/models/output_type.py index b91e03c..bef1bc6 100644 --- a/backend/app/models/output_type.py +++ b/backend/app/models/output_type.py @@ -4,7 +4,7 @@ from sqlalchemy import String, DateTime, Boolean, Text, Integer, ForeignKey from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.dialects.postgresql import UUID, JSONB -VALID_RENDER_BACKENDS = {"celery", "flamenco", "auto"} +VALID_RENDER_BACKENDS = {"celery"} from 
app.database import Base diff --git a/backend/app/services/docker_scaler.py b/backend/app/services/docker_scaler.py deleted file mode 100644 index ca0a17e..0000000 --- a/backend/app/services/docker_scaler.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Scale Flamenco worker containers via the Docker socket. - -Uses the Docker Python SDK (docker>=6.1.0) to list, start, and stop containers. -Requires /var/run/docker.sock to be mounted into the backend container. -""" -import os -import logging - -log = logging.getLogger(__name__) - -COMPOSE_PROJECT = os.getenv("COMPOSE_PROJECT_NAME", "schaefflerautomat") -SERVICE_NAME = "flamenco-worker" - - -def _get_client(): - import docker - return docker.from_env() - - -def get_worker_containers(client=None): - """Return all flamenco-worker containers (running + stopped) sorted by name.""" - if client is None: - client = _get_client() - return sorted( - client.containers.list( - all=True, - filters={ - "label": [ - f"com.docker.compose.project={COMPOSE_PROJECT}", - f"com.docker.compose.service={SERVICE_NAME}", - ] - }, - ), - key=lambda c: c.name, - ) - - -def get_running_worker_count(client=None) -> int: - """Return how many flamenco-worker containers are currently running.""" - try: - if client is None: - client = _get_client() - containers = get_worker_containers(client) - return sum(1 for c in containers if c.status == "running") - except Exception as exc: - log.warning("docker_scaler: could not read worker count: %s", exc) - return -1 - - -def scale_workers(target: int) -> dict: - """Scale flamenco-worker containers to *target* count. - - Returns a dict with keys: - previous – containers running before - current – containers running after - delta – change (negative = stopped, positive = started) - message – human-readable summary - """ - import docker - from docker.types import Mount - - client = _get_client() - - all_workers = get_worker_containers(client) - running = [c for c in all_workers if c.status == "running"] - previous = len(running) - - if target == previous: - return {"previous": previous, "current": previous, "delta": 0, - "message": f"Already at {previous} worker(s) — no change"} - - # ── Scale down ──────────────────────────────────────────────────────────── - if target < previous: - # Stop highest-numbered containers first to minimise disruption - to_stop = sorted(running, key=lambda c: c.name, reverse=True)[: previous - target] - for c in to_stop: - log.info("docker_scaler: stopping %s", c.name) - c.stop(timeout=20) - c.remove() - return { - "previous": previous, - "current": target, - "delta": target - previous, - "message": f"Stopped {len(to_stop)} worker(s): {[c.name for c in to_stop]}", - } - - # ── Scale up ────────────────────────────────────────────────────────────── - template = running[0] if running else (all_workers[0] if all_workers else None) - if template is None: - raise RuntimeError( - "No existing flamenco-worker container found to clone configuration from. " - "Ensure at least one worker container exists (even if stopped)." 
- ) - - attrs = template.attrs - image = attrs["Config"]["Image"] - env = attrs["Config"].get("Env") or [] - - # Reconstruct mounts from the template container - mounts = [] - for m in (attrs.get("Mounts") or []): - mount_type = m.get("Type", "bind") - source = m.get("Name", "") if mount_type == "volume" else m.get("Source", "") - mounts.append( - Mount( - target=m["Destination"], - source=source, - type=mount_type, - read_only=not m.get("RW", True), - ) - ) - - # Reconstruct GPU device requests (nvidia) - device_requests = None - raw_dr = (attrs.get("HostConfig") or {}).get("DeviceRequests") or [] - if raw_dr: - device_requests = [] - for dr in raw_dr: - device_requests.append( - docker.types.DeviceRequest( - driver=dr.get("Driver", ""), - count=dr.get("Count", -1), - device_ids=dr.get("DeviceIDs") or [], - capabilities=dr.get("Capabilities") or [], - options=dr.get("Options") or {}, - ) - ) - - # Network(s) the template is connected to - network_names = list( - (attrs.get("NetworkSettings") or {}).get("Networks", {}).keys() - ) - - restart_policy_name = ( - (attrs.get("HostConfig") or {}) - .get("RestartPolicy", {}) - .get("Name", "unless-stopped") - ) or "unless-stopped" - - started = [] - for i in range(previous + 1, target + 1): - new_name = f"{COMPOSE_PROJECT}-{SERVICE_NAME}-{i}" - labels = { - "com.docker.compose.project": COMPOSE_PROJECT, - "com.docker.compose.service": SERVICE_NAME, - "com.docker.compose.container-number": str(i), - } - - log.info("docker_scaler: creating %s from image %s", new_name, image) - container = client.containers.create( - image=image, - name=new_name, - environment=env, - labels=labels, - mounts=mounts, - restart_policy={"Name": restart_policy_name}, - device_requests=device_requests, - ) - - for net_name in network_names: - try: - net = client.networks.get(net_name) - net.connect(container) - log.info("docker_scaler: connected %s to network %s", new_name, net_name) - except Exception as exc: - log.warning("docker_scaler: could not connect to network %s: %s", net_name, exc) - - container.start() - started.append(new_name) - log.info("docker_scaler: started %s", new_name) - - return { - "previous": previous, - "current": target, - "delta": target - previous, - "message": f"Started {len(started)} new worker(s): {started}", - } diff --git a/backend/app/services/flamenco_client.py b/backend/app/services/flamenco_client.py deleted file mode 100644 index 4807897..0000000 --- a/backend/app/services/flamenco_client.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Flamenco Manager REST API client. - -Uses httpx (sync) for compatibility with Celery tasks and FastAPI endpoints. -""" -import logging -from typing import Any - -import httpx - -logger = logging.getLogger(__name__) - -DEFAULT_TIMEOUT = 10.0 - - -class FlamencoClient: - """Thin wrapper around the Flamenco Manager v3 REST API.""" - - def __init__(self, manager_url: str): - self.base_url = manager_url.rstrip("/") - - def _url(self, path: str) -> str: - return f"{self.base_url}{path}" - - # ── Job management ────────────────────────────────────────────────────── - - def submit_job( - self, - name: str, - job_type: str, - settings: dict[str, Any], - metadata: dict[str, str] | None = None, - priority: int = 50, - ) -> dict: - """Submit a new render job to Flamenco Manager. - - Returns the created job dict (includes 'id'). 
- """ - payload = { - "name": name, - "type": job_type, - "submitter_platform": "linux", - "settings": settings, - "metadata": metadata or {}, - "priority": priority, - } - resp = httpx.post( - self._url("/api/v3/jobs"), - json=payload, - timeout=DEFAULT_TIMEOUT, - ) - resp.raise_for_status() - return resp.json() - - def get_job(self, job_id: str) -> dict: - """Get job details by ID.""" - resp = httpx.get( - self._url(f"/api/v3/jobs/{job_id}"), - timeout=DEFAULT_TIMEOUT, - ) - resp.raise_for_status() - return resp.json() - - def cancel_job(self, job_id: str) -> None: - """Request cancellation of a job.""" - resp = httpx.post( - self._url(f"/api/v3/jobs/{job_id}/setstatus"), - json={"status": "cancel-requested"}, - timeout=DEFAULT_TIMEOUT, - ) - resp.raise_for_status() - - # ── Workers ───────────────────────────────────────────────────────────── - - def list_workers(self) -> list[dict]: - """List all registered workers.""" - resp = httpx.get( - self._url("/api/v3/worker-mgt/workers"), - timeout=DEFAULT_TIMEOUT, - ) - resp.raise_for_status() - data = resp.json() - return data.get("workers", data) if isinstance(data, dict) else data - - # ── Farm status ───────────────────────────────────────────────────────── - - def get_farm_status(self) -> dict: - """Get overall farm status from the Manager.""" - resp = httpx.get( - self._url("/api/v3/configuration"), - timeout=DEFAULT_TIMEOUT, - ) - resp.raise_for_status() - return resp.json() - - def health_check(self) -> dict: - """Check if the Flamenco Manager is reachable and return version info.""" - try: - resp = httpx.get( - self._url("/api/v3/version"), - timeout=5.0, - ) - resp.raise_for_status() - data = resp.json() - return { - "available": True, - "version": data.get("version", "unknown"), - "name": data.get("name", "Flamenco"), - } - except Exception as exc: - logger.warning(f"Flamenco health check failed: {exc}") - return { - "available": False, - "version": None, - "name": None, - "error": str(exc)[:200], - } - - -def get_flamenco_client(manager_url: str) -> FlamencoClient: - """Factory that creates a FlamencoClient from a manager URL.""" - return FlamencoClient(manager_url) diff --git a/backend/app/services/render_dispatcher.py b/backend/app/services/render_dispatcher.py index de7da2d..0b24842 100644 --- a/backend/app/services/render_dispatcher.py +++ b/backend/app/services/render_dispatcher.py @@ -1,12 +1,7 @@ -"""Render dispatcher — routes render jobs to Celery or Flamenco. +"""Render dispatcher — routes render jobs to Celery. -Backend selection priority: -1. OutputType.render_backend per-type override ("celery" / "flamenco") -2. OutputType.is_animation — animations default to Flamenco -3. System setting render_backend — global default ("celery" / "flamenco" / "auto") -4. "auto" mode: stills → Celery, animations → Flamenco +All renders run via Celery workers (Flamenco removed in v2 refactor). """ -import json import logging from datetime import datetime @@ -14,7 +9,6 @@ from sqlalchemy import select, update as sql_update from sqlalchemy.orm import Session, joinedload from app.models.order_line import OrderLine -from app.models.output_type import OutputType from app.models.product import Product from app.models.system_setting import SystemSetting @@ -29,113 +23,11 @@ def _load_setting(session: Session, key: str, default: str = "") -> str: return row.value if row else default -def resolve_backend(output_type: OutputType | None, system_backend: str) -> str: - """Determine which backend to use for a given output type. 
- - Returns "celery" or "flamenco". - """ - if output_type is None: - return "celery" - - # Priority 1: explicit per-type override - ot_backend = output_type.render_backend - if ot_backend in ("celery", "flamenco"): - return ot_backend - - # Priority 2+3: is_animation + system setting - if system_backend in ("celery", "flamenco"): - return system_backend - - # Priority 4: auto mode — animations → Flamenco, stills → Celery - if output_type.is_animation: - return "flamenco" - return "celery" - - -def build_flamenco_job_settings( - output_type: OutputType, - product: Product, - step_path: str, - output_dir: str, - system_settings: dict[str, str], - lighting_only: bool = False, - shadow_catcher: bool = False, - camera_orbit: bool = True, - cycles_device: str = "auto", - rotation_x: float = 0.0, - rotation_y: float = 0.0, - rotation_z: float = 0.0, -) -> dict: - """Build Flamenco job settings from output type and product metadata.""" - render_settings = output_type.render_settings or {} - engine = render_settings.get("engine", system_settings.get("blender_engine", "cycles")) - samples_key = f"blender_{engine}_samples" - samples = render_settings.get("samples", int(system_settings.get(samples_key, "256"))) - stl_quality = render_settings.get("stl_quality", system_settings.get("stl_quality", "low")) - width = render_settings.get("width", 1920 if output_type.is_animation else 1024) - height = render_settings.get("height", 1080 if output_type.is_animation else 1024) - - part_colors = {} - part_names_ordered = [] - if product.cad_file and product.cad_file.parsed_objects: - part_names_ordered = product.cad_file.parsed_objects.get("objects", []) - materials_source = product.cad_part_materials - if materials_source: - from app.services.step_processor import build_part_colors - part_colors = build_part_colors(part_names_ordered, materials_source) - - transparent_bg = bool(output_type.transparent_bg) if hasattr(output_type, 'transparent_bg') else False - - settings = { - "step_path": step_path, - "engine": engine, - "samples": samples, - "stl_quality": stl_quality, - "width": width, - "height": height, - "part_colors_json": json.dumps(part_colors), - "transparent_bg": transparent_bg, - "template_path": "", - "target_collection": "Product", - "material_library_path": "", - "material_map_json": "{}", - "part_names_ordered_json": json.dumps(part_names_ordered), - "lighting_only": lighting_only, - "shadow_catcher": shadow_catcher, - "cycles_device": cycles_device, - "rotation_x": rotation_x, - "rotation_y": rotation_y, - "rotation_z": rotation_z, - } - - for dk in ('noise_threshold', 'denoiser', 'denoising_input_passes', - 'denoising_prefilter', 'denoising_quality', 'denoising_use_gpu'): - settings[dk] = str(render_settings.get(dk, "")) - - if output_type.is_animation: - # Turntable-specific settings - output_name = render_settings.get("output_name", "turntable") - settings["output_dir"] = output_dir - settings["output_name"] = output_name - settings["frame_count"] = render_settings.get("frame_count", 120) - settings["fps"] = render_settings.get("fps", 30) - settings["turntable_degrees"] = render_settings.get("turntable_degrees", 360) - settings["turntable_axis"] = render_settings.get("turntable_axis", "world_z") - settings["bg_color"] = render_settings.get("bg_color", "") - settings["camera_orbit"] = camera_orbit - else: - # Still-specific settings - ext = output_type.output_format or "png" - settings["output_path"] = f"{output_dir}/render.{ext}" - - return settings - - def dispatch_render(order_line_id: 
str) -> dict: - """Route a render job to Celery or Flamenco based on configuration. + """Dispatch a render job to Celery. Must be called from a sync context (Celery task or sync wrapper). - Returns {"backend": "celery"|"flamenco", "job_ref": str}. + Returns {"backend": "celery", "job_ref": str}. """ from app.config import settings as app_settings from app.services.render_log import emit, clear @@ -179,196 +71,26 @@ def dispatch_render(order_line_id: str) -> dict: cad_name = line.product.cad_file.original_name if line.product.cad_file else "?" emit(order_line_id, f"CAD file: {cad_name}") + emit(order_line_id, "Dispatching to Celery render worker") - # Load system settings - system_backend = _load_setting(session, "render_backend", "celery") - flamenco_url = _load_setting(session, "flamenco_manager_url", "http://flamenco-manager:8080") - - backend = resolve_backend(line.output_type, system_backend) - emit(order_line_id, f"Resolved backend: {backend}") - - # Mark as processing now = datetime.utcnow() session.execute( sql_update(OrderLine) .where(OrderLine.id == line.id) .values( render_status="processing", - render_backend_used=backend, + render_backend_used="celery", render_started_at=now, ) ) session.commit() - if backend == "flamenco": - emit(order_line_id, f"Submitting job to Flamenco Manager ({flamenco_url})") - result = _dispatch_flamenco(session, line, flamenco_url) - if result.get("error"): - emit(order_line_id, f"Flamenco submit failed: {result['error']}", "error") - else: - emit(order_line_id, f"Flamenco job submitted: {result.get('job_ref', '?')}") - return result - else: - emit(order_line_id, "Dispatching to Celery render worker") - return _dispatch_celery(order_line_id) - engine_db.dispose() + return _dispatch_celery(order_line_id) def _dispatch_celery(order_line_id: str) -> dict: - """Dispatch to the existing Celery render task.""" + """Dispatch to the Celery render task.""" from app.tasks.step_tasks import render_order_line_task result = render_order_line_task.delay(order_line_id) return {"backend": "celery", "job_ref": result.id} - - -def _dispatch_flamenco(session: Session, line: OrderLine, flamenco_url: str) -> dict: - """Submit a job to Flamenco Manager.""" - import re - from app.services.flamenco_client import get_flamenco_client - - # Load all needed system settings - all_keys = ["blender_engine", "blender_cycles_samples", "blender_eevee_samples", "stl_quality", "cycles_device"] - sys_settings = {} - for key in all_keys: - sys_settings[key] = _load_setting(session, key, "") - - output_type = line.output_type - product = line.product - cad_file = product.cad_file - - # Load render_position for rotation values - rotation_x = rotation_y = rotation_z = 0.0 - if line.render_position_id: - from app.models.render_position import ProductRenderPosition - rp = session.get(ProductRenderPosition, line.render_position_id) - if rp: - rotation_x, rotation_y, rotation_z = rp.rotation_x, rp.rotation_y, rp.rotation_z - - # Flamenco mounts the uploads volume at /shared, backend uses /app/uploads - raw_path = cad_file.stored_path if cad_file else "" - step_path = raw_path.replace("/app/uploads/", "/shared/") if raw_path else "" - output_dir = f"/shared/renders/{line.id}" - - job_type = "schaeffler-turntable" if (output_type and output_type.is_animation) else "schaeffler-still" - - # Resolve render template + material library BEFORE building job settings - # (template.lighting_only is needed by build_flamenco_job_settings) - from app.services.template_service import resolve_template, 
get_material_library_path - - category_key = product.category_key if product else None - ot_id = str(line.output_type_id) if line.output_type_id else None - template = resolve_template(category_key=category_key, output_type_id=ot_id) - material_library = get_material_library_path() - - # Resolve cycles_device: per-output-type override wins, fall back to system setting - ot_cycles_device = output_type.cycles_device if output_type else None - effective_cycles_device = ot_cycles_device or sys_settings.get("cycles_device", "gpu") or "gpu" - - settings = build_flamenco_job_settings( - output_type=output_type, - product=product, - step_path=step_path, - output_dir=output_dir, - system_settings=sys_settings, - lighting_only=bool(template.lighting_only) if template else False, - shadow_catcher=bool(template.shadow_catcher_enabled) if template else False, - camera_orbit=bool(template.camera_orbit) if template else True, - cycles_device=effective_cycles_device, - rotation_x=rotation_x, - rotation_y=rotation_y, - rotation_z=rotation_z, - ) - - if template: - # Remap path for Flamenco shared volume - tmpl_path = template.blend_file_path.replace("/app/uploads/", "/shared/") - settings["template_path"] = tmpl_path - settings["target_collection"] = template.target_collection - logger.info( - f"Flamenco job: using render template '{template.name}' " - f"(id={template.id}, path={tmpl_path}, collection={template.target_collection})" - ) - else: - logger.info( - f"Flamenco job: no render template found for " - f"category_key={category_key!r}, output_type_id={ot_id!r} — using factory settings" - ) - - # Material library + material map: send whenever library exists and product - # has material assignments — works with or without a render template. - # When a template is present, only apply if material_replace_enabled is set. 
- materials_source = product.cad_part_materials - use_materials = bool(material_library and materials_source) - if template and not template.material_replace_enabled: - use_materials = False - - if use_materials: - mat_lib_path = material_library.replace("/app/uploads/", "/shared/") - settings["material_library_path"] = mat_lib_path - mat_map = { - m["part_name"]: m["material"] - for m in materials_source - if m.get("part_name") and m.get("material") - } - # Resolve raw material names to SCHAEFFLER library names via aliases - from app.services.material_service import resolve_material_map - mat_map = resolve_material_map(mat_map) - settings["material_map_json"] = json.dumps(mat_map) - - # Output naming: meaningful filename instead of generic render.ext - def _sanitize(s: str) -> str: - return re.sub(r'[^\w\-.]', '_', s.strip())[:100] - - product_name = product.name or product.pim_id or "product" - ot_name = output_type.name if output_type else "render" - - if not (output_type and output_type.is_animation): - ext = output_type.output_format or "png" if output_type else "png" - filename = f"{_sanitize(product_name)}_{_sanitize(ot_name)}.{ext}" - settings["output_path"] = f"{output_dir}/{filename}" - - metadata = { - "order_line_id": str(line.id), - "order_id": str(line.order_id), - "product_name": product.name or "", - "output_type": output_type.name if output_type else "", - "category": product.category_key or "", - } - - job_name = f"{product.name or product.pim_id} - {output_type.name if output_type else 'render'}" - - try: - client = get_flamenco_client(flamenco_url) - job = client.submit_job( - name=job_name[:200], - job_type=job_type, - settings=settings, - metadata=metadata, - ) - job_id = job.get("id", "") - - # Save flamenco_job_id - session.execute( - sql_update(OrderLine) - .where(OrderLine.id == line.id) - .values(flamenco_job_id=job_id) - ) - session.commit() - - logger.info(f"Flamenco job submitted: {job_id} for OrderLine {line.id}") - return {"backend": "flamenco", "job_ref": job_id} - - except Exception as exc: - logger.error(f"Flamenco submit failed for OrderLine {line.id}: {exc}") - session.execute( - sql_update(OrderLine) - .where(OrderLine.id == line.id) - .values( - render_status="failed", - render_completed_at=datetime.utcnow(), - render_log={"error": f"Flamenco submit failed: {str(exc)[:500]}"}, - ) - ) - session.commit() - return {"backend": "flamenco", "job_ref": "", "error": str(exc)} diff --git a/backend/app/tasks/celery_app.py b/backend/app/tasks/celery_app.py index 52cab0a..6e8f15e 100644 --- a/backend/app/tasks/celery_app.py +++ b/backend/app/tasks/celery_app.py @@ -5,7 +5,7 @@ celery_app = Celery( "schaefflerautomat", broker=settings.redis_url, backend=settings.redis_url, - include=["app.tasks.step_tasks", "app.tasks.ai_tasks", "app.tasks.flamenco_tasks"], + include=["app.tasks.step_tasks", "app.tasks.ai_tasks"], ) celery_app.conf.update( @@ -17,20 +17,6 @@ celery_app.conf.update( task_routes={ "app.tasks.step_tasks.*": {"queue": "step_processing"}, "app.tasks.ai_tasks.*": {"queue": "ai_validation"}, - "app.tasks.flamenco_tasks.*": {"queue": "step_processing"}, - }, - beat_schedule={ - "poll-flamenco-jobs": { - "task": "app.tasks.flamenco_tasks.poll_flamenco_jobs", - "schedule": 10.0, # every 10 seconds - # Discard if not consumed before the next run; prevents queue build-up - # when workers are busy with long-running STEP/render tasks. 
- "options": {"expires": 9}, - }, - "check-stalled-renders": { - "task": "app.tasks.flamenco_tasks.check_stalled_renders", - "schedule": 300.0, # every 5 minutes - "options": {"expires": 290}, - }, }, + beat_schedule={}, ) diff --git a/backend/app/tasks/flamenco_tasks.py b/backend/app/tasks/flamenco_tasks.py deleted file mode 100644 index d240b38..0000000 --- a/backend/app/tasks/flamenco_tasks.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Celery tasks for polling Flamenco job status and watchdog recovery.""" -import logging -from datetime import datetime, timedelta - -from app.tasks.celery_app import celery_app - -logger = logging.getLogger(__name__) - -# Flamenco status → our render_status mapping -FLAMENCO_STATUS_MAP = { - "queued": "processing", - "active": "processing", - "completed": "completed", - "failed": "failed", - "canceled": "failed", - "cancel-requested": "processing", - "paused": "processing", -} - - -@celery_app.task(name="app.tasks.flamenco_tasks.poll_flamenco_jobs", queue="step_processing") -def poll_flamenco_jobs(): - """Poll Flamenco Manager for active render jobs and update OrderLine status. - - Runs on a Celery Beat schedule (every 10 seconds). - - Uses a Redis lock (TTL=9s) to ensure at most one poll executes per 10-second - window. When the queue backs up with many duplicates (e.g. all workers are - busy with long STEP/render tasks), duplicates acquire the lock, find it taken, - and return immediately — draining the queue without doing redundant work. - """ - import redis as redis_lib - from app.config import settings as app_settings - - # Deduplicate: skip if a poll ran within the last 9 seconds - try: - r = redis_lib.from_url(app_settings.redis_url) - acquired = r.set("flamenco_poll_lock", "1", nx=True, ex=9) - if not acquired: - return {"skipped": "deduplicated"} - except Exception: - pass # Redis unavailable — proceed anyway - - from sqlalchemy import create_engine, select, update as sql_update - from sqlalchemy.orm import Session - from app.models.order_line import OrderLine - from app.models.system_setting import SystemSetting - from app.services.flamenco_client import get_flamenco_client - - sync_url = app_settings.database_url.replace("+asyncpg", "") - engine = create_engine(sync_url) - - # Track orders whose lines transitioned to a terminal state - completed_order_ids = set() - - with Session(engine) as session: - # Load Flamenco Manager URL - row = session.execute( - select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url") - ).scalar_one_or_none() - manager_url = row.value if row else "http://flamenco-manager:8080" - - # Find all OrderLines dispatched to Flamenco that are still processing - lines = session.execute( - select(OrderLine).where( - OrderLine.render_backend_used == "flamenco", - OrderLine.render_status == "processing", - OrderLine.flamenco_job_id.isnot(None), - ) - ).scalars().all() - - if not lines: - engine.dispose() - return {"polled": 0} - - client = get_flamenco_client(manager_url) - updated = 0 - - for line in lines: - try: - job = client.get_job(line.flamenco_job_id) - flamenco_status = job.get("status", "") - our_status = FLAMENCO_STATUS_MAP.get(flamenco_status, "processing") - - if our_status == line.render_status: - continue # No change - - updates = {"render_status": our_status} - - if our_status == "completed": - updates["render_completed_at"] = datetime.utcnow() - # Try to extract result path from job activity - activity = job.get("activity", "") - if activity: - updates["render_log"] = { - "flamenco_job_id": 
line.flamenco_job_id, - "flamenco_status": flamenco_status, - "activity": activity, - } - # Set result path based on job type - job_type = job.get("type", "") - metadata = job.get("metadata", {}) - if job_type == "schaeffler-turntable": - output_dir = job.get("settings", {}).get("output_dir", "") - output_name = job.get("settings", {}).get("output_name", "turntable") - updates["result_path"] = f"{output_dir}/{output_name}.mp4" - elif job_type == "schaeffler-still": - updates["result_path"] = job.get("settings", {}).get("output_path", "") - - elif our_status == "failed": - updates["render_completed_at"] = datetime.utcnow() - updates["render_log"] = { - "flamenco_job_id": line.flamenco_job_id, - "flamenco_status": flamenco_status, - "error": job.get("activity", "Job failed"), - } - - session.execute( - sql_update(OrderLine) - .where(OrderLine.id == line.id) - .values(**updates) - ) - updated += 1 - logger.info( - f"Flamenco job {line.flamenco_job_id}: " - f"{flamenco_status} → render_status={our_status}" - ) - - # Track orders with lines that reached a terminal state - if our_status in ("completed", "failed"): - completed_order_ids.add(str(line.order_id)) - - except Exception as exc: - logger.warning( - f"Failed to poll Flamenco job {line.flamenco_job_id}: {exc}" - ) - - if updated: - session.commit() - - engine.dispose() - - # Auto-advance orders if all renderable lines are done - if completed_order_ids: - from app.services.order_status_service import check_order_completion - for oid in completed_order_ids: - check_order_completion(oid) - - return {"polled": len(lines), "updated": updated} - - -# --------------------------------------------------------------------------- -# Stalled-render watchdog -# --------------------------------------------------------------------------- - -@celery_app.task(name="app.tasks.flamenco_tasks.check_stalled_renders", queue="step_processing") -def check_stalled_renders(): - """Watchdog: detect and re-dispatch render jobs stuck in 'processing'. - - Runs on a Celery Beat schedule (every 5 minutes). - - After a docker restart, Celery workers lose in-flight tasks — the DB still - shows render_status='processing' indefinitely. This task: - - * For **Celery** lines: uses Celery inspect to check whether any worker is - still actively executing the task. If not (e.g. after a restart), and - the job has been stuck longer than ``render_stall_timeout_minutes`` - (default: 120 min), it is reset to 'pending' and re-dispatched. - - * For **Flamenco** lines: queries the Flamenco Manager. If the manager - reports the job as still active the line is left alone; if the job is - gone or in a terminal/error state it is re-dispatched. 
- """ - from sqlalchemy import create_engine, select, update as sql_update - from sqlalchemy.orm import Session - from app.config import settings as app_settings - from app.models.order_line import OrderLine - from app.models.system_setting import SystemSetting - - sync_url = app_settings.database_url.replace("+asyncpg", "") - engine = create_engine(sync_url) - - with Session(engine) as session: - # ── Read timeout from system settings ──────────────────────────────── - row = session.execute( - select(SystemSetting).where(SystemSetting.key == "render_stall_timeout_minutes") - ).scalar_one_or_none() - try: - timeout_minutes = int(row.value) if row else 120 - except (ValueError, TypeError): - timeout_minutes = 120 - - cutoff = datetime.utcnow() - timedelta(minutes=timeout_minutes) - - stalled_lines = session.execute( - select(OrderLine).where( - OrderLine.render_status == "processing", - OrderLine.render_started_at.isnot(None), - OrderLine.render_started_at < cutoff, - ) - ).scalars().all() - - if not stalled_lines: - engine.dispose() - return {"checked": 0, "restarted": 0, "timeout_minutes": timeout_minutes} - - logger.info( - "[watchdog] Found %d stalled render(s) older than %d minutes", - len(stalled_lines), timeout_minutes, - ) - - # ── Build set of order_line_ids actively running on Celery workers ─── - active_celery_line_ids: set[str] = set() - inspect_ok = False - try: - inspect = celery_app.control.inspect(timeout=2) - active_tasks = inspect.active() or {} - for worker_tasks in active_tasks.values(): - for task_info in (worker_tasks or []): - args = task_info.get("args", []) - if args: - active_celery_line_ids.add(str(args[0])) - inspect_ok = True - except Exception as exc: - logger.warning( - "[watchdog] Celery inspect failed (%s) — will re-dispatch all timed-out Celery jobs", - exc, - ) - - # ── Load Flamenco Manager URL ───────────────────────────────────────── - manager_url = "http://flamenco-manager:8080" - try: - url_row = session.execute( - select(SystemSetting).where(SystemSetting.key == "flamenco_manager_url") - ).scalar_one_or_none() - if url_row: - manager_url = url_row.value - except Exception: - pass - - # ── Decide which lines to restart ──────────────────────────────────── - to_restart: list[OrderLine] = [] - - for line in stalled_lines: - line_id = str(line.id) - - if line.flamenco_job_id: - # Flamenco job: verify with manager before re-dispatching - try: - from app.services.flamenco_client import get_flamenco_client - client = get_flamenco_client(manager_url) - job = client.get_job(line.flamenco_job_id) - flamenco_status = job.get("status", "") - if flamenco_status in ( - "active", "queued", "paused", - "pause-requested", "cancel-requested", - ): - logger.info( - "[watchdog] Flamenco job %s is still %s — skipping line %s", - line.flamenco_job_id, flamenco_status, line_id, - ) - continue - logger.info( - "[watchdog] Flamenco job %s status=%r → re-dispatching line %s", - line.flamenco_job_id, flamenco_status, line_id, - ) - except Exception as exc: - # Manager unreachable — skip to avoid false restarts - logger.warning( - "[watchdog] Cannot reach Flamenco for job %s (%s) — skipping line %s", - line.flamenco_job_id, exc, line_id, - ) - continue - else: - # Celery job: skip if still actively running on a worker - if inspect_ok and line_id in active_celery_line_ids: - logger.info( - "[watchdog] Celery render for line %s still active — skipping", line_id - ) - continue - logger.info( - "[watchdog] Celery render for line %s not found in active tasks — re-dispatching", - 
line_id, - ) - - to_restart.append(line) - - if not to_restart: - engine.dispose() - return { - "checked": len(stalled_lines), - "restarted": 0, - "timeout_minutes": timeout_minutes, - } - - # ── Reset stalled lines to pending ─────────────────────────────────── - for line in to_restart: - session.execute( - sql_update(OrderLine) - .where(OrderLine.id == line.id) - .values( - render_status="pending", - render_started_at=None, - render_backend_used=None, - flamenco_job_id=None, - render_log={ - "watchdog": ( - f"Auto-restarted after {timeout_minutes} min stall " - f"(previous backend: {line.render_backend_used or 'unknown'})" - ) - }, - ) - ) - session.commit() - - engine.dispose() - - # ── Re-dispatch outside DB session ─────────────────────────────────────── - from app.services.render_dispatcher import dispatch_render - restarted = 0 - for line in to_restart: - try: - dispatch_render(str(line.id)) - restarted += 1 - logger.info("[watchdog] Re-dispatched render for order line %s", line.id) - except Exception as exc: - logger.error( - "[watchdog] Failed to re-dispatch line %s: %s — left as pending", line.id, exc - ) - - return { - "checked": len(stalled_lines), - "restarted": restarted, - "timeout_minutes": timeout_minutes, - } diff --git a/docker-compose.yml b/docker-compose.yml index da66819..1ecfc45 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -170,39 +170,6 @@ services: - ./threejs-renderer:/app restart: unless-stopped - flamenco-manager: - build: ./flamenco - environment: - - FLAMENCO_MODE=manager - ports: - - "8080:8080" - volumes: - - uploads:/shared - - flamenco-data:/data - - ./flamenco/scripts:/opt/flamenco/scripts - restart: unless-stopped - - flamenco-worker: - build: ./flamenco - environment: - - FLAMENCO_MODE=worker - - FLAMENCO_MANAGER_URL=http://flamenco-manager:8080 - volumes: - - uploads:/shared - - /opt/blender:/opt/blender:ro - - ./flamenco/scripts:/opt/flamenco/scripts - depends_on: - - flamenco-manager - deploy: - replicas: 1 - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu, compute, utility, graphics] - restart: unless-stopped - frontend: build: context: ./frontend @@ -220,4 +187,3 @@ services: volumes: pgdata: uploads: - flamenco-data:
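
A note on the Celery-only cancel path in `orders.py` and `render_dispatcher.py` above: the cancel endpoints revoke the deterministic task ID `render-{line_id}`, while `_dispatch_celery()` enqueues via `.delay()`, which assigns a random UUID. Unless the ID is pinned somewhere inside `step_tasks` (not visible in this diff), the revoke call would never match a running task. A minimal sketch of the dispatch helper with an explicit `task_id`, assuming `render_order_line_task` takes the order-line ID as its only argument:

```python
from app.tasks.step_tasks import render_order_line_task


def _dispatch_celery(order_line_id: str) -> dict:
    """Hypothetical variant: pin the task ID to the revoke convention."""
    # apply_async() with an explicit task_id makes the task addressable by
    # celery_app.control.revoke(f"render-{order_line_id}", terminate=True,
    # signal="SIGTERM") in the cancel endpoints; .delay() would assign a
    # random UUID that the cancel code could never reference.
    result = render_order_line_task.apply_async(
        args=[order_line_id],
        task_id=f"render-{order_line_id}",
    )
    return {"backend": "celery", "job_ref": result.id}
```

If the convention is already enforced inside `step_tasks`, this sketch is redundant and the `.delay()` call can stay as-is; either way, exactly one task per order line is what keeps revoke-by-ID reliable.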
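
Deleting `flamenco_tasks.py` also removes the Celery half of the stalled-render watchdog, yet `render_stall_timeout_minutes` survives in `SETTINGS_DEFAULTS` and its validation remains in `admin.py`. If that setting is meant to stay functional after a container restart wipes in-flight tasks, a slimmed-down beat task could consume it. A sketch under the assumption that the models and `dispatch_render()` keep the signatures shown above (hypothetical module `app/tasks/watchdog_tasks.py`, not part of this diff):

```python
"""Sketch: Celery-only stall watchdog, derived from the deleted flamenco_tasks logic."""
from datetime import datetime, timedelta

from sqlalchemy import create_engine, select, update as sql_update
from sqlalchemy.orm import Session

from app.config import settings as app_settings
from app.models.order_line import OrderLine
from app.models.system_setting import SystemSetting
from app.tasks.celery_app import celery_app


@celery_app.task(name="app.tasks.watchdog_tasks.check_stalled_renders", queue="step_processing")
def check_stalled_renders():
    """Reset and re-dispatch 'processing' lines whose task is no longer running."""
    engine = create_engine(app_settings.database_url.replace("+asyncpg", ""))
    checked = 0
    restart_ids: list[str] = []
    with Session(engine) as session:
        row = session.execute(
            select(SystemSetting).where(SystemSetting.key == "render_stall_timeout_minutes")
        ).scalar_one_or_none()
        timeout = int(row.value) if row and row.value.isdigit() else 120
        cutoff = datetime.utcnow() - timedelta(minutes=timeout)

        stalled = session.execute(
            select(OrderLine).where(
                OrderLine.render_status == "processing",
                OrderLine.render_started_at.isnot(None),
                OrderLine.render_started_at < cutoff,
            )
        ).scalars().all()
        checked = len(stalled)

        # Collect order-line IDs still actively executing on any worker,
        # so a long but healthy render is never restarted.
        active_ids: set[str] = set()
        for tasks in (celery_app.control.inspect(timeout=2).active() or {}).values():
            for t in tasks or []:
                if t.get("args"):
                    active_ids.add(str(t["args"][0]))

        restart_ids = [str(line.id) for line in stalled if str(line.id) not in active_ids]
        for line_id in restart_ids:
            session.execute(
                sql_update(OrderLine)
                .where(OrderLine.id == line_id)
                .values(render_status="pending", render_started_at=None,
                        render_backend_used=None)
            )
        session.commit()
    engine.dispose()

    # Re-dispatch outside the DB session, mirroring the deleted watchdog.
    from app.services.render_dispatcher import dispatch_render
    for line_id in restart_ids:
        dispatch_render(line_id)
    return {"checked": checked, "restarted": len(restart_ids)}
```

A matching entry would go into the now-empty `beat_schedule` (for example every 300 seconds with `expires` just below the interval, mirroring the deleted configuration).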