diff --git a/PLAN_REFACTOR.md b/PLAN_REFACTOR.md new file mode 100644 index 0000000..6360f3e --- /dev/null +++ b/PLAN_REFACTOR.md @@ -0,0 +1,1174 @@ +# Schaeffler Automat — Refactor Plan + +> Document date: 2026-03-08 +> Branch: refactor/v2 +> Author: Architecture review via Claude Code + +--- + +## Executive Summary + +### Current State + +Schaeffler Automat is a working Blender-based media production pipeline with: +- Domain-driven backend structure (partially migrated, many compat shims still present) +- 7 Docker services with GPU render-worker +- PostgreSQL with tenant_id columns + Row Level Security (RLS) enabled but inconsistently + applied at the application layer +- Celery task queues with two workers (step_processing + thumbnail_rendering) +- WebSocket real-time events via Redis Pub/Sub +- React/Vite frontend with workflow editor (ReactFlow), media browser, notifications + +### Core Problems + +1. `step_tasks.py` is 1,170 lines — monolithic task file containing 8+ distinct pipeline steps +2. Tenant isolation is partial: RLS is defined in DB migration 036 but `set_tenant_context()` + is not called consistently in every router; Celery tasks bypass RLS entirely +3. Pillow overlay code (green bar + model name label) is dead code — all renders use + `transparent_bg=True` but the 55-line block still runs conditionally +4. STL workflow remnants: `stl_quality` setting, `VALID_STL_QUALITIES`, `stl_size_bytes` in + render_log dicts still reference the old STL-based pipeline; the actual pipeline is GLB-only +5. Render job cancellation uses a synthetic task ID (`render-{line_id}`) that does not match + actual Celery task IDs — making revoke() a no-op +6. The MATERIAL_PALETTE + palette fallback lives in `step_processor.py` — should be replaced + with `SCHAEFFLER_059999_FailedMaterial` (magenta) per the project goals +7. Log messages are inconsistent: some use Python f-strings with no prefix, others use + `[STEP_NAME]` markers; structured logging is not enforced +8. `render_order_line_task` in `step_tasks.py` duplicates most of + `render_order_line_still_task` in `domains/rendering/tasks.py` +9. The blender_render.py Blender script is 853 lines with no sub-module structure +10. No GPU-first enforcement: `cycles_device` defaults to "auto" with no explicit fallback log + +### Vision + +A clean, modular pipeline where: +- Every step is a named `ProcessStep` with start/progress/done log events and DB audit trail +- Render jobs are tracked as structured JSON documents (job tickets) in the DB +- Tenant isolation is enforced at the dependency-injection layer, not ad-hoc per endpoint +- Dead code (Pillow overlays, STL workflow, Flamenco shims, threejs renderer) is deleted +- The auth hierarchy supports GlobalAdmin > TenantAdmin > ProjectManager > Client +- Workers scale dynamically without service restarts +- Notifications are batched summaries, not per-render noise + +--- + +## Architecture Overview + +### Current Architecture + +``` +┌─────────────┐ HTTP ┌──────────────────────────────────────────┐ +│ Frontend │ ──────────> │ backend:8888 (FastAPI) │ +│ React/Vite │ │ ├─ domains/auth │ +│ :5173 │ <─ WS ──── │ ├─ domains/orders │ +└─────────────┘ │ ├─ domains/products │ + │ ├─ domains/rendering │ + │ ├─ domains/tenants │ + │ └─ api/routers/ (compat shims) │ + └──────────┬───────────────────────────────┘ + │ Celery tasks via Redis broker + ┌─────────────────┼──────────────────┐ + │ │ │ + ┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐ + │ worker │ │render-worker│ │ beat │ + │ step_proc │ │thumbnail_ │ │ scheduler │ + │ ai_valid │ │ rendering │ └─────────────┘ + │ concurr=8 │ │ concurr=1 │ + └─────────────┘ └──────▼──────┘ + │ subprocess + ┌──────▼──────┐ + │ blender │ + │ /opt/blend │ + └─────────────┘ + ┌──────────────┐ ┌──────────┐ ┌──────────┐ + │ PostgreSQL │ │ Redis │ │ MinIO │ + │ :5432 │ │ :6379 │ │ :9000 │ + └──────────────┘ └──────────┘ └──────────┘ +``` + +### Target Architecture (Post-Refactor) + +``` +┌─────────────────────────────────────────────────────────┐ +│ Frontend React/Vite :5173 │ +│ ├─ WorkflowEditor (ReactFlow) — visual pipeline │ +│ ├─ MediaBrowser — server-side filtered + virtual scroll│ +│ ├─ NotificationCenter — batched summaries only │ +│ └─ Admin — tooltips on every setting │ +└────────────────────┬────────────────────────────────────┘ + │ HTTP + WebSocket +┌────────────────────▼────────────────────────────────────┐ +│ backend:8888 (FastAPI) │ +│ middleware: TenantContextMiddleware (injects RLS) │ +│ ├─ domains/auth (GlobalAdmin|TenantAdmin|PM|Client)│ +│ ├─ domains/pipeline (process step registry + dispatch) │ +│ ├─ domains/rendering (render job documents, workflows) │ +│ ├─ domains/products (CAD files, media assets) │ +│ ├─ domains/orders (order state machine) │ +│ ├─ domains/tenants (tenant management) │ +│ └─ domains/billing (pricing, invoices) │ +└────────────────────┬────────────────────────────────────┘ + │ Celery canvas / chain / group + ┌───────────────┼───────────────┐ + │ │ │ +┌────▼────┐ ┌──────▼──────┐ ┌────▼────┐ +│ worker │ │render-worker│ │ beat │ +│ step_ │ │ concurr=1 │ │ sched. │ +│ process │ │ +Blender GPU│ │ recover │ +│ concr=8 │ └──────▼──────┘ │ queues │ +└─────────┘ │ └─────────┘ + subprocess (SIGTERM → SIGKILL + cleanup) + │ + ┌──────▼──────┐ + │ blender │ (GPU-first, explicit CPU-fallback log) + └─────────────┘ +``` + +--- + +## Phase 1: Foundation (Weeks 1–2) + +Critical infrastructure that blocks everything else. + +### 1.1 Structured Logging Framework + +**Current state:** +Log messages are a mix of bare `logger.info(f"...")`, `emit(order_line_id, "...")`, and +`log_task_event(task_id, "...")`. No consistent prefix, no structured fields. + +**Target:** +A `PipelineLogger` class that wraps Python's `logging` module and additionally writes +structured events to the DB (`audit_log` or a new `pipeline_events` table). + +**Design:** +```python +# backend/app/core/pipeline_logger.py +class PipelineLogger: + PREFIX_FORMAT = "[{step_name}]" + + def step_start(self, step: str, context: dict): ... + def step_progress(self, step: str, pct: int, msg: str): ... + def step_done(self, step: str, duration_s: float, result: dict): ... + def step_error(self, step: str, error: str, exc: Exception | None): ... +``` + +Every log call emits: +- Python `logging` line with `[STEP_NAME] message` +- Redis `log_task_event` for SSE streaming +- Optional DB insert into `pipeline_events(task_id, step_name, level, message, duration_s, context JSONB, created_at)` + +**Files to create:** +- `backend/app/core/pipeline_logger.py` — PipelineLogger class +- `backend/alembic/versions/048_pipeline_events.py` — new table migration + +**Files to modify:** +- All task files to replace bare `logger.info/error` with `PipelineLogger` calls +- `backend/app/core/task_logs.py` — keep Redis SSE publish, add DB write path + +### 1.2 Render Job Document + +**Current state:** +`OrderLine.render_log` is a loosely-structured JSONB dict. No schema, no state machine, +no step-level results stored. + +**Target:** +A `RenderJobDocument` JSONB schema stored in `order_lines.render_job_doc`. Acts as the +single source of truth for a render job's state machine. + +**Schema (JSONB):** +```json +{ + "version": 1, + "job_id": "", + "created_at": "ISO8601", + "state": "pending|queued|running|completed|failed|cancelled", + "celery_task_id": "uuid", + "steps": [ + { + "name": "resolve_step_path", + "status": "done", + "started_at": "ISO8601", + "completed_at": "ISO8601", + "duration_s": 0.02, + "output": {"step_path": "/app/uploads/..."} + }, + { + "name": "occ_glb_export", + "status": "done", + "duration_s": 8.4, + "output": {"glb_path": "...", "size_bytes": 204800} + }, + { + "name": "blender_render", + "status": "running", + "started_at": "ISO8601", + "gpu_type": "OPTIX", + "engine": "cycles", + "samples": 256 + } + ], + "error": null, + "result": { + "output_path": "...", + "duration_s": 34.2, + "engine_used": "cycles", + "gpu": "RTX 3090" + } +} +``` + +**Migration:** +- `backend/alembic/versions/049_render_job_document.py` — add `render_job_doc JSONB` to `order_lines`; keep `render_log` for backward compat (deprecate, remove in Phase 3) + +**Files to create:** +- `backend/app/domains/rendering/job_document.py` — `RenderJobDocument` Pydantic model + helpers (`update_step`, `set_state`, `append_error`) + +### 1.3 Tenant Context Middleware + +**Current state:** +`set_tenant_context()` must be called manually in each endpoint. Celery tasks bypass RLS +entirely (they use sync engines without `SET LOCAL app.current_tenant_id`). + +**Problem:** +Migration 036 enables RLS, but `build_tenant_db_dep()` in `database.py` actually yields +`db` without setting the tenant context (line 92: `yield db # context-setting happens +via set_tenant_context when needed`). This means most endpoints are silently bypassing RLS. + +**Target:** +A FastAPI middleware `TenantContextMiddleware` that automatically sets RLS context for +every request based on the JWT `tenant_id` claim. + +```python +# backend/app/core/middleware.py +class TenantContextMiddleware(BaseHTTPMiddleware): + async def dispatch(self, request: Request, call_next): + # Extract JWT, decode tenant_id + # Store in request.state.tenant_id + # After DB session is acquired, SET LOCAL app.current_tenant_id + ... +``` + +**JWT changes:** +`create_access_token()` must embed `tenant_id` in claims: +```python +payload = {"sub": user_id, "role": role, "tenant_id": str(tenant_id), "exp": expires} +``` + +**Celery tasks:** +All sync DB sessions in Celery tasks must receive `tenant_id` as a task argument and +execute `session.execute(text("SET LOCAL app.current_tenant_id = :tid"), {"tid": tenant_id})` +immediately after session creation. Add a `_set_tenant(session, tenant_id)` helper in +`backend/app/core/db_utils.py`. + +**Files to create:** +- `backend/app/core/middleware.py` — TenantContextMiddleware +- `backend/app/core/db_utils.py` — `_set_tenant(session, tenant_id)` + +**Files to modify:** +- `backend/app/main.py` — add middleware +- `backend/app/utils/auth.py` — embed tenant_id in JWT +- All Celery task functions — accept `tenant_id: str | None` parameter, call `_set_tenant` + +### 1.4 Process Step Registry + +**Current state:** +Pipeline steps are implicit — scattered across `step_tasks.py`, `rendering/tasks.py`, +`step_processor.py`, `render_blender.py`. No central definition. + +**Target:** +A `ProcessStep` enum and registry that all tasks reference by name. + +```python +# backend/app/domains/pipeline/steps.py +class ProcessStep(str, enum.Enum): + UPLOAD_STEP = "upload_step" + PARSE_EXCEL = "parse_excel" + EXTRACT_METADATA = "extract_metadata" + OCC_GLB_EXPORT = "occ_glb_export" + RENDER_THUMBNAIL = "render_thumbnail" + RENDER_STILL = "render_still" + RENDER_TURNTABLE = "render_turntable" + EXPORT_GLB = "export_glb" + EXPORT_BLEND = "export_blend" + DELIVER = "deliver" +``` + +Each step maps to exactly one Celery task and one workflow node type. This enum becomes +the contract between the visual workflow editor and the task executor. + +--- + +## Phase 2: Pipeline Modularity (Weeks 3–4) + +Break up `step_tasks.py` (1,170 lines). One file = one pipeline stage. + +### 2.1 Decompose step_tasks.py + +**Current functions and their new homes:** + +| Current location | Function | Target file | +|---|---|---| +| `step_tasks.py` | `process_step_file` | `domains/pipeline/tasks/extract_metadata.py` | +| `step_tasks.py` | `render_step_thumbnail` | `domains/pipeline/tasks/render_thumbnail.py` | +| `step_tasks.py` | `generate_gltf_geometry_task` | `domains/pipeline/tasks/export_glb_geometry.py` | +| `step_tasks.py` | `generate_gltf_production_task` | `domains/pipeline/tasks/export_glb_production.py` | +| `step_tasks.py` | `regenerate_thumbnail` | `domains/pipeline/tasks/render_thumbnail.py` | +| `step_tasks.py` | `dispatch_order_line_render` | `domains/pipeline/tasks/dispatch.py` | +| `step_tasks.py` | `render_order_line_task` | **DELETE** (duplicate of `domains/rendering/tasks.render_order_line_still_task`) | +| `step_tasks.py` | `reextract_cad_metadata` | `domains/pipeline/tasks/extract_metadata.py` | +| `step_tasks.py` | `_auto_populate_materials_for_cad` | `domains/pipeline/tasks/auto_materials.py` | +| `step_tasks.py` | `_bbox_from_glb`, `_bbox_from_step_cadquery` | `domains/pipeline/tasks/bbox.py` | +| `rendering/tasks.py` | `render_order_line_still_task` | `domains/rendering/tasks/render_still.py` | +| `rendering/tasks.py` | `render_turntable_task` | `domains/rendering/tasks/render_turntable.py` | +| `rendering/tasks.py` | `export_gltf_for_order_line_task` | `domains/pipeline/tasks/export_glb_geometry.py` | +| `rendering/tasks.py` | `export_blend_for_order_line_task` | `domains/rendering/tasks/export_blend.py` | +| `rendering/tasks.py` | `publish_asset` | `domains/media/tasks.py` | + +**`step_tasks.py` becomes a compatibility shim** (import-only, deprecated) until all +callers are updated. Remove it in Phase 3. + +### 2.2 Render Job Document Integration + +Every Celery task in the new structure: +1. Reads/creates `RenderJobDocument` at task start +2. Updates the relevant step via `job_doc.update_step(step_name, status="running")` +3. On completion: `job_doc.update_step(step_name, status="done", duration_s=elapsed)` +4. On failure: `job_doc.set_state("failed")` + `job_doc.append_error(...)` +5. Writes document back to `order_lines.render_job_doc` + +### 2.3 Render Job Cancellation (Proper) + +**Current problem:** +`celery_app.control.revoke("render-{line_id}", terminate=True)` — this ID is synthetic +and does not match the actual Celery task ID, so revoke is a no-op. The Blender process +continues running. + +**Solution:** +1. Store the actual Celery task ID in `render_job_doc.celery_task_id` when the task starts +2. Cancel endpoint reads `render_job_doc.celery_task_id` and revokes with that real ID +3. The render subprocess uses `start_new_session=True` (already done in `render_blender.py`) + and stores `proc.pid` in the job document +4. On SIGTERM, the Celery task's signal handler calls `os.killpg(pgid, SIGTERM)`, waits 10s, + then `os.killpg(pgid, SIGKILL)` +5. Clean up: remove partial output file, remove `_frames_*` temp directory +6. Update `render_job_doc.state = "cancelled"`, clear `OrderLine.render_status = "cancelled"` + +**Files to modify:** +- `backend/app/api/routers/orders.py` — read celery_task_id from job doc, not synthetic ID +- `backend/app/domains/rendering/tasks/render_still.py` — store task ID + PID in job doc, + register SIGTERM handler +- `backend/app/domains/rendering/tasks/render_turntable.py` — same + +### 2.4 GPU-Primary Rendering + +**Current state:** +`cycles_device` defaults to "auto". When GPU is unavailable, Blender silently falls back +to CPU with no log message. The `_activate_gpu()` function in `blender_render.py` already +probes for GPU but the result is not reflected in the render job document. + +**Target:** +- `cycles_device` default changes from "auto" to "gpu" in system settings +- `_activate_gpu()` result is logged with `[GPU_PROBE]` prefix: + - Success: `[GPU_PROBE] RTX 3090 activated (OPTIX) — using GPU render` + - Failure: `[GPU_PROBE] No GPU found, falling back to CPU — set cycles_device=cpu to suppress this warning` +- GPU type and fallback reason are written to `render_job_doc.result.gpu_info` +- Admin UI shows GPU status on the Settings page (already partially exists via worker activity) + +**Files to modify:** +- `render-worker/scripts/blender_render.py` — enhance `_activate_gpu()` logging +- `backend/app/api/routers/admin.py` — change default `cycles_device` to "gpu" +- `backend/app/domains/rendering/job_document.py` — add `gpu_info` field to result + +### 2.5 Blender Script Modularity + +**Current state:** +`render-worker/scripts/blender_render.py` is 853 lines with everything inline. + +**Target structure:** +``` +render-worker/scripts/ +├── blender_render.py — entry point, arg parsing, top-level flow +├── _blender_gpu.py — GPU probe + activation +├── _blender_import.py — GLB import, rotation, smooth shading +├── _blender_materials.py — material library application + fallback +├── _blender_camera.py — auto camera from bbox, clip planes +├── _blender_scene.py — scene setup (Mode A vs Mode B) +└── _blender_post.py — (currently Pillow overlay — DELETE THIS FILE) +``` + +`blender_render.py` imports from these sub-modules. Blender Python's `sys.path` is updated +at the top of the script to include the scripts directory. + +--- + +## Phase 3: Code Deletion (Weeks 3–4, parallel with Phase 2) + +### 3.1 Remove Pillow Overlay Code + +**Location:** `render-worker/scripts/blender_render.py` lines 798–851 + +**Why it's dead:** `transparent_bg=True` is always passed for production renders. The +`else:` branch at line 802 can never execute in production. The green Schaeffler bar is +now part of the `.blend` template, not post-processing. + +**Delete:** +- Lines 798–851 in `blender_render.py` (the entire `if transparent_bg: ... else: try PIL...` block) +- Remove Pillow from render-worker dependencies in `render-worker/Dockerfile` +- Remove the line `- Schaeffler green top bar + model name label via Pillow post-processing.` + from the script docstring + +### 3.2 Remove STL Workflow Remnants + +**What to delete:** + +| Location | What to remove | +|---|---| +| `backend/app/api/routers/admin.py` | `VALID_STL_QUALITIES`, `stl_quality` from `SettingsOut`, `SettingsUpdate`, and all `SETTINGS_DEFAULTS` | +| `backend/app/api/routers/admin.py` | `generate-missing-stls` endpoint (if still present) | +| `backend/app/api/routers/cad.py` | `generate-stl/{quality}` endpoint | +| `backend/app/services/render_blender.py` | `stl_quality` parameter from `render_still()` and `render_turntable_to_file()` | +| `backend/app/services/render_blender.py` | Key `stl_duration_s` → rename to `glb_duration_s` (remove `# key kept for backward compat` comment) | +| `backend/app/tasks/step_tasks.py` | `generate_stl_cache` task (check if it still exists) | +| `render-worker/scripts/` | Any `_import_stl`, `_convert_stl`, `_scale_mm_to_m` functions | +| `backend/app/api/routers/analytics.py` | `avg_stl_s` field in analytics response | +| All render log dicts | Replace `stl_size_bytes: 0` and `stl_duration_s:` with `glb_*` equivalents | +| DB migration | `backend/alembic/versions/050_cleanup_stl_settings.py` — `DELETE FROM system_settings WHERE key = 'stl_quality'` | + +**Files to delete entirely:** +- `blender-renderer/` directory (already removed from docker-compose.yml, remove directory) +- `threejs-renderer/` directory (migration 033 already removed it from services) +- `flamenco/` directory (migration 032 removed Flamenco; verify nothing still imports from it) + +**Verify before deleting:** +```bash +grep -rn "blender-renderer\|threejs-renderer\|flamenco" backend/ frontend/ --include="*.py" --include="*.ts" --include="*.tsx" +``` + +### 3.3 Remove Compat Shims + +After all callers are migrated, delete these shim files: +- `backend/app/models/user.py` (shim → `domains/auth/models.py`) +- `backend/app/models/cad_file.py` (shim → `domains/products/models.py`) +- `backend/app/services/render_dispatcher.py` (shim, 10 lines) +- `backend/app/services/material_service.py` (shim → `domains/materials/service.py`) +- `backend/app/services/render_blender.py` (move fully into `domains/rendering/`) +- `backend/app/models/` directory → all models are already in `domains/*/models.py` + +### 3.4 Remove Duplicate render_order_line_task + +`step_tasks.render_order_line_task` (lines 705–1050 of `step_tasks.py`) duplicates +`rendering/tasks.render_order_line_still_task`. The step_tasks version has more +baggage (compat imports, `emit()` calls, stl_quality references). Delete the step_tasks +version, migrate all queue routes to the `rendering/tasks` version. + +**Migration:** +- `celery_app.py` task routes: route `app.tasks.step_tasks.*` to empty list, removing + step_tasks from the routing table after all tasks are migrated +- Update `CLAUDE.md` to reflect new task locations + +--- + +## Phase 4: Tenant & Auth (Weeks 5–6) + +### 4.1 Role Hierarchy + +**Current roles:** `admin | project_manager | client` + +**Target roles:** +```python +class UserRole(str, enum.Enum): + global_admin = "global_admin" # platform operator, bypass RLS, all tenants + tenant_admin = "tenant_admin" # per-tenant admin, full control within tenant + project_manager = "project_manager" # order/render management within tenant + client = "client" # read own orders, create draft orders +``` + +**Permission matrix:** + +| Permission | GlobalAdmin | TenantAdmin | ProjectManager | Client | +|---|---|---|---|---| +| Manage tenants | YES | no | no | no | +| Manage users (all tenants) | YES | no | no | no | +| Manage users (own tenant) | YES | YES | no | no | +| All system settings | YES | YES | no | no | +| Trigger renders | YES | YES | YES | no | +| View all orders in tenant | YES | YES | YES | no | +| Create/view own orders | YES | YES | YES | YES | +| Reject orders | YES | YES | YES | no | +| Delete renders | YES | YES | YES | no | +| View analytics | YES | YES | YES | no | + +**DB migration:** +- `backend/alembic/versions/051_role_hierarchy.py` — rename `admin` → `global_admin`, + add `tenant_admin` to the `userrole` enum; backfill existing `admin` users to `global_admin` + +**Auth utilities:** +- `require_global_admin()` — replaces `require_admin()` +- `require_tenant_admin_or_above()` — TenantAdmin or GlobalAdmin +- `require_pm_or_above()` — PM, TenantAdmin, GlobalAdmin + +### 4.2 Tenant Isolation — Consistency Audit + +**The problem:** +`database.py:build_tenant_db_dep()` yields the session without setting RLS context +(line 92 comments say "context-setting happens via set_tenant_context when needed"). +This means every endpoint that uses `Depends(get_db)` bypasses RLS. + +**Fix — Middleware approach (preferred):** + +```python +# backend/app/core/middleware.py +class TenantContextMiddleware(BaseHTTPMiddleware): + """Set PostgreSQL RLS context on every request from JWT claims.""" + + BYPASS_PATHS = {"/health", "/api/auth/login", "/api/auth/refresh"} + + async def dispatch(self, request: Request, call_next): + if request.url.path in self.BYPASS_PATHS: + return await call_next(request) + + token = self._extract_token(request) + if token: + payload = decode_token_safe(token) + tenant_id = payload.get("tenant_id") + role = payload.get("role") + request.state.tenant_id = tenant_id + request.state.role = role + + response = await call_next(request) + return response +``` + +The `get_db` dependency is modified to read `tenant_id` from `request.state`: + +```python +async def get_db(request: Request) -> AsyncGenerator[AsyncSession, None]: + async with AsyncSessionLocal() as session: + tenant_id = getattr(request.state, "tenant_id", None) + role = getattr(request.state, "role", None) + if tenant_id: + if role == "global_admin": + await session.execute(text("SET LOCAL app.current_tenant_id = 'bypass'")) + else: + await session.execute( + text("SET LOCAL app.current_tenant_id = :tid"), + {"tid": str(tenant_id)}, + ) + yield session +``` + +### 4.3 Tenant Isolation Strategy — Shared vs. Dedicated Containers + +**Decision: Shared containers with DB-level isolation (current model)** + +**Analysis:** + +| Factor | Shared containers | Dedicated containers per tenant | +|---|---|---| +| Cost | Low (6 containers total) | High (6 containers × N tenants) | +| Complexity | Low | Very high (orchestration, networking) | +| Data isolation | DB-level (RLS) | Full OS-level | +| GPU sharing | Single GPU shared | Dedicated GPU per tenant (expensive) | +| Blender jobs | Queue + concurrency control | Per-tenant render queue | +| Failure blast radius | All tenants affected by worker crash | Isolated per tenant | +| Scaling | Celery autoscale | Docker Swarm / Kubernetes HPA | +| Migration effort | Weeks (Phase 3-4) | Months (new orchestration layer) | + +**Recommendation:** Maintain shared containers with DB-level RLS isolation. Dedicated +containers are only justified if tenants have strict contractual data isolation requirements +(e.g., GDPR-mandated separate processing). For the current internal use case (Schaeffler +internal teams), RLS + tenant_id partitioning is sufficient. + +**If dedicated containers are required in future:** +- Docker Compose override file per tenant (`docker-compose.{tenant-slug}.yml`) +- Each tenant gets own PostgreSQL schema (not separate DB) with schema-based routing +- Shared MinIO with per-tenant bucket policies +- Separate Redis database (0-15) per tenant (max 16 tenants) +- Celery routing: per-tenant queue prefix `{tenant_slug}.thumbnail_rendering` + +### 4.4 Per-Tenant Feature Flags + +Add a `tenant_config` JSONB column to the `tenants` table: + +```python +# backend/alembic/versions/052_tenant_feature_flags.py +tenant_config JSONB DEFAULT '{ + "max_concurrent_renders": 3, + "render_engines_allowed": ["cycles"], + "max_order_size": 500, + "fallback_material": "SCHAEFFLER_059999_FailedMaterial", + "notifications_enabled": true, + "invoice_prefix": "INV" +}' +``` + +Feature flags checked at render dispatch time: +- `max_concurrent_renders` — enforced in Celery queue routing +- `render_engines_allowed` — validated in OutputType creation +- `fallback_material` — passed to Blender scripts (see §6.4) + +--- + +## Phase 5: Material & Rendering Improvements (Weeks 5–6) + +### 5.1 Fallback Material — SCHAEFFLER_059999_FailedMaterial + +**Current state:** +`step_processor.py:MATERIAL_PALETTE` assigns rainbow colors from a palette when material +assignment fails or no material is specified. `blender_render.py` has its own +`PALETTE_LINEAR` for the same purpose. + +**Target:** +When material resolution fails (no alias, no exact match, material library link broken), +assign `SCHAEFFLER_059999_FailedMaterial` (magenta) so failed assignments are immediately +visible in renders. + +**Implementation:** +- `domains/materials/service.py:resolve_material_map()` — instead of pass-through, return + `SCHAEFFLER_059999_FailedMaterial` for unresolved parts (configurable per-tenant via + `tenant_config.fallback_material`) +- `render-worker/scripts/blender_render.py` — when material library is provided but a + part name does not match any library material, assign `SCHAEFFLER_059999_FailedMaterial` + rather than palette color +- `render-worker/scripts/_blender_materials.py` — a new sub-module for material logic + with explicit logging: `[MATERIAL] part 'Outer_Ring' → 'SCHAEFFLER_010101_Steel-Bare' (alias match)` + and `[MATERIAL] part 'Unknown_Part' → 'SCHAEFFLER_059999_FailedMaterial' (no match)` +- `step_processor.py` — remove `MATERIAL_PALETTE` and `_material_to_color()`; the palette + is no longer used once fallback material is in place. Part colors for geometry GLB viewer + should come from the material library color map, not a rainbow palette. + +### 5.2 Remove EEVEE Fallback + +**Current state:** +`render_blender.py` has an EEVEE-to-Cycles fallback: +```python +if returncode > 0 and engine == "eevee": + logger.warning("EEVEE failed (exit %d) — retrying with Cycles", returncode) + returncode, stdout_lines2, stderr_lines2 = _run("cycles") + engine_used = "cycles (eevee fallback)" +``` + +This hides failures and makes debugging harder. Per the Blender 5.0.1 requirement, EEVEE +Next should work reliably. If it fails, it should be a hard failure, not a silent retry. + +**Target:** Remove the EEVEE-to-Cycles fallback. If EEVEE fails, the task fails with a +clear error. Set `EEVEE_FALLBACK_ENABLED=false` system setting (default false from now on). + +### 5.3 Remove Blender Version Check + +**Current state:** +`backend/app/services/render_blender.py` defines: +```python +MIN_BLENDER_VERSION = (5, 0, 1) +``` + +This constant is defined but the check that uses it has been removed. Search for any +remaining version-comparison code in `blender_render.py` and render scripts. + +**Target:** +- Remove `MIN_BLENDER_VERSION = (5, 0, 1)` from `render_blender.py` +- Remove any `bpy.app.version` comparisons in render scripts +- Blender 5.0.1+ is assumed; older versions are not supported + +--- + +## Phase 6: Notification Center Refactor (Week 7) + +### 6.1 Current Problems + +Per-render notifications (render.completed, render.failed) fire for every single +`OrderLine`. An order with 200 lines generates 200 notifications. This is too noisy. + +### 6.2 Notification Architecture + +**Three channels:** + +1. **Activity Feed** (`/api/activity`) — per-action events: every render start/complete, + every order state change, every upload. Low-level, not shown in bell dropdown. Available + in a dedicated `/activity` page for debugging. + +2. **Notification Center** (`/api/notifications`) — batch summaries only: + - "Order #ORD-2026-042 rendering complete: 47/50 succeeded, 3 failed" + - "Excel import failed: 12 products skipped (see import log)" + - "Worker recovery: 3 stalled renders requeued after 120min timeout" + +3. **System Alerts** (admin only) — infrastructure issues: GPU probe failed, Blender + binary not found, Redis connection lost. + +**Notification trigger rules:** +- `render.completed` per-line → suppress; emit batch when ALL lines in order reach terminal state +- `render.failed` per-line → suppress; emit batch on order completion +- `excel.imported` → one notification per upload with summary counts +- `order.submitted` → one notification (always keep) +- System alerts → always emit individually + +**DB changes:** +- `audit_log` — add `channel VARCHAR(20)` column: `activity | notification | alert` +- `notification_configs` — extend `event_type` to include new batch event types +- New beat task: `batch_render_notifications` — runs every 60s, checks for orders where + all lines are terminal but no batch notification has been emitted; emits the summary + +### 6.3 Per-User Notification Preferences + +Current `notification_configs` table has `event_type` + `channel` + `enabled`. Extend: +- Add `frequency: str` column — `immediate | hourly | daily | never` +- Frequency is respected by the batch notification beat task + +**Files to modify:** +- `backend/app/domains/notifications/models.py` — add `channel`, `frequency` columns +- `backend/app/services/notification_service.py` — add `emit_batch_notification()` function +- `backend/app/tasks/beat_tasks.py` — add `batch_render_notifications` schedule +- `frontend/src/pages/NotificationSettings.tsx` — add frequency selector per event type +- `frontend/src/pages/Notifications.tsx` — separate tabs for Activity | Notifications | Alerts + +--- + +## Phase 7: UI/UX Improvements (Week 7–8) + +### 7.1 Tooltip / Help Text System + +Every setting, parameter, and action in the Admin UI and order wizard needs a tooltip +explaining what it does and what it affects in the pipeline. + +**Architecture:** + +```typescript +// frontend/src/help/helpTexts.ts +export const HELP_TEXTS: Record = { + "setting.blender_cycles_samples": { + title: "Cycles Samples", + body: "Number of render samples per pixel. Higher = better quality, longer render time. 256 is a good balance for product shots. 64 is fast for previews.", + affects: ["render quality", "render time"], + unit: "samples", + range: [1, 4096], + recommendation: "256 for production, 64 for preview", + }, + "setting.gltf_preview_linear_deflection": { + title: "3D Viewer Mesh Quality", + body: "Controls tessellation precision for the 3D browser viewer. Lower values = finer mesh, larger file. 0.1mm is a good default for medium-complexity parts.", + affects: ["3D viewer file size", "viewer load time"], + unit: "mm", + }, + "action.regenerate_thumbnails": { + title: "Regenerate All Thumbnails", + body: "Re-renders thumbnails for all STEP files using current settings. This queues all files on the thumbnail_rendering worker. Expected time: N × 30s. Only needed after changing renderer settings.", + warning: "This will queue a large number of tasks. Only run during off-peak hours.", + }, + // ... all settings +} +``` + +```typescript +// frontend/src/components/HelpTooltip.tsx +interface HelpTooltipProps { + helpKey: string + position?: "top" | "right" | "bottom" | "left" +} + +export function HelpTooltip({ helpKey, position = "right" }: HelpTooltipProps) { + const help = HELP_TEXTS[helpKey] + if (!help) return null + return ( + } position={position}> + + + ) +} +``` + +**Where to add tooltips (minimum required):** +- All `system_settings` keys in Admin > Settings +- All `OutputType.render_settings` fields in the OutputType editor +- All `RenderTemplate` fields in the template editor +- All actions in Admin > Settings (regenerate thumbnails, process unprocessed, etc.) +- All fields in the Order Wizard with non-obvious meaning + +### 7.2 Media Browser Refactor + +**Current state:** +`frontend/src/pages/MediaBrowser.tsx` — exists but no details on current filter capabilities. + +**Target:** +Server-side filtered media browser with: +- Filters: `lagertyp | category_key | render_status | asset_type | tenant_id (admin)` +- Text search on product name, pim_id +- Server-side pagination (50 per page) +- Virtual scroll for large catalogs (react-virtual or TanStack Virtual) +- Batch download selected assets + +**API changes:** +``` +GET /api/media/assets? + asset_type=still& + category_key=TRB& + lagertyp=Axial-Zylinderrollenlager& + render_status=completed& + page=1& + page_size=50& + q=81113 +``` + +**DB indexes required:** +```sql +-- backend/alembic/versions/053_media_browser_indexes.py +CREATE INDEX ix_media_assets_asset_type_created ON media_assets(asset_type, created_at DESC); +CREATE INDEX ix_products_category_lagertyp ON products(category_key, lagertyp); +CREATE INDEX ix_products_name_gin ON products USING GIN(to_tsvector('simple', COALESCE(name, '') || ' ' || COALESCE(pim_id, ''))); +``` + +**Files to modify:** +- `backend/app/domains/media/router.py` — add `GET /assets` with filter params +- `backend/app/domains/media/schemas.py` — add `MediaAssetFilter` Pydantic model +- `frontend/src/pages/MediaBrowser.tsx` — complete rewrite with virtual scroll +- `frontend/src/api/media.ts` — add `getMediaAssets(filters)` function + +### 7.3 Workflow Editor — Pipeline Step Nodes + +**Current state:** +`WorkflowEditor.tsx` has 5 node types (Upload, Parse, Render, Export, Deliver) but they +do not map to actual Celery tasks. `WorkflowDefinition.config` is a free-form JSONB blob +with no schema validation. + +**Target:** +Node types correspond 1:1 to `ProcessStep` enum values. The workflow editor saves a +validated workflow config that the `dispatch_workflow()` function can execute. + +**WorkflowDefinition config schema:** +```json +{ + "version": 1, + "nodes": [ + {"id": "n1", "step": "extract_metadata", "params": {}}, + {"id": "n2", "step": "render_thumbnail", "params": {"engine": "cycles", "samples": 64}}, + {"id": "n3", "step": "render_still", "params": {"width": 2048, "height": 2048}}, + {"id": "n4", "step": "export_glb", "params": {"quality": "high"}}, + {"id": "n5", "step": "deliver", "params": {}} + ], + "edges": [ + {"from": "n1", "to": "n2"}, + {"from": "n2", "to": "n3"}, + {"from": "n3", "to": "n4"}, + {"from": "n4", "to": "n5"} + ] +} +``` + +Backend validation: `workflow_router.py` validates that all `step` values are in +`ProcessStep` enum before saving. + +Frontend: `WorkflowEditor.tsx` builds available node types from a `GET /api/workflows/steps` +endpoint that returns all `ProcessStep` entries with their parameter schemas. + +### 7.4 Kanban Rejection Flow + +**Current state:** +`OrderStatus.rejected` exists but the rejection flow is undefined. The admin panel has no +rejection UI. `rejected_at` column exists but there is no rejection reason field. + +**Target flow:** +1. **Who can reject:** `ProjectManager`, `TenantAdmin`, `GlobalAdmin` +2. **Trigger:** `POST /api/orders/{id}/reject` with body `{"reason": "...", "notify_client": true}` +3. **What happens:** + - Order status → `rejected`, `rejected_at` = now + - `rejection_reason` stored (new `Text` column on `Order`) + - All pending/processing renders are cancelled (same as cancel-renders endpoint) + - Notification emitted to order creator: "Your order #ORD-2026-042 was rejected. Reason: ..." + - Audit log entry created +4. **Client sees:** Order status badge changes to `REJECTED` with reason visible +5. **Re-submission:** Client can `POST /api/orders/{id}/resubmit` which clears rejection, + resets to `draft`, allowing edits before re-submitting. Re-submit creates a new audit log + entry and emits notification to PMs. + +**DB migration:** +- `backend/alembic/versions/054_order_rejection.py` — add `rejection_reason TEXT` to `orders` + +--- + +## Phase 8: Scalable Workers (Week 8) + +### 8.1 Current Concurrency Controls + +- `worker` (step_processing): `CELERY_WORKER_CONCURRENCY` env var, default 8 +- `render-worker` (thumbnail_rendering): hardcoded 1 (Blender serial access) +- Both require Docker service restart to change concurrency + +### 8.2 Dynamic Worker Scaling + +**Short term (no Kubernetes):** +Use Celery's built-in `autoscale` option: +```yaml +# docker-compose.yml +render-worker: + command: celery -A app.tasks.celery_app worker + --loglevel=info + -Q thumbnail_rendering + --autoscale=1,1 # min=1, max=1 (single Blender concurrency) + --concurrency=1 +``` + +For `worker`: +```yaml +worker: + command: celery -A app.tasks.celery_app worker + --loglevel=info + -Q step_processing,ai_validation + --autoscale=${MAX_CONCURRENCY:-8},${MIN_CONCURRENCY:-2} +``` + +**Per-queue concurrency via DB:** +Add a `worker_configs` table: +```sql +CREATE TABLE worker_configs ( + queue_name VARCHAR(100) PRIMARY KEY, + max_concurrency INT NOT NULL DEFAULT 8, + min_concurrency INT NOT NULL DEFAULT 2, + updated_at TIMESTAMP NOT NULL DEFAULT now() +); +``` + +A beat task `apply_worker_concurrency` runs every 5 minutes and uses Celery control +commands to adjust pool size: +```python +celery_app.control.broadcast("pool_shrink", arguments={"n": 2}, destination=["worker@host"]) +celery_app.control.broadcast("pool_grow", arguments={"n": 4}, destination=["worker@host"]) +``` + +**Long term (Kubernetes):** +Workers run as Kubernetes Deployments with HPA on `celery_queue_length` metric (exposed via +Flower or a custom `/metrics` endpoint for Prometheus). Render-workers use GPU node pools +with `nvidia.com/gpu: 1` resource requests. + +### 8.3 Worker Health Recovery + +**Current state:** +`beat_tasks.recover_stuck_cad_files` runs every 5 minutes and handles stuck processing state. + +**Extend to:** +- Detect `render_status = 'processing'` with `render_started_at` > `render_stall_timeout_minutes` ago +- SIGTERM any still-running Blender PID (stored in `render_job_doc.celery_task_id`) +- Reset `render_status` to `failed`, update `render_job_doc.state = 'failed'` +- Emit system alert notification (admin channel) +- Log with `[WORKER_RECOVERY] Stalled render for order_line {id} terminated after {N}min` + +--- + +## Detailed Task Breakdown by Area + +### A. step_tasks.py Decomposition + +**Current problems:** +- 1,170 lines, 8 distinct Celery tasks, many private helpers, multiple inline DB session + creation patterns +- Imports scattered: some at module level, some inside functions (Celery pattern) +- `render_order_line_task` (lines 705–1050+) duplicates `render_order_line_still_task` + +**Migration path:** +1. Create new `domains/pipeline/tasks/` directory with one file per step +2. Each new task calls `PipelineLogger` instead of bare `logger.info` +3. Each new task writes to `render_job_doc` via `job_document.py` helpers +4. Old `step_tasks.py` becomes import-only shim: `from app.domains.pipeline.tasks.extract_metadata import process_step_file` +5. After 2-week migration period, delete `step_tasks.py` + +### B. Auth Token Claims + +**Current:** `{"sub": user_id, "role": role, "exp": expires}` — no tenant_id in token + +**Target:** `{"sub": user_id, "role": role, "tenant_id": str(tenant_id), "exp": expires}` + +**Impact:** All existing tokens become invalid after deploy. Users must re-login. +**Mitigation:** Rotate `JWT_SECRET_KEY` as part of the deployment to force re-login. + +### C. Celery Task Routing Update + +After Phase 2 decomposition, update `celery_app.conf.update(task_routes={...})`: +```python +task_routes = { + "app.domains.pipeline.tasks.*": {"queue": "step_processing"}, + "app.domains.rendering.tasks.*": {"queue": "thumbnail_rendering"}, + "app.domains.media.tasks.*": {"queue": "step_processing"}, + "app.tasks.ai_tasks.*": {"queue": "ai_validation"}, + "app.tasks.beat_tasks.*": {"queue": "step_processing"}, +} +``` + +### D. Frontend API Client Consistency + +All `frontend/src/api/*.ts` files should: +- Use the axios client from `api/client.ts` (which injects `X-Tenant-ID` header) +- Export typed interfaces for all response shapes +- Use `useQuery` / `useMutation` from TanStack Query, not bare `axios.get` in components + +**Audit needed:** Check each `api/*.ts` file to confirm `X-Tenant-ID` header is sent +(it is wired in the axios interceptor per commit 5da90b5, but verify all files use +the configured client, not `axios.create()` directly). + +--- + +## Architectural Decisions (ADRs) + +### ADR-001: Shared containers vs. per-tenant containers +**Decision:** Shared containers with PostgreSQL RLS +**Rationale:** Cost and complexity savings. RLS provides adequate isolation for internal use. +**Consequences:** Must ensure RLS is applied consistently (Phase 1.3). Blender sessions are +shared; GPU contention is managed via Celery queue depth, not isolation. + +### ADR-002: Render Job Document as JSONB +**Decision:** Store render job state machine as JSONB in `order_lines.render_job_doc` +**Rationale:** Avoids additional `workflow_node_results` table queries for debugging; +JSONB is flexible for schema evolution; indexed for state-based queries. +**Alternatives considered:** Separate `render_job_steps` table — rejected (too many joins +for the common "show me render status" query). + +### ADR-003: No per-render notifications +**Decision:** Suppress individual render.completed notifications; emit batch at order completion +**Rationale:** An order with 200 lines generates 200 notifications under the current model. +Batch summaries at order completion are actionable; per-render events are noise. +**Consequences:** Activity feed still records all events for debugging. + +### ADR-004: GPU-first rendering +**Decision:** Default `cycles_device = "gpu"`, explicit log on CPU fallback +**Rationale:** The render-worker has GPU reservation in docker-compose.yml. CPU fallback +should be visible and logged, not silent. +**Consequences:** Renders on machines without GPU will always log a CPU fallback warning. + +### ADR-005: Fallback material over palette +**Decision:** Replace `MATERIAL_PALETTE` rainbow fallback with `SCHAEFFLER_059999_FailedMaterial` +**Rationale:** Failed material assignments should be immediately visible (magenta) rather +than disguised as intentional palette colors. +**Consequences:** Parts with missing material mapping will render magenta in both +thumbnail and production renders. This is a feature, not a bug. + +### ADR-006: Blender 5.0.1 minimum, no version guards +**Decision:** Remove all `bpy.app.version` checks and `MIN_BLENDER_VERSION` guards +**Rationale:** The project is Blender 5.0.1-only. Version shims add complexity without value. +**Consequences:** Running with an older Blender binary will cause cryptic errors. Document +the minimum version requirement clearly in the Dockerfile and README. + +--- + +## What Gets Deleted + +### Python files to delete entirely: +- `backend/app/models/user.py` — compat shim +- `backend/app/models/cad_file.py` — compat shim +- `backend/app/models/order.py` — compat shim (if exists) +- `backend/app/models/order_item.py` — compat shim +- `backend/app/models/order_line.py` — compat shim +- `backend/app/models/material.py` — compat shim +- `backend/app/models/material_alias.py` — compat shim +- `backend/app/models/render_template.py` — compat shim +- `backend/app/models/output_type.py` — compat shim +- `backend/app/models/system_setting.py` — compat shim +- `backend/app/models/template.py` — compat shim +- `backend/app/models/render_position.py` — compat shim +- `backend/app/services/render_dispatcher.py` — 10-line shim +- `backend/app/services/material_service.py` — 3-line shim +- `backend/app/tasks/step_tasks.py` — after Phase 2 migration complete +- `backend/app/domains/rendering/tasks.py` — split into per-step files in Phase 2 + +### Directories to delete entirely: +- `blender-renderer/` — HTTP microservice, removed from docker-compose in refactor/v2 +- `threejs-renderer/` — removed in migration 033 +- `flamenco/` — removed in migration 032 + +### Code blocks to delete (within files): +- `render-worker/scripts/blender_render.py` lines 798–851 — Pillow overlay +- `render-worker/scripts/blender_render.py` line 17 — docstring Pillow mention +- `backend/app/services/render_blender.py` line 17 — `MIN_BLENDER_VERSION = (5, 0, 1)` +- `backend/app/services/render_blender.py` lines 229–233 — EEVEE-to-Cycles fallback +- `backend/app/services/step_processor.py` lines 19–31 — `MATERIAL_PALETTE` + `_material_to_color()` +- `backend/app/api/routers/admin.py` — `VALID_STL_QUALITIES`, `stl_quality` in all schemas + +### System settings to delete (DB migration): +- `stl_quality` — GLB-only pipeline, no STL concept +- `threejs_render_size` — renderer removed +- `thumbnail_renderer` — was multi-value (pillow|blender|threejs), now always blender + +--- + +## Migration Strategy + +### Deployment Order (Zero-Downtime) + +**Step 1 — DB migrations (non-breaking):** +- Run migrations 048–054 (new columns: `render_job_doc`, `rejection_reason`, feature flags, etc.) +- New columns are nullable, no existing queries break + +**Step 2 — Backend deploy (backward compatible):** +- Deploy new backend with compat shims in place +- New endpoints and middleware active +- Old endpoints still work +- JWT tokens are extended with `tenant_id` claim (existing tokens without it still work + via fallback in middleware) + +**Step 3 — Celery worker deploy:** +- Deploy new `domains/pipeline/tasks/` structure +- `step_tasks.py` compat shim routes to new functions +- Old task names still registered via shim + +**Step 4 — Frontend deploy:** +- New WorkflowEditor with validated step types +- HelpTooltip components added +- MediaBrowser refactor with virtual scroll + +**Step 5 — Cleanup (breaking):** +- Remove compat shims +- Delete `step_tasks.py` +- Rotate `JWT_SECRET_KEY` to force re-login (tenant_id now required in claims) +- Run DB migration to clean up stl_quality and threejs settings + +### Rollback Plan +- All migrations have `downgrade()` implemented +- Compat shims mean old task names still work during migration window +- `render_log` column kept alongside `render_job_doc` until all consumers migrated + +### Testing Before Delete +Before deleting any compat shim or old code, verify: +```bash +grep -rn "" backend/ frontend/ --include="*.py" --include="*.ts" --include="*.tsx" +``` +Must return 0 results from non-shim files. + +--- + +## Open Questions + +These require product decisions before implementation: + +1. **Tenant onboarding flow** — How are new tenants created? Self-service signup, or + admin creates tenant + TenantAdmin user manually? What is the initial data setup? + +2. **Blender binary distribution** — Currently host-mounted (`/opt/blender:/opt/blender:ro`). + If multiple render-workers run on different hosts in a future cluster, how is Blender + distributed? Container image vs. network share? + +3. **MinIO vs. filesystem storage** — All media assets are stored on the local filesystem + (`/app/uploads` volume). MinIO is configured but not used for primary storage yet. Should + Phase 2 migrate assets to MinIO for horizontal scaling? + +4. **Invoice workflow** — `billing/models.py` has `Invoice` + `InvoiceLine` models and an + `invoices` table (migration 042). Is billing actually used? If not, should it be removed + to reduce complexity? + +5. **AI validation (Azure OpenAI)** — `ai_tasks.py` and `azure_ai.py` exist but Azure + credentials are optional. Is this feature actively used or can it be removed? + +6. **Email notifications** — SMTP settings exist in system_settings but email sending is + not implemented. Is this a required feature for the next phase? + +7. **Rejection re-submission UX** — When a client re-submits a rejected order, do they + create a new order or update the existing one? The current data model supports only + one status per order, not a history of submissions. + +8. **Media browser download format** — Bulk download: ZIP of individual files, or separate + download links? ZIP requires server-side assembly which adds load. + +9. **Tooltip language** — Help texts in English (per CLAUDE.md coding standards) or German + (for end-user-facing UI)? The admin UI is currently in English labels. + +10. **3D Viewer geometry quality** — The `gltf_preview_linear_deflection` default is 0.1mm. + For very small parts (sub-1mm features), this may be too coarse. Should the deflection + auto-scale based on the CAD file's bounding box dimensions?agentId: a6cf206cd46b868cb (for resuming to continue this agent's work if needed) +total_tokens: 132964 +tool_uses: 72 +duration_ms: 467361 \ No newline at end of file diff --git a/backend/alembic/versions/048_render_job_document.py b/backend/alembic/versions/048_render_job_document.py new file mode 100644 index 0000000..f074080 --- /dev/null +++ b/backend/alembic/versions/048_render_job_document.py @@ -0,0 +1,27 @@ +"""Add render_job_doc JSONB column to order_lines. + +Stores a structured RenderJobDocument (state machine, per-step results, +timing, GPU info) alongside the legacy render_log column. + +Revision ID: 048 +Revises: 047 +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + +revision = "048" +down_revision = "047" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "order_lines", + sa.Column("render_job_doc", JSONB, nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("order_lines", "render_job_doc") diff --git a/backend/app/api/routers/auth.py b/backend/app/api/routers/auth.py index 430049b..a603dfe 100644 --- a/backend/app/api/routers/auth.py +++ b/backend/app/api/routers/auth.py @@ -38,7 +38,8 @@ async def login(body: LoginRequest, db: AsyncSession = Depends(get_db)): if not user.is_active: raise HTTPException(status_code=403, detail="Account disabled") - token = create_access_token(str(user.id), user.role.value) + tenant_id = str(user.tenant_id) if user.tenant_id else None + token = create_access_token(str(user.id), user.role.value, tenant_id=tenant_id) return TokenResponse(access_token=token, user=UserOut.model_validate(user)) diff --git a/backend/app/core/db_utils.py b/backend/app/core/db_utils.py new file mode 100644 index 0000000..7908b25 --- /dev/null +++ b/backend/app/core/db_utils.py @@ -0,0 +1,71 @@ +"""Database utilities for use inside Celery tasks (sync context). + +Celery tasks bypass FastAPI middleware, so tenant RLS context must be set +manually. Use _set_tenant() immediately after creating a sync DB session. + +Usage:: + + from app.core.db_utils import set_tenant_sync, get_sync_session + + with get_sync_session() as db: + set_tenant_sync(db, tenant_id, role) + # queries here will be RLS-filtered +""" +import contextlib +from typing import Generator + +from sqlalchemy import create_engine, text +from sqlalchemy.orm import Session, sessionmaker + +from app.config import settings + + +def set_tenant_sync(session: Session, tenant_id: str | None, role: str | None = None) -> None: + """Set RLS tenant context on a *synchronous* SQLAlchemy session. + + Call this at the very start of any sync DB block inside a Celery task + when you need tenant isolation. Admins bypass RLS; all other roles get + a scoped context. If tenant_id is None the call is a no-op (global + access, i.e. no RLS enforcement). + """ + if not tenant_id: + return + if role == "admin": + session.execute(text("SET LOCAL app.current_tenant_id = 'bypass'")) + else: + session.execute( + text("SET LOCAL app.current_tenant_id = :tid"), + {"tid": tenant_id}, + ) + + +# Lazily created sync engine (reused across tasks in the same worker process) +_sync_engine = None + + +def _get_sync_engine(): + global _sync_engine + if _sync_engine is None: + sync_url = settings.database_url.replace("+asyncpg", "").replace("+aiosqlite", "") + _sync_engine = create_engine(sync_url, pool_pre_ping=True, pool_size=5, max_overflow=10) + return _sync_engine + + +@contextlib.contextmanager +def get_sync_session(tenant_id: str | None = None, role: str | None = None) -> Generator[Session, None, None]: + """Context manager that yields a synchronous DB session with optional RLS. + + Prefer using the existing async session patterns in FastAPI routes. + This helper is intended for Celery tasks only. + """ + factory = sessionmaker(bind=_get_sync_engine(), expire_on_commit=False) + with factory() as session: + if tenant_id: + set_tenant_sync(session, tenant_id, role) + try: + yield session + except Exception: + session.rollback() + raise + else: + session.commit() diff --git a/backend/app/core/middleware.py b/backend/app/core/middleware.py new file mode 100644 index 0000000..27630dd --- /dev/null +++ b/backend/app/core/middleware.py @@ -0,0 +1,49 @@ +"""Application middleware. + +TenantContextMiddleware + Decodes the JWT Bearer token (if present) from every incoming request and + stores tenant_id + role in request.state. The get_db dependency reads + request.state to automatically set the RLS context before yielding the + session — no endpoint code change required. +""" +import logging +from jose import JWTError, jwt +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import Request +from starlette.responses import Response + +from app.config import settings + +_log = logging.getLogger(__name__) + + +class TenantContextMiddleware(BaseHTTPMiddleware): + """Extract JWT → inject tenant_id + role into request.state. + + Does NOT reject unauthenticated requests — that is still handled by the + route-level dependencies (require_admin, get_current_user, etc.). + Missing / invalid tokens result in request.state.tenant_id = None. + """ + + async def dispatch(self, request: Request, call_next) -> Response: + tenant_id: str | None = None + role: str | None = None + + auth_header = request.headers.get("Authorization", "") + if auth_header.startswith("Bearer "): + token = auth_header[7:] + try: + payload = jwt.decode( + token, + settings.jwt_secret_key, + algorithms=[settings.jwt_algorithm], + ) + tenant_id = payload.get("tenant_id") + role = payload.get("role") + except JWTError: + pass # invalid/expired tokens are handled per-endpoint + + request.state.tenant_id = tenant_id + request.state.role = role + + return await call_next(request) diff --git a/backend/app/core/pipeline_logger.py b/backend/app/core/pipeline_logger.py new file mode 100644 index 0000000..d278c6a --- /dev/null +++ b/backend/app/core/pipeline_logger.py @@ -0,0 +1,106 @@ +"""Structured pipeline logger. + +Wraps Python logging + Redis SSE streaming for consistent, prefixed log output +from all Celery pipeline tasks. Every method: + - emits a Python `logging` line with a [STEP_NAME] prefix + - publishes to Redis via log_task_event for SSE streaming in the UI +""" +import logging +import time +from typing import Any + +from app.core.task_logs import log_task_event + +_log = logging.getLogger(__name__) + + +class PipelineLogger: + """Structured logger for a single pipeline execution context. + + Usage in a Celery task:: + + pl = PipelineLogger(task_id=self.request.id, order_line_id=str(line.id)) + pl.step_start("occ_glb_export", {"cad_file_id": cad_file_id}) + ... + pl.step_done("occ_glb_export", duration_s=8.4, result={"size_bytes": 204800}) + """ + + def __init__(self, task_id: str | None, order_line_id: str | None = None): + self.task_id = task_id or "unknown" + self.order_line_id = order_line_id + self._step_starts: dict[str, float] = {} + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def step_start(self, step: str, context: dict[str, Any] | None = None) -> None: + self._step_starts[step] = time.time() + msg = f"[{step}] start" + if context: + msg += f" | {context}" + _log.info(msg) + log_task_event(self.task_id, msg, level="info") + + def step_progress(self, step: str, pct: int, msg: str) -> None: + full = f"[{step}] {pct}% — {msg}" + _log.info(full) + log_task_event(self.task_id, full, level="info") + + def step_done(self, step: str, duration_s: float | None = None, result: dict[str, Any] | None = None) -> None: + if duration_s is None: + start = self._step_starts.get(step) + duration_s = round(time.time() - start, 2) if start else None + parts = [f"[{step}] done"] + if duration_s is not None: + parts.append(f"{duration_s:.1f}s") + if result: + parts.append(str(result)) + msg = " | ".join(parts) + _log.info(msg) + log_task_event(self.task_id, msg, level="info") + + def step_error(self, step: str, error: str, exc: Exception | None = None) -> None: + msg = f"[{step}] ERROR — {error}" + if exc: + _log.exception(msg) + else: + _log.error(msg) + log_task_event(self.task_id, msg, level="error") + + def info(self, step: str, msg: str) -> None: + full = f"[{step}] {msg}" + _log.info(full) + log_task_event(self.task_id, full, level="info") + + def warning(self, step: str, msg: str) -> None: + full = f"[{step}] WARNING — {msg}" + _log.warning(full) + log_task_event(self.task_id, full, level="warning") + + # ------------------------------------------------------------------ + # Context manager for a single step + # ------------------------------------------------------------------ + + def step(self, step_name: str, context: dict[str, Any] | None = None) -> "_StepContext": + return _StepContext(self, step_name, context) + + +class _StepContext: + """Context manager that auto-calls step_start / step_done / step_error.""" + + def __init__(self, pl: PipelineLogger, step_name: str, context: dict | None): + self._pl = pl + self._name = step_name + self._context = context + + def __enter__(self) -> "_StepContext": + self._pl.step_start(self._name, self._context) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is None: + self._pl.step_done(self._name) + else: + self._pl.step_error(self._name, str(exc_val), exc_val) + return False # do not suppress exceptions diff --git a/backend/app/core/process_steps.py b/backend/app/core/process_steps.py new file mode 100644 index 0000000..d3b7083 --- /dev/null +++ b/backend/app/core/process_steps.py @@ -0,0 +1,39 @@ +"""Named pipeline step identifiers. + +All Celery tasks and render scripts reference these constants so that log +messages, DB records, and UI labels stay consistent across the codebase. +""" +from enum import StrEnum + + +class StepName(StrEnum): + # ── STEP file processing ────────────────────────────────────────── + RESOLVE_STEP_PATH = "resolve_step_path" + OCC_OBJECT_EXTRACT = "occ_object_extract" + OCC_GLB_EXPORT = "occ_glb_export" + GLB_BBOX = "glb_bbox" + MATERIAL_MAP_RESOLVE = "material_map_resolve" + AUTO_POPULATE_MATERIALS = "auto_populate_materials" + + # ── Thumbnail generation ───────────────────────────────────────── + BLENDER_RENDER = "blender_render" + THREEJS_RENDER = "threejs_render" + THUMBNAIL_SAVE = "thumbnail_save" + + # ── Order line render ───────────────────────────────────────────── + ORDER_LINE_SETUP = "order_line_setup" + RESOLVE_TEMPLATE = "resolve_template" + BLENDER_STILL = "blender_still" + BLENDER_TURNTABLE = "blender_turntable" + OUTPUT_SAVE = "output_save" + + # ── GLB / asset export ──────────────────────────────────────────── + EXPORT_GLB_GEOMETRY = "export_glb_geometry" + EXPORT_GLB_PRODUCTION = "export_glb_production" + EXPORT_BLEND = "export_blend" + + # ── STL cache ──────────────────────────────────────────────────── + STL_CACHE_GENERATE = "stl_cache_generate" + + # ── Notifications ───────────────────────────────────────────────── + NOTIFY = "notify" diff --git a/backend/app/database.py b/backend/app/database.py index 207ac00..3eb94be 100644 --- a/backend/app/database.py +++ b/backend/app/database.py @@ -1,9 +1,13 @@ -from typing import AsyncGenerator, Optional +from __future__ import annotations +from typing import TYPE_CHECKING, AsyncGenerator, Optional from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker from sqlalchemy.orm import DeclarativeBase from sqlalchemy import text from app.config import settings +if TYPE_CHECKING: + from starlette.requests import Request + engine = create_async_engine( settings.database_url, echo=False, @@ -23,8 +27,20 @@ class Base(DeclarativeBase): pass -async def get_db() -> AsyncGenerator[AsyncSession, None]: +async def get_db(request: "Request | None" = None) -> AsyncGenerator[AsyncSession, None]: async with AsyncSessionLocal() as session: + # Auto-apply RLS context if TenantContextMiddleware populated request.state + if request is not None: + tenant_id = getattr(request.state, "tenant_id", None) + role = getattr(request.state, "role", None) + if tenant_id: + if role == "admin": + await session.execute(text("SET LOCAL app.current_tenant_id = 'bypass'")) + else: + await session.execute( + text("SET LOCAL app.current_tenant_id = :tid"), + {"tid": tenant_id}, + ) try: yield session finally: diff --git a/backend/app/domains/orders/models.py b/backend/app/domains/orders/models.py index 36f7dd0..62f305b 100644 --- a/backend/app/domains/orders/models.py +++ b/backend/app/domains/orders/models.py @@ -131,6 +131,7 @@ class OrderLine(Base): render_status: Mapped[str] = mapped_column(String(20), nullable=False, default="pending") result_path: Mapped[str | None] = mapped_column(String(1000), nullable=True) render_log: Mapped[dict | None] = mapped_column(JSONB, nullable=True) + render_job_doc: Mapped[dict | None] = mapped_column(JSONB, nullable=True) ai_validation_status: Mapped[str] = mapped_column(String(20), nullable=False, default="not_started") ai_validation_result: Mapped[dict | None] = mapped_column(JSONB, nullable=True) flamenco_job_id: Mapped[str | None] = mapped_column(String(100), nullable=True) diff --git a/backend/app/domains/rendering/job_document.py b/backend/app/domains/rendering/job_document.py new file mode 100644 index 0000000..3207d85 --- /dev/null +++ b/backend/app/domains/rendering/job_document.py @@ -0,0 +1,161 @@ +"""RenderJobDocument — structured JSONB job ticket stored in order_lines.render_job_doc. + +Acts as the single source of truth for a render job's state machine. +Stored as JSONB in order_lines.render_job_doc; keep order_lines.render_log +for backward compat (deprecated, removed in Phase 3). + +Usage:: + + from app.domains.rendering.job_document import RenderJobDocument, JobState, StepRecord + + doc = RenderJobDocument.new(order_line_id=str(line.id), celery_task_id=self.request.id) + doc.begin_step("occ_glb_export") + ... + doc.finish_step("occ_glb_export", output={"glb_path": str(glb), "size_bytes": sz}) + doc.set_state(JobState.COMPLETED, result={"output_path": str(out)}) + + # Persist to DB (inside Celery sync task): + line.render_job_doc = doc.to_dict() + db.commit() +""" +import time +from datetime import datetime, timezone +from enum import StrEnum +from typing import Any + +from pydantic import BaseModel, Field + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +# ── State machine ───────────────────────────────────────────────────────────── + +class JobState(StrEnum): + PENDING = "pending" + QUEUED = "queued" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + + +class StepStatus(StrEnum): + PENDING = "pending" + RUNNING = "running" + DONE = "done" + FAILED = "failed" + SKIPPED = "skipped" + + +# ── Data models ─────────────────────────────────────────────────────────────── + +class StepRecord(BaseModel): + name: str + status: StepStatus = StepStatus.PENDING + started_at: str | None = None + completed_at: str | None = None + duration_s: float | None = None + output: dict[str, Any] | None = None + error: str | None = None + + +class RenderJobDocument(BaseModel): + version: int = 1 + job_id: str # == order_line_id + created_at: str = Field(default_factory=_now_iso) + updated_at: str = Field(default_factory=_now_iso) + state: JobState = JobState.PENDING + celery_task_id: str | None = None + steps: list[StepRecord] = Field(default_factory=list) + error: str | None = None + result: dict[str, Any] | None = None + + # ── Factory ────────────────────────────────────────────────────── + + @classmethod + def new(cls, order_line_id: str, celery_task_id: str | None = None) -> "RenderJobDocument": + return cls(job_id=order_line_id, celery_task_id=celery_task_id) + + @classmethod + def from_dict(cls, d: dict | None) -> "RenderJobDocument | None": + if not d: + return None + try: + return cls.model_validate(d) + except Exception: + return None + + # ── Mutation helpers ───────────────────────────────────────────── + + def set_state(self, state: JobState, result: dict[str, Any] | None = None, error: str | None = None) -> None: + self.state = state + self.updated_at = _now_iso() + if result is not None: + self.result = result + if error is not None: + self.error = error + + def begin_step(self, step_name: str) -> StepRecord: + """Mark a step as running. Creates it if not present.""" + rec = self._get_or_create_step(step_name) + rec.status = StepStatus.RUNNING + rec.started_at = _now_iso() + self.updated_at = _now_iso() + if self.state == JobState.PENDING or self.state == JobState.QUEUED: + self.state = JobState.RUNNING + return rec + + def finish_step( + self, + step_name: str, + output: dict[str, Any] | None = None, + duration_s: float | None = None, + ) -> StepRecord: + rec = self._get_or_create_step(step_name) + rec.status = StepStatus.DONE + rec.completed_at = _now_iso() + if duration_s is not None: + rec.duration_s = round(duration_s, 2) + elif rec.started_at: + try: + start = datetime.fromisoformat(rec.started_at) + rec.duration_s = round((datetime.now(timezone.utc) - start).total_seconds(), 2) + except Exception: + pass + if output is not None: + rec.output = output + self.updated_at = _now_iso() + return rec + + def fail_step(self, step_name: str, error: str) -> StepRecord: + rec = self._get_or_create_step(step_name) + rec.status = StepStatus.FAILED + rec.completed_at = _now_iso() + rec.error = error + self.updated_at = _now_iso() + return rec + + def skip_step(self, step_name: str, reason: str | None = None) -> StepRecord: + rec = self._get_or_create_step(step_name) + rec.status = StepStatus.SKIPPED + if reason: + rec.output = {"reason": reason} + self.updated_at = _now_iso() + return rec + + # ── Serialisation ──────────────────────────────────────────────── + + def to_dict(self) -> dict: + return self.model_dump(mode="json") + + # ── Internal ───────────────────────────────────────────────────── + + def _get_or_create_step(self, step_name: str) -> StepRecord: + for rec in self.steps: + if rec.name == step_name: + return rec + rec = StepRecord(name=step_name) + self.steps.append(rec) + return rec diff --git a/backend/app/main.py b/backend/app/main.py index 41d3b9f..e1b7fb3 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -8,6 +8,7 @@ from pathlib import Path from app.config import settings from app.database import engine, Base from app.core.websocket import manager as ws_manager +from app.core.middleware import TenantContextMiddleware # Import routers from domain locations from app.domains.auth.router import router as auth_router @@ -52,6 +53,7 @@ app.add_middleware( allow_methods=["*"], allow_headers=["*"], ) +app.add_middleware(TenantContextMiddleware) # Mount static files for thumbnails (dir created in lifespan; skip if not writable) thumbnails_dir = Path(settings.upload_dir) / "thumbnails" diff --git a/backend/app/utils/auth.py b/backend/app/utils/auth.py index 0c99f71..462e726 100644 --- a/backend/app/utils/auth.py +++ b/backend/app/utils/auth.py @@ -28,9 +28,11 @@ def verify_password(plain: str, hashed: str) -> bool: return pwd_context.verify(plain, hashed) -def create_access_token(user_id: str, role: str) -> str: +def create_access_token(user_id: str, role: str, tenant_id: str | None = None) -> str: expires = datetime.utcnow() + timedelta(minutes=settings.jwt_access_token_expire_minutes) - payload = {"sub": user_id, "role": role, "exp": expires} + payload: dict = {"sub": user_id, "role": role, "exp": expires} + if tenant_id: + payload["tenant_id"] = tenant_id return jwt.encode(payload, settings.jwt_secret_key, algorithm=settings.jwt_algorithm)