feat: rich product metadata extraction from STEP files

Extract volume, surface area, part count, assembly hierarchy, and complexity from STEP files via OCC B-rep analysis.

Backend:
- extract_rich_metadata() in step_processor.py: computes per-part volume (BRepGProp), surface area, triangle/vertex count, assembly depth, instance count, complexity score, largest part identification
- cad_metadata JSONB column on Product model (DB migration)
- Auto-populated during STEP processing (non-fatal, 10s timeout)
- Also stored in cad_files.mesh_attributes["rich_metadata"]
- Batch re-extract endpoint: POST /admin/settings/reextract-rich-metadata

AI Agent:
- search_products returns part_count, volume_cm3, complexity, largest_part
- query_database tool description documents cad_metadata schema

Frontend:
- ProductDetail page: CAD Metadata section with stat cards (parts, volume, surface area, complexity, triangles, assembly depth)
- Admin System Tools: "Re-extract Rich Metadata" button for backfill

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 18:49:50 +01:00
parent 0ffc86589a
commit cfccdd5397
12 changed files with 645 additions and 170 deletions
@@ -109,6 +109,57 @@ def process_step_file(self, cad_file_id: str):
            pl.step_error("process_step_file", f"STEP metadata extraction failed: {exc}", exc)
            r.delete(lock_key)  # release lock so a retry can proceed
            raise self.retry(exc=exc, countdown=60, max_retries=3)
+
+        # Extract rich metadata (volume, surface area, complexity, etc.) — non-fatal
+        try:
+            from sqlalchemy import create_engine, update as sql_update
+            from sqlalchemy.orm import Session as SyncSession
+            from app.config import settings as cfg
+            from app.services.step_processor import extract_rich_metadata
+            from app.models.cad_file import CadFile
+            from app.models.product import Product
+            from app.core.tenant_context import set_tenant_context_sync
+
+            eng = create_engine(cfg.database_url_sync)
+            try:
+                # Load stored_path for the cad file
+                with SyncSession(eng) as session:
+                    set_tenant_context_sync(session, _tenant_id)
+                    cad_file = session.get(CadFile, cad_file_id)
+                    step_path = cad_file.stored_path if cad_file else None
+
+                if step_path:
+                    rich_meta = extract_rich_metadata(str(step_path))
+                    if rich_meta and rich_meta.get("part_count", 0) > 0:
+                        with SyncSession(eng) as session:
+                            set_tenant_context_sync(session, _tenant_id)
+                            # Merge into cad_files.mesh_attributes
+                            cad_file = session.get(CadFile, cad_file_id)
+                            if cad_file:
+                                existing_attrs = cad_file.mesh_attributes or {}
+                                existing_attrs["rich_metadata"] = rich_meta
+                                session.execute(
+                                    sql_update(CadFile)
+                                    .where(CadFile.id == cad_file_id)
+                                    .values(mesh_attributes=existing_attrs)
+                                )
+                            # Update all active products linked to this CAD file
+                            session.execute(
+                                sql_update(Product)
+                                .where(Product.cad_file_id == cad_file_id, Product.is_active.is_(True))
+                                .values(cad_metadata=rich_meta)
+                            )
+                            session.commit()
+                        logger.info(
+                            f"Rich metadata extracted for cad_file {cad_file_id}: "
+                            f"{rich_meta.get('part_count')} parts, "
+                            f"{rich_meta.get('total_volume_cm3', 0):.1f} cm³"
+                        )
+            finally:
+                eng.dispose()
+        except Exception:
+            logger.exception(f"Rich metadata extraction failed for cad_file {cad_file_id} (non-fatal)")
+
    finally:
        r.delete(lock_key)  # always release on completion or unhandled error

@@ -203,6 +254,70 @@ def _auto_populate_materials_for_cad(cad_file_id: str, tenant_id: str | None = N
    eng.dispose()


+@celery_app.task(name="app.tasks.step_tasks.reextract_rich_metadata_task", queue="step_processing")
+def reextract_rich_metadata_task():
+    """Batch re-extract rich metadata for every completed CAD file that has a stored path.
+
+    For each file, ``extract_rich_metadata`` is run on the stored path. When the result
+    reports at least one part, it is written to ``mesh_attributes["rich_metadata"]`` on
+    the CAD file row and to ``cad_metadata`` on every active product linked to that
+    file, with one commit per file. An exception raised while handling one file is
+    logged and counted, and the batch continues with the next file.
+
+    Returns nothing; the totals are reported through the logger.
+    """
+    from sqlalchemy import create_engine, select as sql_select, update as sql_update
+    from sqlalchemy.orm import Session as SyncSession
+    from app.config import settings as cfg
+    from app.models.cad_file import CadFile, ProcessingStatus
+    from app.models.product import Product
+    from app.core.tenant_context import set_tenant_context_sync
+    # Imported once up front instead of on every loop iteration.
+    from app.services.step_processor import extract_rich_metadata
+
+    # Use the same sync URL setting as process_step_file rather than rewriting the async URL.
+    eng = create_engine(cfg.database_url_sync)
+    updated = 0
+    failed = 0
+
+    try:
+        # Snapshot id / path / tenant as plain tuples while the session is still open.
+        with SyncSession(eng) as session:
+            cad_files = session.execute(
+                sql_select(CadFile).where(
+                    CadFile.processing_status == ProcessingStatus.completed,
+                    CadFile.stored_path.isnot(None),
+                )
+            ).scalars().all()
+            cad_entries = [(str(cf.id), cf.stored_path, cf.tenant_id) for cf in cad_files]
+
+        for cad_file_id, step_path, tenant_id in cad_entries:
+            try:
+                rich_meta = extract_rich_metadata(str(step_path))
+                if not rich_meta or rich_meta.get("part_count", 0) <= 0:
+                    # Nothing usable extracted: counted as neither updated nor failed.
+                    continue
+
+                with SyncSession(eng) as session:
+                    set_tenant_context_sync(session, tenant_id)
+                    cad_file = session.get(CadFile, cad_file_id)
+                    if cad_file:
+                        # Merge into a fresh dict so the loaded attribute is never mutated in place.
+                        merged_attrs = {
+                            **(cad_file.mesh_attributes or {}),
+                            "rich_metadata": rich_meta,
+                        }
+                        session.execute(
+                            sql_update(CadFile)
+                            .where(CadFile.id == cad_file_id)
+                            .values(mesh_attributes=merged_attrs)
+                        )
+                    # Update all active products linked to this CAD file
+                    session.execute(
+                        sql_update(Product)
+                        .where(Product.cad_file_id == cad_file_id, Product.is_active.is_(True))
+                        .values(cad_metadata=rich_meta)
+                    )
+                    session.commit()
+                updated += 1
+
+                # "or 0": a None volume must not raise here, after the commit, and get
+                # the file counted as failed in addition to updated.
+                volume_cm3 = rich_meta.get("total_volume_cm3") or 0
+                logger.info(
+                    f"reextract_rich_metadata: {cad_file_id} -> "
+                    f"{rich_meta.get('part_count')} parts, "
+                    f"{volume_cm3:.1f} cm3"
+                )
+            except Exception:
+                failed += 1
+                logger.exception(f"reextract_rich_metadata failed for cad_file {cad_file_id}")
+    finally:
+        eng.dispose()
+
+    logger.info(f"reextract_rich_metadata_task complete: {updated} updated, {failed} failed")
+
+
@celery_app.task(name="app.tasks.step_tasks.reextract_cad_metadata", queue="asset_pipeline")
 def reextract_cad_metadata(cad_file_id: str):
    """Re-extract bounding-box dimensions for an already-completed CAD file.