Files
HartOMat/backend/app/domains/pipeline/tasks/extract_metadata.py
T

295 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""STEP metadata extraction tasks.
Covers:
- process_step_file — extract OCC objects + queue thumbnail
- reextract_cad_metadata — re-compute bounding-box for completed files
- _auto_populate_materials_for_cad — helper: fill cad_part_materials from Excel
- _bbox_from_glb / _bbox_from_step_cadquery — bbox helpers
"""
import logging
from pathlib import Path
from app.tasks.celery_app import celery_app
from app.core.task_logs import log_task_event
from app.core.pipeline_logger import PipelineLogger
logger = logging.getLogger(__name__)
def _bbox_from_glb(glb_path: str) -> dict | None:
    """Delegate GLB bounding-box extraction to the rendering domain service.

    Kept as a thin shim so legacy call sites importing from this module
    continue to work after the logic moved to ``workflow_runtime_services``.
    """
    from app.domains.rendering.workflow_runtime_services import (
        extract_bbox_from_glb as _impl,
    )
    return _impl(glb_path)
def _bbox_from_step_cadquery(step_path: str) -> dict | None:
    """Delegate the STEP-based bbox fallback to the rendering domain service.

    Backward-compatible shim — the implementation now lives in
    ``workflow_runtime_services``.
    """
    from app.domains.rendering.workflow_runtime_services import (
        extract_bbox_from_step_cadquery as _impl,
    )
    return _impl(step_path)
@celery_app.task(bind=True, name="app.tasks.step_tasks.process_step_file", queue="step_processing")
def process_step_file(
    self,
    cad_file_id: str,
    workflow_run_id: str | None = None,
    workflow_node_id: str | None = None,
    **_: object,
):
    """Process a STEP file: extract objects, generate thumbnail, convert to glTF.

    After processing completes, auto-populate cad_part_materials from Excel
    component data for any linked products that don't yet have materials assigned.
    (NOTE(review): no call to ``_auto_populate_materials_for_cad`` is visible in
    this task body — presumably triggered downstream; verify.)

    A per-file Redis lock (TTL = 10 min) prevents duplicate tasks from processing
    the same file concurrently — e.g. when 'Process Unprocessed' is clicked while
    a file is already being processed.

    Args:
        cad_file_id: ID (string) of the ``cad_files`` row to process.
        workflow_run_id: Optional graph-mode workflow run to report status to.
        workflow_node_id: Optional node within that workflow run.
    """
    import redis as redis_lib
    from app.config import settings as app_settings

    pl = PipelineLogger(task_id=self.request.id)
    pl.step_start("process_step_file", {"cad_file_id": cad_file_id})

    # Resolve and log tenant context at task start (required for RLS)
    from app.core.tenant_context import resolve_tenant_id_for_cad
    _tenant_id = resolve_tenant_id_for_cad(cad_file_id)

    lock_key = f"step_processing_lock:{cad_file_id}"
    r = redis_lib.from_url(app_settings.redis_url)
    acquired = r.set(lock_key, "1", nx=True, ex=600)  # 10-minute TTL
    if not acquired:
        logger.warning(f"STEP file {cad_file_id} is already being processed — skipping duplicate task")
        return

    try:
        pl.info("process_step_file", f"Processing STEP file (metadata only): {cad_file_id}")
        try:
            from app.services.step_processor import extract_cad_metadata
            extract_cad_metadata(cad_file_id, tenant_id=_tenant_id)
        except Exception as exc:
            pl.step_error("process_step_file", f"STEP metadata extraction failed: {exc}", exc)
            # The finally below releases the lock so a retry can proceed.
            raise self.retry(exc=exc, countdown=60, max_retries=3)

        # Extract rich metadata (volume, surface area, complexity, etc.) — non-fatal
        try:
            _extract_and_store_rich_metadata(cad_file_id, _tenant_id)
        except Exception:
            logger.exception(f"Rich metadata extraction failed for cad_file {cad_file_id} (non-fatal)")
    finally:
        r.delete(lock_key)  # always release on completion, retry, or unhandled error

    # Success path only.  BUGFIX: these statements previously ran inside the
    # ``finally`` block, which marked the workflow run "completed" (and queued
    # a thumbnail) even when extraction failed and the task was being retried.
    pl.step_done("process_step_file")
    try:
        from app.domains.rendering.tasks import _update_workflow_run_status
        _update_workflow_run_status(
            cad_file_id,
            "completed",
            workflow_run_id=workflow_run_id,
            workflow_node_id=workflow_node_id,
        )
    except Exception:
        logger.exception("Failed to update workflow state for process_step_file %s", cad_file_id)

    # Legacy flow still auto-queues thumbnail generation here.
    # Graph-mode workflows dispatch explicit thumbnail save/render nodes instead.
    if workflow_run_id is None:
        from app.domains.pipeline.tasks.render_thumbnail import render_step_thumbnail
        render_step_thumbnail.delay(cad_file_id)


def _extract_and_store_rich_metadata(cad_file_id: str, tenant_id: str | None) -> None:
    """Compute rich metadata for one CAD file and persist it.

    Writes the result into ``cad_files.mesh_attributes["rich_metadata"]`` and
    into ``cad_metadata`` of every active product linked to the file.  Raises
    on failure; the caller treats any exception as non-fatal.
    """
    from sqlalchemy import create_engine, update as sql_update
    from sqlalchemy.orm import Session as SyncSession
    from app.config import settings as cfg
    from app.services.step_processor import extract_rich_metadata
    from app.models.cad_file import CadFile
    from app.models.product import Product
    from app.core.tenant_context import set_tenant_context_sync

    eng = create_engine(cfg.database_url_sync)
    try:
        # Load stored_path for the cad file
        with SyncSession(eng) as session:
            set_tenant_context_sync(session, tenant_id)
            cad_file = session.get(CadFile, cad_file_id)
            step_path = cad_file.stored_path if cad_file else None
        if not step_path:
            return
        rich_meta = extract_rich_metadata(str(step_path))
        if not rich_meta or rich_meta.get("part_count", 0) <= 0:
            return
        with SyncSession(eng) as session:
            set_tenant_context_sync(session, tenant_id)
            # Merge into cad_files.mesh_attributes
            cad_file = session.get(CadFile, cad_file_id)
            if cad_file:
                existing_attrs = cad_file.mesh_attributes or {}
                existing_attrs["rich_metadata"] = rich_meta
                session.execute(
                    sql_update(CadFile)
                    .where(CadFile.id == cad_file_id)
                    .values(mesh_attributes=existing_attrs)
                )
            # Update all active products linked to this CAD file
            session.execute(
                sql_update(Product)
                .where(Product.cad_file_id == cad_file_id, Product.is_active.is_(True))
                .values(cad_metadata=rich_meta)
            )
            session.commit()
        logger.info(
            f"Rich metadata extracted for cad_file {cad_file_id}: "
            f"{rich_meta.get('part_count')} parts, "
            f"{rich_meta.get('total_volume_cm3', 0):.1f} cm³"
        )
    finally:
        eng.dispose()
def _auto_populate_materials_for_cad(cad_file_id: str, tenant_id: str | None = None) -> None:
    """Sync helper: auto-populate cad_part_materials from Excel for newly-processed CAD files.

    Only fills products where cad_part_materials is empty or all-blank,
    preventing overwrites of manually assigned materials.

    Args:
        cad_file_id: ID (string) of the CAD file whose products to fill.
        tenant_id: Tenant for the RLS session context; may be None.
    """
    from sqlalchemy import create_engine
    from sqlalchemy.orm import Session
    from app.config import settings as app_settings
    from app.core.tenant_context import set_tenant_context_sync
    from app.domains.rendering.workflow_runtime_services import auto_populate_materials_for_cad

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    eng = create_engine(sync_url)
    try:
        with Session(eng) as session:
            set_tenant_context_sync(session, tenant_id)
            auto_populate_materials_for_cad(session, cad_file_id)
    finally:
        # BUGFIX: dispose() was previously skipped when the session work
        # raised, leaking the engine's connection pool.
        eng.dispose()
@celery_app.task(name="app.tasks.step_tasks.reextract_rich_metadata_task", queue="asset_pipeline")
def reextract_rich_metadata_task():
    """Batch re-extract rich metadata (volume, surface area, complexity) for all completed CAD files.

    Scans every completed ``cad_files`` row that has a stored_path, recomputes
    rich metadata from the STEP file, and writes it to the file's
    ``mesh_attributes`` and the ``cad_metadata`` of every linked active product.
    Per-file failures are logged and counted, never fatal to the batch.
    """
    from sqlalchemy import create_engine, select as sql_select, update as sql_update
    from sqlalchemy.orm import Session as SyncSession
    from app.config import settings as cfg
    from app.models.cad_file import CadFile, ProcessingStatus
    from app.models.product import Product
    from app.core.tenant_context import set_tenant_context_sync
    # Hoisted out of the per-file loop — the import is loop-invariant.
    from app.services.step_processor import extract_rich_metadata

    sync_url = cfg.database_url.replace("+asyncpg", "")
    eng = create_engine(sync_url)
    updated = 0
    failed = 0
    try:
        # NOTE(review): this scan runs without set_tenant_context_sync, so it
        # presumably relies on a privileged DB role to see all tenants' rows
        # under RLS — confirm.
        with SyncSession(eng) as session:
            cad_files = session.execute(
                sql_select(CadFile).where(
                    CadFile.processing_status == ProcessingStatus.completed,
                    CadFile.stored_path.isnot(None),
                )
            ).scalars().all()
            # Detach plain values so per-file sessions don't touch expired ORM objects.
            cad_entries = [(str(cf.id), cf.stored_path, cf.tenant_id) for cf in cad_files]
        for cad_file_id, step_path, tenant_id in cad_entries:
            try:
                rich_meta = extract_rich_metadata(str(step_path))
                if rich_meta and rich_meta.get("part_count", 0) > 0:
                    with SyncSession(eng) as session:
                        set_tenant_context_sync(session, tenant_id)
                        # Update mesh_attributes on cad_file
                        cad_file = session.get(CadFile, cad_file_id)
                        if cad_file:
                            existing_attrs = cad_file.mesh_attributes or {}
                            existing_attrs["rich_metadata"] = rich_meta
                            session.execute(
                                sql_update(CadFile)
                                .where(CadFile.id == cad_file_id)
                                .values(mesh_attributes=existing_attrs)
                            )
                        # Update all active products linked to this CAD file
                        session.execute(
                            sql_update(Product)
                            .where(Product.cad_file_id == cad_file_id, Product.is_active.is_(True))
                            .values(cad_metadata=rich_meta)
                        )
                        session.commit()
                    updated += 1
                    logger.info(
                        f"reextract_rich_metadata: {cad_file_id} -> "
                        f"{rich_meta.get('part_count')} parts, "
                        f"{rich_meta.get('total_volume_cm3', 0):.1f} cm3"
                    )
            except Exception:
                failed += 1
                logger.exception(f"reextract_rich_metadata failed for cad_file {cad_file_id}")
    finally:
        eng.dispose()
    logger.info(f"reextract_rich_metadata_task complete: {updated} updated, {failed} failed")
@celery_app.task(name="app.tasks.step_tasks.reextract_cad_metadata", queue="asset_pipeline")
def reextract_cad_metadata(cad_file_id: str):
    """Re-extract bounding-box dimensions for an already-completed CAD file.

    Uses cadquery (available in render-worker) to compute dimensions_mm.
    Updates mesh_attributes without changing processing_status or re-rendering.
    Safe to run on completed files.

    Args:
        cad_file_id: ID (string) of the ``cad_files`` row to update.
    """
    from sqlalchemy import create_engine
    from sqlalchemy.orm import Session
    from app.config import settings as app_settings
    from app.models.cad_file import CadFile
    from app.domains.rendering.workflow_runtime_services import resolve_cad_bbox

    pl = PipelineLogger(task_id=None)
    pl.step_start("reextract_cad_metadata", {"cad_file_id": cad_file_id})

    # Resolve and log tenant context at task start (required for RLS)
    from app.core.tenant_context import resolve_tenant_id_for_cad, set_tenant_context_sync
    _tenant_id = resolve_tenant_id_for_cad(cad_file_id)

    sync_url = app_settings.database_url.replace("+asyncpg", "")
    eng = create_engine(sync_url)
    # BUGFIX: the file-lookup session below previously ran outside any
    # try/finally, so an exception there leaked the engine. One outer
    # finally now disposes it on every exit path (including early return).
    try:
        with Session(eng) as session:
            set_tenant_context_sync(session, _tenant_id)
            cad_file = session.get(CadFile, cad_file_id)
            if not cad_file or not cad_file.stored_path:
                logger.warning(f"reextract_cad_metadata: file not found {cad_file_id}")
                return
            step_path = cad_file.stored_path
        try:
            # Ask resolve_cad_bbox for the bbox, pointing it at the sibling
            # "<stem>_thumbnail.glb" — presumably it falls back to the STEP
            # itself when the GLB is missing (cf. the bbox wrapper helpers).
            p = Path(step_path)
            glb_path = p.parent / f"{p.stem}_thumbnail.glb"
            bbox_result = resolve_cad_bbox(step_path, glb_path=str(glb_path))
            patch = bbox_result.bbox_data
            if patch:
                with Session(eng) as session:
                    set_tenant_context_sync(session, _tenant_id)
                    cad_file = session.get(CadFile, cad_file_id)
                    if cad_file:
                        cad_file.mesh_attributes = {**(cad_file.mesh_attributes or {}), **patch}
                        session.commit()
                dims = patch["dimensions_mm"]
                pl.step_done("reextract_cad_metadata", result={
                    "dimensions_mm": f"{dims['x']}×{dims['y']}×{dims['z']} mm"
                })
                # BUGFIX: added the " -> " separator — the two f-strings used
                # to concatenate the id straight into the dimensions.
                logger.info(
                    f"reextract_cad_metadata: {cad_file_id} -> "
                    f"{dims['x']}×{dims['y']}×{dims['z']} mm"
                )
            else:
                logger.warning(f"reextract_cad_metadata: no bbox data for {cad_file_id}")
        except Exception as exc:
            pl.step_error("reextract_cad_metadata", str(exc), exc)
            logger.error(f"reextract_cad_metadata failed for {cad_file_id}: {exc}")
    finally:
        eng.dispose()