fix(glb): 175/175 material substitution via _AFN suffix stripping

OCC's RWGltf_CafWriter appends _AF0/_AF1 assembly-instance suffixes to mesh object names when a part appears multiple times in an assembly. The material matching in export_gltf.py only stripped Blender's .001 suffix, leaving 24/175 GLB objects without materials.

Fix: strip _AFN suffixes via a while loop (handles nested _AF0_AF1) and add a prefix fallback (longest key wins) as a last resort before no-match.

Also improve build_materials_from_excel Jaccard matching:
- Strip _AFN and numeric hash suffixes (-21227) before tokenizing
- Add prefix-based fallback (step 3) before position fallback (step 4)
- Raise threshold 0.3 → 0.35 for better precision
- Guard prefix matches to len >= 5 to prevent trivial false positives

Result: Material substitution: 175/175 mesh objects assigned (was 151/175)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-07 22:15:11 +01:00
parent 95cfe0aa93
commit 934728da77
2 changed files with 100 additions and 14 deletions
@@ -430,11 +430,18 @@ async def regenerate_product_thumbnail(


 def _normalize_part_token_name(name: str) -> str:
-    """Lowercase, strip .prt extension, normalise separators to underscore."""
+    """Lowercase, strip .prt extension and noise suffixes, normalise separators to underscore."""
    import re as _re
    name = name.lower().strip()
    if name.endswith(".prt"):
        name = name[:-4]
+    # Strip OCC assembly-instance suffixes (_AF0, _AF1 …) that RWGltf_CafWriter adds
+    prev = None
+    while prev != name:
+        prev = name
+        name = _re.sub(r"_af\d+$", "", name)
+    # Strip trailing numeric hash suffixes from Excel .prt filenames (e.g. -21227)
+    name = _re.sub(r"-\d{4,}$", "", name)
    # Hyphens and dots → underscores for uniform token splitting
    return _re.sub(r"[-.]", "_", name)

@@ -456,23 +463,24 @@ def _jaccard(a: set, b: set) -> float:
 def build_materials_from_excel(
    cad_parts: list[str],
    excel_components: list[dict],
-    similarity_threshold: float = 0.3,
+    similarity_threshold: float = 0.35,
 ) -> list[dict]:
    """Match CAD part names to Excel components and return cad_part_materials list.

    Pure function — no DB access, sync-safe, callable from Celery tasks.

-    Matching strategy per CAD part:
+    Matching strategy per CAD part (in order):
    1. Exact case-insensitive name match
    2. Token-based Jaccard similarity on normalised filenames
-    3. Position-based fallback for low-confidence matches
+    3. Prefix-based fallback (one normalised name is a prefix of the other)
+    4. Position-based fallback for remaining low-confidence cases
    """
-    excel_entries: list[tuple[set[str], str, str]] = []
+    excel_entries: list[tuple[set[str], str, str, str]] = []
    for c in excel_components:
        raw = (c.get("part_name") or "").lower().strip()
        norm = _normalize_part_token_name(raw)
        tokens = _part_tokens(norm)
-        excel_entries.append((tokens, raw, c.get("material") or ""))
+        excel_entries.append((tokens, raw, c.get("material") or "", norm))

    new_materials: list[dict] = []
    for i, cad_part in enumerate(cad_parts):
@@ -483,16 +491,30 @@ def build_materials_from_excel(
        best_mat = ""
        best_score = 0.0

-        for tokens, raw, material in excel_entries:
+        for tokens, raw, material, excel_norm in excel_entries:
+            # 1. Exact match
            if raw == cad_raw_lower:
                best_mat = material
                best_score = 1.0
                break
+            # 2. Jaccard similarity
            score = _jaccard(tokens, cad_tokens)
            if score > best_score:
                best_score = score
                best_mat = material

+        # 3. Prefix fallback whenever the best score is below the threshold (zero included):
+        # one normalised name starts with the other (handles sub-assembly variants)
+        if best_score < similarity_threshold:
+            for tokens, raw, material, excel_norm in excel_entries:
+                if len(excel_norm) >= 5 and len(cad_norm) >= 5 and (
+                    cad_norm.startswith(excel_norm) or excel_norm.startswith(cad_norm)
+                ):
+                    best_mat = material
+                    best_score = 0.7
+                    break
+
+        # 4. Position-based fallback
        if best_score < similarity_threshold:
            if i < len(excel_components):
                best_mat = excel_components[i].get("material") or ""