Update paddle, and support invoice line item

2026-02-03 21:28:06 +01:00
parent c4e3773df1
commit 35988b1ebf
41 changed files with 6832 additions and 48 deletions
--- a/scripts/ppstructure_poc.py
+++ b/scripts/ppstructure_poc.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+PP-StructureV3 POC Script
+
+Tests table detection on real Swedish invoices using PP-StructureV3.
+Run with invoice-sm120 conda environment.
+"""
+
+import sys
+from pathlib import Path
+
+# project_root is the parent of the directory that contains this script.
+# Prepend <project_root>/packages/backend to sys.path (index 0, so it is
+# searched first) so modules located there can be imported from this script.
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root / "packages" / "backend"))
+
+from paddleocr import PPStructureV3
+import fitz  # PyMuPDF
+
+
+def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
+    """Render first page of PDF to image bytes."""
+    doc = fitz.open(pdf_path)
+    page = doc[0]
+    mat = fitz.Matrix(dpi / 72, dpi / 72)
+    pix = page.get_pixmap(matrix=mat)
+    img_bytes = pix.tobytes("png")
+    doc.close()
+    return img_bytes
+
+
+def test_table_detection(pdf_path: str) -> dict:
+    """Test PP-StructureV3 table detection on a PDF."""
+    print(f"\n{'='*60}")
+    print(f"Testing: {Path(pdf_path).name}")
+    print(f"{'='*60}")
+
+    # Render PDF to image
+    print("Rendering PDF to image...")
+    img_bytes = render_pdf_to_image(pdf_path)
+
+    # Save temp image
+    temp_img_path = "/tmp/test_invoice.png"
+    with open(temp_img_path, "wb") as f:
+        f.write(img_bytes)
+    print(f"Saved temp image: {temp_img_path}")
+
+    # Initialize PP-StructureV3
+    print("Initializing PP-StructureV3...")
+    pipeline = PPStructureV3(
+        device="gpu:0",
+        use_doc_orientation_classify=False,
+        use_doc_unwarping=False,
+    )
+
+    # Run detection
+    print("Running table detection...")
+    results = pipeline.predict(temp_img_path)
+
+    # Parse results - PaddleX 3.x returns dict-like LayoutParsingResultV2
+    tables_found = []
+    all_elements = []
+
+    for result in results if results else []:
+        # Get table results from the new API
+        table_res_list = result.get("table_res_list") if hasattr(result, "get") else None
+
+        if table_res_list:
+            print(f"  Found {len(table_res_list)} tables in table_res_list")
+            for i, table_res in enumerate(table_res_list):
+                # Debug: show all keys in table_res
+                if isinstance(table_res, dict):
+                    print(f"  Table {i+1} keys: {list(table_res.keys())}")
+                else:
+                    print(f"  Table {i+1} attrs: {[a for a in dir(table_res) if not a.startswith('_')]}")
+
+                # Extract table info - use correct key names from PaddleX 3.x
+                cell_boxes = table_res.get("cell_box_list", [])
+                html = table_res.get("pred_html", "")  # HTML is in pred_html
+                ocr_text = table_res.get("table_ocr_pred", [])
+                region_id = table_res.get("table_region_id", -1)
+                bbox = []  # bbox is stored elsewhere in parsing_res_list
+
+                print(f"  Table {i+1}:")
+                print(f"    - Cells: {len(cell_boxes) if cell_boxes is not None else 0}")
+                print(f"    - Region ID: {region_id}")
+                print(f"    - HTML length: {len(html) if html else 0}")
+                print(f"    - OCR texts: {len(ocr_text) if ocr_text else 0}")
+
+                if html:
+                    print(f"    - HTML preview: {html[:300]}...")
+
+                if ocr_text and len(ocr_text) > 0:
+                    print(f"    - First few OCR texts: {ocr_text[:3]}")
+
+                tables_found.append({
+                    "index": i,
+                    "cell_count": len(cell_boxes) if cell_boxes is not None else 0,
+                    "region_id": region_id,
+                    "html": html[:1000] if html else "",
+                    "ocr_count": len(ocr_text) if ocr_text else 0,
+                })
+
+        # Get parsing results for all layout elements
+        parsing_res_list = result.get("parsing_res_list") if hasattr(result, "get") else None
+
+        if parsing_res_list:
+            print(f"\n  Layout elements from parsing_res_list:")
+            for elem in parsing_res_list[:10]:  # Show first 10
+                label = elem.get("label", "unknown") if isinstance(elem, dict) else getattr(elem, "label", "unknown")
+                bbox = elem.get("bbox", []) if isinstance(elem, dict) else getattr(elem, "bbox", [])
+                print(f"    - {label}: {bbox}")
+                all_elements.append({"label": label, "bbox": bbox})
+
+    print(f"\nSummary:")
+    print(f"  Tables detected: {len(tables_found)}")
+    print(f"  Layout elements: {len(all_elements)}")
+
+    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}
+
+
+def main():
+    # Find test PDFs
+    pdf_dir = Path("/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/data/admin_uploads")
+    pdf_files = list(pdf_dir.glob("*.pdf"))[:5]  # Test first 5
+
+    if not pdf_files:
+        print("No PDF files found in admin_uploads directory")
+        return
+
+    print(f"Found {len(pdf_files)} PDF files")
+
+    all_results = []
+    for pdf_file in pdf_files:
+        result = test_table_detection(str(pdf_file))
+        all_results.append(result)
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("FINAL SUMMARY")
+    print(f"{'='*60}")
+    total_tables = sum(len(r["tables"]) for r in all_results)
+    print(f"Total PDFs tested: {len(all_results)}")
+    print(f"Total tables detected: {total_tables}")
+
+    for r in all_results:
+        pdf_name = Path(r["pdf"]).name
+        table_count = len(r["tables"])
+        print(f"  {pdf_name}: {table_count} tables")
+        for t in r["tables"]:
+            print(f"    - Table {t['index']+1}: {t['cell_count']} cells")
+
+
+if __name__ == "__main__":
+    main()