invoice-master-poc-v2/scripts/ppstructure_poc.py

#!/usr/bin/env python3
"""
PP-StructureV3 POC Script

Tests table detection on real Swedish invoices using PP-StructureV3.
Run with invoice-sm120 conda environment.
"""

import sys
import tempfile
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "packages" / "backend"))

from paddleocr import PPStructureV3
import fitz  # PyMuPDF


def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
    """Render the first page of a PDF to PNG-encoded bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target resolution. The page is scaled by ``dpi / 72`` on both
            axes (PDF pages are measured in 1/72-inch units).

    Returns:
        The rendered first page as PNG bytes.
    """
    zoom = dpi / 72
    # The context manager closes the document even when indexing or rendering
    # raises (e.g. a PDF without pages), so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")


_PIPELINE = None  # PP-StructureV3 instance, created on first use and then reused.


def _get_pipeline():
    """Return the shared PP-StructureV3 pipeline, building it on first use."""
    global _PIPELINE
    if _PIPELINE is None:
        # Building the pipeline loads the models; do it once instead of once
        # per PDF.
        print("Initializing PP-StructureV3...")
        _PIPELINE = PPStructureV3(
            device="gpu:0",
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
        )
    return _PIPELINE


def _field(obj, name, default=None):
    """Read ``name`` from a dict-like object, else fall back to an attribute."""
    if hasattr(obj, "get"):
        return obj.get(name, default)
    return getattr(obj, name, default)


def _count(value) -> int:
    """Return ``len(value)``, treating ``None`` as empty."""
    return 0 if value is None else len(value)


def _summarize_table(index: int, table_res) -> dict:
    """Print the details of one table result and return its summary dict."""
    # Debug: show what the result object actually exposes.
    if isinstance(table_res, dict):
        print(f"  Table {index+1} keys: {list(table_res.keys())}")
    else:
        print(f"  Table {index+1} attrs: {[a for a in dir(table_res) if not a.startswith('_')]}")

    # Key names follow the PaddleX 3.x table result layout.
    cell_boxes = _field(table_res, "cell_box_list", [])
    html = _field(table_res, "pred_html", "") or ""
    region_id = _field(table_res, "table_region_id", -1)

    # NOTE(review): table_ocr_pred looks like a dict of parallel lists
    # (texts under "rec_texts") in PaddleX 3.x - confirm against the installed
    # version. Both that shape and a plain sequence are accepted here.
    ocr_pred = _field(table_res, "table_ocr_pred", [])
    ocr_texts = ocr_pred.get("rec_texts", []) if isinstance(ocr_pred, dict) else ocr_pred

    # Use explicit length checks: truthiness is ambiguous for array values.
    cell_count = _count(cell_boxes)
    ocr_count = _count(ocr_texts)

    print(f"  Table {index+1}:")
    print(f"    - Cells: {cell_count}")
    print(f"    - Region ID: {region_id}")
    print(f"    - HTML length: {len(html)}")
    print(f"    - OCR texts: {ocr_count}")

    if html:
        print(f"    - HTML preview: {html[:300]}...")

    if ocr_count > 0:
        print(f"    - First few OCR texts: {ocr_texts[:3]}")

    return {
        "index": index,
        "cell_count": cell_count,
        "region_id": region_id,
        "html": html[:1000],
        "ocr_count": ocr_count,
    }


def test_table_detection(pdf_path: str) -> dict:
    """Run PP-StructureV3 on the first page of a PDF and report what it found.

    The page is rendered to a temporary PNG, passed to the pipeline, and the
    temporary file is removed afterwards. Progress and per-table details are
    printed to stdout.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        A dict with keys ``"pdf"`` (the input path), ``"tables"`` (one summary
        dict per detected table: ``index``, ``cell_count``, ``region_id``,
        ``html`` truncated to 1000 characters, ``ocr_count``) and
        ``"elements"`` (``label``/``bbox`` dicts for at most the first 10
        layout elements of each result).
    """
    print(f"\n{'='*60}")
    print(f"Testing: {Path(pdf_path).name}")
    print(f"{'='*60}")

    # Render PDF to image
    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)

    # Write to a unique temp file rather than a fixed /tmp path, so the script
    # is portable and parallel runs cannot overwrite each other's image.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        tmp.write(img_bytes)
        temp_img_path = tmp.name
    print(f"Saved temp image: {temp_img_path}")

    try:
        pipeline = _get_pipeline()

        # Run detection
        print("Running table detection...")
        # Materialise the results before the image is deleted, in case
        # predict() evaluates lazily.
        results = list(pipeline.predict(temp_img_path) or [])
    finally:
        Path(temp_img_path).unlink(missing_ok=True)

    # Parse results - PaddleX 3.x returns dict-like LayoutParsingResultV2
    tables_found = []
    all_elements = []

    for result in results:
        table_res_list = _field(result, "table_res_list")
        if table_res_list:
            print(f"  Found {len(table_res_list)} tables in table_res_list")
            tables_found.extend(
                _summarize_table(i, table_res)
                for i, table_res in enumerate(table_res_list)
            )

        # Get parsing results for all layout elements
        parsing_res_list = _field(result, "parsing_res_list")
        if parsing_res_list:
            print("\n  Layout elements from parsing_res_list:")
            for elem in parsing_res_list[:10]:  # Show first 10
                label = _field(elem, "label", "unknown")
                bbox = _field(elem, "bbox", [])
                print(f"    - {label}: {bbox}")
                all_elements.append({"label": label, "bbox": bbox})

    print("\nSummary:")
    print(f"  Tables detected: {len(tables_found)}")
    print(f"  Layout elements: {len(all_elements)}")

    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}


def main():
    """Run table detection on up to five PDFs and print a final summary.

    The PDF directory is taken from the first command-line argument when one
    is given; otherwise it defaults to ``data/admin_uploads`` under the
    project root, so the script does not depend on one machine's absolute
    path.
    """
    # Find test PDFs
    if len(sys.argv) > 1:
        pdf_dir = Path(sys.argv[1])
    else:
        pdf_dir = project_root / "data" / "admin_uploads"

    # Sort so that "first 5" is the same set on every run; glob order is
    # otherwise filesystem-dependent.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))[:5]  # Test first 5

    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files")

    all_results = [test_table_detection(str(pdf_file)) for pdf_file in pdf_files]

    # Summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    total_tables = sum(len(r["tables"]) for r in all_results)
    print(f"Total PDFs tested: {len(all_results)}")
    print(f"Total tables detected: {total_tables}")

    for r in all_results:
        pdf_name = Path(r["pdf"]).name
        print(f"  {pdf_name}: {len(r['tables'])} tables")
        for t in r["tables"]:
            print(f"    - Table {t['index']+1}: {t['cell_count']} cells")

if __name__ == "__main__":
    main()