Update Paddle and add support for invoice line items

This commit is contained in:
Yaojia Wang
2026-02-03 21:28:06 +01:00
parent c4e3773df1
commit 35988b1ebf
41 changed files with 6832 additions and 48 deletions

154
scripts/ppstructure_poc.py Normal file
View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
PP-StructureV3 POC Script
Tests table detection on real Swedish invoices using PP-StructureV3.
Run with invoice-sm120 conda environment.
"""
import sys
from pathlib import Path
# Add project root to path: the root is resolved as the parent of this
# script's directory, and its "packages/backend" subdirectory is inserted at
# the front of sys.path so it takes precedence over other import locations.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "packages" / "backend"))
from paddleocr import PPStructureV3
import fitz # PyMuPDF
def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
    """Render the first page of a PDF to PNG-encoded bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Requested resolution. The page is scaled by ``dpi / 72`` on both
            axes (presumably because PyMuPDF's base resolution is 72 dpi;
            confirm against the PyMuPDF documentation).

    Returns:
        The rendered first page as PNG image bytes.

    Raises:
        Whatever ``fitz`` raises when the file cannot be opened, has no first
        page, or cannot be rendered; the document is closed in every case.
    """
    doc = fitz.open(pdf_path)
    try:
        zoom = dpi / 72
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")
    finally:
        # Release the document handle even when page access or rendering
        # fails; previously an exception skipped the close() call.
        doc.close()
# Maximum number of layout elements printed and recorded per result.
_MAX_LAYOUT_ELEMENTS = 10

# Lazily created pipeline shared by every call in this process, so the
# (presumably expensive) PP-StructureV3 initialisation runs only once.
_SHARED_PIPELINE = None


def _get_pipeline():
    """Return the shared PP-StructureV3 pipeline, creating it on first use."""
    global _SHARED_PIPELINE
    if _SHARED_PIPELINE is None:
        print("Initializing PP-StructureV3...")
        _SHARED_PIPELINE = PPStructureV3(
            device="gpu:0",
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
        )
    return _SHARED_PIPELINE


def _get_field(obj, key, default=None):
    """Read *key* from a dict, an attribute-style object, or a dict-like object."""
    if isinstance(obj, dict):
        return obj.get(key, default)
    if hasattr(obj, key):
        return getattr(obj, key)
    getter = getattr(obj, "get", None)
    if callable(getter):
        return getter(key, default)
    return default


def _ocr_texts(ocr_pred):
    """Normalise a table OCR prediction to a plain list of entries."""
    if ocr_pred is None:
        return []
    if isinstance(ocr_pred, dict):
        # NOTE(review): PaddleX 3.x appears to return a mapping whose
        # "rec_texts" entry holds the recognised strings - confirm against
        # the installed version. Slicing the mapping itself would raise.
        texts = ocr_pred.get("rec_texts")
        return list(texts) if texts is not None else []
    return list(ocr_pred)


def _describe_table(index, table_res):
    """Print the details of one table result and return its summary dict."""
    # Debug: show which keys/attributes the result object exposes.
    if isinstance(table_res, dict):
        print(f" Table {index+1} keys: {list(table_res.keys())}")
    else:
        attrs = [a for a in dir(table_res) if not a.startswith('_')]
        print(f" Table {index+1} attrs: {attrs}")

    cell_boxes = _get_field(table_res, "cell_box_list", [])
    html = _get_field(table_res, "pred_html", "") or ""
    texts = _ocr_texts(_get_field(table_res, "table_ocr_pred", []))
    region_id = _get_field(table_res, "table_region_id", -1)
    # Compare against None rather than testing truthiness so that array-like
    # containers are never evaluated in a boolean context.
    cell_count = len(cell_boxes) if cell_boxes is not None else 0

    print(f" Table {index+1}:")
    print(f" - Cells: {cell_count}")
    print(f" - Region ID: {region_id}")
    print(f" - HTML length: {len(html)}")
    print(f" - OCR texts: {len(texts)}")
    if html:
        print(f" - HTML preview: {html[:300]}...")
    if texts:
        print(f" - First few OCR texts: {texts[:3]}")

    return {
        "index": index,
        "cell_count": cell_count,
        "region_id": region_id,
        "html": html[:1000],
        "ocr_count": len(texts),
    }


def test_table_detection(pdf_path: str) -> dict:
    """Test PP-StructureV3 table detection on the first page of a PDF.

    The page is rendered to a temporary PNG, passed to the pipeline's
    ``predict`` method, and the table and layout results are printed.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        A dict with the keys ``"pdf"`` (the input path), ``"tables"`` (one
        summary dict per detected table with ``index``, ``cell_count``,
        ``region_id``, ``html`` truncated to 1000 characters, and
        ``ocr_count``) and ``"elements"`` (up to ``_MAX_LAYOUT_ELEMENTS``
        ``{"label", "bbox"}`` dicts per result).
    """
    import tempfile

    print(f"\n{'='*60}")
    print(f"Testing: {Path(pdf_path).name}")
    print(f"{'='*60}")

    # Render PDF to image
    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)

    # Write to a unique temporary file instead of a fixed /tmp path, so the
    # script is portable and concurrent runs cannot overwrite each other.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        tmp.write(img_bytes)
        temp_img_path = tmp.name
    print(f"Saved temp image: {temp_img_path}")

    tables_found = []
    all_elements = []
    try:
        pipeline = _get_pipeline()

        # Run detection
        print("Running table detection...")
        results = pipeline.predict(temp_img_path)

        # Results are consumed inside the try block so the image still exists
        # if the pipeline produces them lazily.
        for result in results or []:
            table_res_list = _get_field(result, "table_res_list")
            if table_res_list:
                print(f" Found {len(table_res_list)} tables in table_res_list")
                for i, table_res in enumerate(table_res_list):
                    tables_found.append(_describe_table(i, table_res))

            # Get parsing results for all layout elements
            parsing_res_list = _get_field(result, "parsing_res_list")
            if parsing_res_list:
                print("\n Layout elements from parsing_res_list:")
                for elem in parsing_res_list[:_MAX_LAYOUT_ELEMENTS]:
                    label = _get_field(elem, "label", "unknown")
                    bbox = _get_field(elem, "bbox", [])
                    print(f" - {label}: {bbox}")
                    all_elements.append({"label": label, "bbox": bbox})
    finally:
        # Best-effort cleanup of the temporary image.
        try:
            Path(temp_img_path).unlink()
        except OSError:
            pass

    print("\nSummary:")
    print(f" Tables detected: {len(tables_found)}")
    print(f" Layout elements: {len(all_elements)}")
    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}
def main():
    """Run the table-detection test on up to five PDFs and print a summary.

    The PDF directory defaults to the original hard-coded location; a
    different directory may be given as the first command-line argument.
    """
    default_dir = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/data/admin_uploads"
    pdf_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(default_dir)

    # glob() yields files in no guaranteed order; sort so that "first 5" is
    # the same selection on every run.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))[:5]  # Test first 5
    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return
    print(f"Found {len(pdf_files)} PDF files")

    all_results = [test_table_detection(str(pdf_file)) for pdf_file in pdf_files]

    # Summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    total_tables = sum(len(r["tables"]) for r in all_results)
    print(f"Total PDFs tested: {len(all_results)}")
    print(f"Total tables detected: {total_tables}")
    for r in all_results:
        pdf_name = Path(r["pdf"]).name
        table_count = len(r["tables"])
        print(f" {pdf_name}: {table_count} tables")
        for t in r["tables"]:
            print(f" - Table {t['index']+1}: {t['cell_count']} cells")


# Run the POC only when this file is executed as a script.
if __name__ == "__main__":
    main()