#!/usr/bin/env python3
"""
PP-StructureV3 POC Script

Tests table detection on real Swedish invoices using PP-StructureV3.
Run with the invoice-sm120 conda environment.
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
project_root = Path(__file__).parent.parent
|
|
sys.path.insert(0, str(project_root / "packages" / "backend"))
|
|
|
|
from paddleocr import PPStructureV3
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
def render_pdf_to_image(pdf_path: str, dpi: int = 200, page_index: int = 0) -> bytes:
    """Render one page of a PDF to PNG-encoded bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target resolution. The page is scaled by ``dpi / 72`` on both
            axes before rasterizing.
        page_index: Zero-based index of the page to render. Defaults to the
            first page, which is what existing callers rely on.

    Returns:
        The rendered page as PNG bytes.

    Raises:
        Any exception raised by ``fitz`` while opening, indexing or rendering
        propagates unchanged; the document is closed before it does.
    """
    zoom = dpi / 72
    doc = fitz.open(pdf_path)
    try:
        page = doc[page_index]
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")
    finally:
        # Close even when page lookup or rendering fails, so a bad PDF does
        # not leak an open document handle.
        doc.close()
|
|
|
|
|
|
def _read_field(obj, key, default=None):
    """Read *key* from a result object exposing either ``.get()`` or attributes."""
    getter = getattr(obj, "get", None)
    if callable(getter):
        return getter(key, default)
    return getattr(obj, key, default)


def _safe_len(value) -> int:
    """Return ``len(value)``, treating ``None`` and unsized objects as empty."""
    if value is None:
        return 0
    try:
        return len(value)
    except TypeError:
        return 0


def _ocr_texts(ocr_pred) -> list:
    """Normalize a table's OCR prediction into a plain list of entries."""
    if ocr_pred is None:
        return []
    if isinstance(ocr_pred, dict):
        # NOTE(review): PaddleX 3.x documents ``table_ocr_pred`` as a dict
        # whose recognized strings live under "rec_texts" - confirm against
        # the installed version. Slicing the dict directly would raise.
        ocr_pred = ocr_pred.get("rec_texts")
        if ocr_pred is None:
            return []
    return list(ocr_pred)


def _summarize_tables(table_res_list) -> list:
    """Print details for each table result and return one summary dict per table."""
    if not table_res_list:
        return []

    print(f" Found {len(table_res_list)} tables in table_res_list")
    summaries = []
    for i, table_res in enumerate(table_res_list):
        # Debug aid: show which fields this result object actually carries.
        if isinstance(table_res, dict):
            print(f" Table {i+1} keys: {list(table_res.keys())}")
        else:
            print(f" Table {i+1} attrs: {[a for a in dir(table_res) if not a.startswith('_')]}")

        # Field names follow the PaddleX 3.x table result layout.
        cell_count = _safe_len(_read_field(table_res, "cell_box_list", []))
        html = _read_field(table_res, "pred_html", "") or ""
        ocr_texts = _ocr_texts(_read_field(table_res, "table_ocr_pred", []))
        region_id = _read_field(table_res, "table_region_id", -1)

        print(f" Table {i+1}:")
        print(f" - Cells: {cell_count}")
        print(f" - Region ID: {region_id}")
        print(f" - HTML length: {len(html)}")
        print(f" - OCR texts: {len(ocr_texts)}")
        if html:
            print(f" - HTML preview: {html[:300]}...")
        if ocr_texts:
            print(f" - First few OCR texts: {ocr_texts[:3]}")

        summaries.append({
            "index": i,
            "cell_count": cell_count,
            "region_id": region_id,
            # Keep the stored HTML bounded; the full markup can be large.
            "html": html[:1000],
            "ocr_count": len(ocr_texts),
        })
    return summaries


def _collect_layout_elements(parsing_res_list, max_shown: int = 10) -> list:
    """Return every layout element as a dict, printing only the first *max_shown*."""
    if not parsing_res_list:
        return []

    print("\n Layout elements from parsing_res_list:")
    elements = []
    for idx, elem in enumerate(parsing_res_list):
        label = _read_field(elem, "label", "unknown")
        bbox = _read_field(elem, "bbox", [])
        # Only the console output is capped; the returned list stays complete
        # so the summary count reflects what was actually detected.
        if idx < max_shown:
            print(f" - {label}: {bbox}")
        elements.append({"label": label, "bbox": bbox})

    hidden = len(elements) - max_shown
    if hidden > 0:
        print(f" ... and {hidden} more")
    return elements


def test_table_detection(pdf_path: str, pipeline=None) -> dict:
    """Run PP-StructureV3 on the first page of a PDF and report detected tables.

    The page is rendered to a PNG in a private temporary directory, passed to
    the pipeline, and the temporary file is removed before returning.

    Args:
        pdf_path: Path of the PDF to analyse.
        pipeline: Optional pre-built ``PPStructureV3`` instance to reuse across
            calls. When omitted, a new GPU pipeline is created for this call,
            as before.

    Returns:
        A dict with keys ``"pdf"`` (the input path), ``"tables"`` (one summary
        dict per detected table: ``index``, ``cell_count``, ``region_id``,
        ``html`` truncated to 1000 characters, ``ocr_count``) and
        ``"elements"`` (``label``/``bbox`` dicts for the layout elements).
    """
    import tempfile  # local import keeps this block self-contained

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Testing: {Path(pdf_path).name}")
    print(banner)

    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)

    # A per-call temporary directory avoids the fixed /tmp path: it works on
    # any platform, cannot collide with a concurrent run, and is cleaned up.
    with tempfile.TemporaryDirectory(prefix="ppstructure_poc_") as tmp_dir:
        temp_img_path = str(Path(tmp_dir) / "test_invoice.png")
        with open(temp_img_path, "wb") as f:
            f.write(img_bytes)
        print(f"Saved temp image: {temp_img_path}")

        if pipeline is None:
            print("Initializing PP-StructureV3...")
            pipeline = PPStructureV3(
                device="gpu:0",
                use_doc_orientation_classify=False,
                use_doc_unwarping=False,
            )

        print("Running table detection...")
        # Materialize inside the ``with`` so a lazily evaluated result is
        # fully consumed while the image file still exists.
        results = list(pipeline.predict(temp_img_path) or [])

    tables_found = []
    all_elements = []
    for result in results:
        tables_found.extend(_summarize_tables(_read_field(result, "table_res_list")))
        all_elements.extend(
            _collect_layout_elements(_read_field(result, "parsing_res_list"))
        )

    print("\nSummary:")
    print(f" Tables detected: {len(tables_found)}")
    print(f" Layout elements: {len(all_elements)}")

    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}
|
|
|
|
|
|
def main(pdf_dir=None, limit: int = 5) -> None:
    """Run table detection on a sample of PDFs and print an overall summary.

    Args:
        pdf_dir: Directory to search for ``*.pdf`` files. When omitted, the
            first command-line argument is used if present; otherwise the
            original hard-coded ``admin_uploads`` directory is used.
        limit: Maximum number of PDFs to process. ``None`` processes all.
    """
    if pdf_dir is None:
        # Let the script run on other machines without editing the source.
        if len(sys.argv) > 1:
            pdf_dir = sys.argv[1]
        else:
            pdf_dir = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/data/admin_uploads"
    pdf_dir = Path(pdf_dir)

    # glob() order depends on the filesystem; sort so the same files are
    # picked on every run.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))[:limit]

    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files")

    all_results = [test_table_detection(str(pdf_file)) for pdf_file in pdf_files]

    banner = "=" * 60
    print(f"\n{banner}")
    print("FINAL SUMMARY")
    print(banner)

    total_tables = sum(len(r["tables"]) for r in all_results)
    print(f"Total PDFs tested: {len(all_results)}")
    print(f"Total tables detected: {total_tables}")

    for r in all_results:
        print(f" {Path(r['pdf']).name}: {len(r['tables'])} tables")
        for t in r["tables"]:
            print(f" - Table {t['index']+1}: {t['cell_count']} cells")
|
|
|
|
|
|
# Run the POC only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|