Update paddle, and support invoice line item
This commit is contained in:
154
scripts/ppstructure_poc.py
Normal file
154
scripts/ppstructure_poc.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PP-StructureV3 POC Script
|
||||
|
||||
Tests table detection on real Swedish invoices using PP-StructureV3.
|
||||
Run with invoice-sm120 conda environment.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add the backend package directory to the import path.
# Resolve __file__ first: when the interpreter hands us a relative script path
# (e.g. the script is launched from inside scripts/), an unresolved
# ``.parent.parent`` collapses to "." instead of the real project root.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root / "packages" / "backend"))
|
||||
|
||||
from paddleocr import PPStructureV3
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
    """Render the first page of a PDF to PNG-encoded bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target resolution. The page is scaled by ``dpi / 72`` on both
            axes before rasterising.

    Returns:
        The rendered first page, encoded as PNG.
    """
    # Use the document as a context manager so it is closed even when
    # rendering or encoding raises; the explicit close() used previously was
    # skipped on any exception.
    with fitz.open(pdf_path) as doc:
        zoom = dpi / 72
        first_page = doc[0]
        pix = first_page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")
|
||||
|
||||
|
||||
# Pipeline shared by every call. It is constructed with constant arguments,
# so building it once and reusing it avoids repeating that work per PDF.
_PIPELINE = None


def _get_pipeline():
    """Return the shared PP-StructureV3 pipeline, creating it on first use."""
    global _PIPELINE
    if _PIPELINE is None:
        print("Initializing PP-StructureV3...")
        _PIPELINE = PPStructureV3(
            device="gpu:0",
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
        )
    return _PIPELINE


def _read_field(obj, key, default=None):
    """Read ``key`` from a dict, an attribute-style object, or a mapping-like object."""
    if isinstance(obj, dict):
        return obj.get(key, default)
    if hasattr(obj, key):
        return getattr(obj, key)
    getter = getattr(obj, "get", None)
    return getter(key, default) if callable(getter) else default


def test_table_detection(pdf_path: str) -> dict:
    """Run PP-StructureV3 on the first page of a PDF and report what it finds.

    The first page is rendered to a PNG in a private temporary directory
    (removed again before returning), fed to the pipeline, and the table and
    layout results are printed and collected.

    Args:
        pdf_path: Path of the PDF file to test.

    Returns:
        A dict with three keys:
        ``"pdf"`` - the input path,
        ``"tables"`` - one dict per detected table with ``index``,
        ``cell_count``, ``region_id``, ``html`` (first 1000 characters) and
        ``ocr_count``,
        ``"elements"`` - up to ten ``{"label", "bbox"}`` dicts per result
        taken from ``parsing_res_list``.
    """
    import tempfile

    print(f"\n{'='*60}")
    print(f"Testing: {Path(pdf_path).name}")
    print(f"{'='*60}")

    # Render PDF to image
    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)

    # A per-call temporary directory replaces the fixed /tmp path: it is
    # portable, cannot collide with another run, and is cleaned up on exit.
    with tempfile.TemporaryDirectory(prefix="ppstructure_poc_") as tmp_dir:
        temp_img_path = str(Path(tmp_dir) / "test_invoice.png")
        Path(temp_img_path).write_bytes(img_bytes)
        print(f"Saved temp image: {temp_img_path}")

        pipeline = _get_pipeline()

        # Run detection
        print("Running table detection...")
        # Materialise inside the ``with`` block: if predict() yields lazily,
        # the image file must still exist while the results are consumed.
        results = list(pipeline.predict(temp_img_path) or [])

    tables_found = []
    all_elements = []

    for result in results:
        # Table results
        table_res_list = _read_field(result, "table_res_list")

        if table_res_list:
            print(f" Found {len(table_res_list)} tables in table_res_list")
            for i, table_res in enumerate(table_res_list):
                # Debug: show what the table result exposes
                if isinstance(table_res, dict):
                    print(f" Table {i+1} keys: {list(table_res.keys())}")
                else:
                    print(f" Table {i+1} attrs: {[a for a in dir(table_res) if not a.startswith('_')]}")

                # Read through _read_field so a non-dict table result (the
                # case the debug branch above anticipates) does not raise
                # AttributeError on ``.get``.
                cell_boxes = _read_field(table_res, "cell_box_list", [])
                html = _read_field(table_res, "pred_html", "")
                ocr_text = _read_field(table_res, "table_ocr_pred", [])
                region_id = _read_field(table_res, "table_region_id", -1)

                # ``is not None`` rather than truthiness: cell_boxes may be an
                # array-like whose truth value is ambiguous.
                cell_count = len(cell_boxes) if cell_boxes is not None else 0
                ocr_count = len(ocr_text) if ocr_text else 0

                print(f" Table {i+1}:")
                print(f" - Cells: {cell_count}")
                print(f" - Region ID: {region_id}")
                print(f" - HTML length: {len(html) if html else 0}")
                print(f" - OCR texts: {ocr_count}")

                if html:
                    print(f" - HTML preview: {html[:300]}...")

                # NOTE(review): slicing assumes table_ocr_pred is a sequence;
                # confirm against the PaddleX result schema.
                if ocr_text and len(ocr_text) > 0:
                    print(f" - First few OCR texts: {ocr_text[:3]}")

                tables_found.append({
                    "index": i,
                    "cell_count": cell_count,
                    "region_id": region_id,
                    "html": html[:1000] if html else "",
                    "ocr_count": ocr_count,
                })

        # Layout elements
        parsing_res_list = _read_field(result, "parsing_res_list")

        if parsing_res_list:
            print(f"\n Layout elements from parsing_res_list:")
            for elem in parsing_res_list[:10]:  # Show first 10
                label = _read_field(elem, "label", "unknown")
                bbox = _read_field(elem, "bbox", [])
                print(f" - {label}: {bbox}")
                all_elements.append({"label": label, "bbox": bbox})

    print(f"\nSummary:")
    print(f" Tables detected: {len(tables_found)}")
    print(f" Layout elements: {len(all_elements)}")

    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}
|
||||
|
||||
|
||||
def main(pdf_dir: "str | Path | None" = None, max_files: int = 5) -> None:
    """Run table detection over a handful of PDFs and print a summary.

    Args:
        pdf_dir: Directory to search for ``*.pdf`` files. When omitted, the
            first command-line argument is used if present, otherwise the
            original hard-coded admin_uploads location.
        max_files: Maximum number of PDFs to test.
    """
    if pdf_dir is None:
        if len(sys.argv) > 1:
            pdf_dir = sys.argv[1]
        else:
            pdf_dir = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/data/admin_uploads"
    pdf_dir = Path(pdf_dir)

    # Sort before truncating so the "first N" selection is reproducible;
    # glob() makes no ordering promise.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))[:max_files]

    if not pdf_files:
        # Name the directory actually searched, since it can now be overridden.
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files")

    all_results = [test_table_detection(str(pdf_file)) for pdf_file in pdf_files]

    # Summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    total_tables = sum(len(r["tables"]) for r in all_results)
    print(f"Total PDFs tested: {len(all_results)}")
    print(f"Total tables detected: {total_tables}")

    for r in all_results:
        pdf_name = Path(r["pdf"]).name
        table_count = len(r["tables"])
        print(f" {pdf_name}: {table_count} tables")
        for t in r["tables"]:
            print(f" - Table {t['index']+1}: {t['cell_count']} cells")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user