#!/usr/bin/env python3
"""
PP-StructureV3 POC Script

Tests table detection on real Swedish invoices using PP-StructureV3.
Run with invoice-sm120 conda environment.

Usage:
    python <this script> [PDF_DIR]

If PDF_DIR is omitted, DEFAULT_PDF_DIR is used.
"""

import sys
import tempfile
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "packages" / "backend"))

from paddleocr import PPStructureV3
import fitz  # PyMuPDF

# Directory scanned for invoices when no CLI argument is given.
DEFAULT_PDF_DIR = Path(
    "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/data/admin_uploads"
)
# Upper bound on how many PDFs a single run processes.
MAX_PDFS = 5
# Upper bound on how many layout elements are reported per result.
MAX_LAYOUT_ELEMENTS = 10


def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
    """Render the first page of a PDF to PNG bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target resolution; the page is scaled by ``dpi / 72``.

    Returns:
        The PNG-encoded image of page 0.
    """
    zoom = dpi / 72
    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")


def _create_pipeline():
    """Build the PP-StructureV3 pipeline with the settings used by this POC."""
    print("Initializing PP-StructureV3...")
    return PPStructureV3(
        device="gpu:0",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
    )


def _get_field(obj, key, default=None):
    """Read ``key`` from a dict, an attribute-style object, or a ``.get`` provider."""
    if isinstance(obj, dict):
        return obj.get(key, default)
    if hasattr(obj, key):
        return getattr(obj, key)
    getter = getattr(obj, "get", None)
    if callable(getter):
        return getter(key, default)
    return default


def _ocr_texts(ocr_pred) -> list:
    """Normalize the ``table_ocr_pred`` value to a plain list of entries.

    NOTE(review): PaddleX 3.x appears to return a dict holding a
    ``rec_texts`` list here - confirm against the installed version.
    A plain sequence is accepted as well.
    """
    if ocr_pred is None:
        return []
    if isinstance(ocr_pred, dict):
        texts = ocr_pred.get("rec_texts")
        return list(texts) if texts is not None else []
    return list(ocr_pred)


def _summarize_tables(result) -> list:
    """Print and collect a summary of every entry in ``table_res_list``."""
    table_res_list = _get_field(result, "table_res_list")
    if not table_res_list:
        return []

    print(f" Found {len(table_res_list)} tables in table_res_list")
    summaries = []
    for i, table_res in enumerate(table_res_list):
        # Debug: show which keys/attributes the table result exposes.
        if isinstance(table_res, dict):
            print(f" Table {i+1} keys: {list(table_res.keys())}")
        else:
            attrs = [a for a in dir(table_res) if not a.startswith("_")]
            print(f" Table {i+1} attrs: {attrs}")

        # Key names follow the PaddleX 3.x table result layout.
        cell_boxes = _get_field(table_res, "cell_box_list", [])
        html = _get_field(table_res, "pred_html", "") or ""
        ocr_text = _ocr_texts(_get_field(table_res, "table_ocr_pred", []))
        region_id = _get_field(table_res, "table_region_id", -1)
        # Explicit None check: truth-testing an array-like may be ambiguous.
        cell_count = len(cell_boxes) if cell_boxes is not None else 0

        print(f" Table {i+1}:")
        print(f" - Cells: {cell_count}")
        print(f" - Region ID: {region_id}")
        print(f" - HTML length: {len(html)}")
        print(f" - OCR texts: {len(ocr_text)}")
        if html:
            print(f" - HTML preview: {html[:300]}...")
        if ocr_text:
            print(f" - First few OCR texts: {ocr_text[:3]}")

        summaries.append({
            "index": i,
            "cell_count": cell_count,
            "region_id": region_id,
            "html": html[:1000],
            "ocr_count": len(ocr_text),
        })
    return summaries


def _summarize_elements(result) -> list:
    """Print and collect label/bbox for the first layout elements of a result."""
    parsing_res_list = _get_field(result, "parsing_res_list")
    if not parsing_res_list:
        return []

    print(f"\n Layout elements from parsing_res_list:")
    elements = []
    for elem in parsing_res_list[:MAX_LAYOUT_ELEMENTS]:
        label = _get_field(elem, "label", "unknown")
        bbox = _get_field(elem, "bbox", [])
        print(f" - {label}: {bbox}")
        elements.append({"label": label, "bbox": bbox})
    return elements


def test_table_detection(pdf_path: str, pipeline=None) -> dict:
    """Test PP-StructureV3 table detection on a PDF.

    Args:
        pdf_path: Path of the PDF whose first page is analysed.
        pipeline: Optional, already initialized PP-StructureV3 pipeline.
            When omitted a new one is created, which is slow; pass a
            shared instance when processing several files.

    Returns:
        A dict with the keys ``pdf`` (the input path), ``tables`` (one
        summary dict per detected table) and ``elements`` (label/bbox
        dicts for the reported layout elements).
    """
    print(f"\n{'='*60}")
    print(f"Testing: {Path(pdf_path).name}")
    print(f"{'='*60}")

    # Render PDF to image
    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)

    if pipeline is None:
        pipeline = _create_pipeline()

    tables_found = []
    all_elements = []

    # A private temporary directory avoids a fixed /tmp path (not portable,
    # collides between concurrent runs) and is removed automatically.
    with tempfile.TemporaryDirectory(prefix="ppstructure_poc_") as temp_dir:
        temp_img_path = Path(temp_dir) / "test_invoice.png"
        temp_img_path.write_bytes(img_bytes)
        print(f"Saved temp image: {temp_img_path}")

        # Run detection
        print("Running table detection...")
        results = pipeline.predict(str(temp_img_path))

        # Consume the results while the image still exists, in case
        # predict() evaluates lazily.
        for result in results or []:
            tables_found.extend(_summarize_tables(result))
            all_elements.extend(_summarize_elements(result))

    print(f"\nSummary:")
    print(f" Tables detected: {len(tables_found)}")
    print(f" Layout elements: {len(all_elements)}")

    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}


def main():
    """Run table detection on the first PDFs of a directory and print a summary.

    The directory is taken from the first command-line argument when
    present, otherwise DEFAULT_PDF_DIR is used.
    """
    pdf_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_PDF_DIR

    # Sorted so that repeated runs pick the same files.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))[:MAX_PDFS]

    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files")

    # Load the models once and reuse them for every PDF.
    pipeline = _create_pipeline()

    all_results = []
    for pdf_file in pdf_files:
        all_results.append(test_table_detection(str(pdf_file), pipeline=pipeline))

    # Summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")

    total_tables = sum(len(r["tables"]) for r in all_results)
    print(f"Total PDFs tested: {len(all_results)}")
    print(f"Total tables detected: {total_tables}")

    for r in all_results:
        pdf_name = Path(r["pdf"]).name
        table_count = len(r["tables"])
        print(f" {pdf_name}: {table_count} tables")
        for t in r["tables"]:
            print(f" - Table {t['index']+1}: {t['cell_count']} cells")


if __name__ == "__main__":
    main()