This commit is contained in:
Yaojia Wang
2026-02-12 23:06:00 +01:00
parent ad5ed46b4c
commit 58d36c8927
26 changed files with 3903 additions and 2551 deletions

54
scripts/render_pdfs_v3.py Normal file
View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Render selected PDFs from v3 batch for visual comparison."""
import os
import fitz # PyMuPDF
# Directory the source PDFs are read from; each name in SELECTED is joined
# onto this path by the render loop below.
PDF_DIR = "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document/Billo.Platform.Document.AdminAPI/downloads/to_check"
# Directory the rendered PNGs are written to (created on demand by the script).
OUTPUT_DIR = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/scripts/pdf_renders_v3"
# Ten hand-picked PDFs, each chosen for a different extraction scenario.
# The comment above each entry records why that file is worth eyeballing.
SELECTED = [
    # Possibly wrong Amount (81648164.00 - implausibly high?)
    "b84c7d70-821d-4a1a-9be7-d7bb2392bd91.pdf",
    # Only 2 fields were extracted
    "072571e2-da5f-4268-b1a8-f0e5a85a3ec4.pdf",
    # InvoiceNumber=5085 (suspiciously short; same as the BG prefix?)
    "6a83ba35-afdf-4c13-ade1-25513e213637.pdf",
    # InvoiceNumber=450 (very short; might be wrong)
    "8551b540-d93d-459d-b7eb-e9ee086f9f16.pdf",
    # InvoiceNumber=134 (very short; same as the BG prefix)
    "cb1bd3b1-63d0-4140-930f-e4a7ae2b6cd5.pdf",
    # Large Amount=172904.52, InvoiceNumber=89902
    "d121a5ee-7382-41d8-8010-63880def1f96.pdf",
    # Good 9-field PDF, used as a positive check
    "6cb90895-e52b-4831-b57b-7cb968bcdd54.pdf",
    # Amount=2026.00 (equals the year - could the two have been confused?)
    "d376c5b5-0dc5-4ccf-b787-0d481eef8577.pdf",
    # 8 fields, good coverage
    "f3f5da6f-7552-4ec6-8625-3629042fbfd0.pdf",
    # Low-confidence Amount=596.49
    "5783e4af-eef3-411c-84b1-3a8f4694fed8.pdf",
]
# Render the first page of every selected PDF to a PNG in OUTPUT_DIR.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Scale factor for a 150 DPI render (PDF user space is 72 units per inch).
# The matrix is the same for every file, so it is built once up front.
mat = fitz.Matrix(150 / 72, 150 / 72)

for pdf_name in SELECTED:
    pdf_path = os.path.join(PDF_DIR, pdf_name)
    if not os.path.exists(pdf_path):
        print(f"SKIP {pdf_name} - not found")
        continue

    # Swap only the file extension; str.replace would also rewrite any
    # ".pdf" occurring elsewhere in the name.
    out_name = os.path.splitext(pdf_name)[0] + ".png"
    out_path = os.path.join(OUTPUT_DIR, out_name)

    # The context manager closes the document even when rendering or
    # saving raises, so no file handle is leaked on failure.
    with fitz.open(pdf_path) as doc:
        # Only the first page is rendered for the visual comparison.
        pix = doc[0].get_pixmap(matrix=mat)
        pix.save(out_path)
        print(f"Rendered {pdf_name} -> {out_name} ({pix.width}x{pix.height})")

print(f"\nAll renders saved to {OUTPUT_DIR}")