This commit is contained in:
Yaojia Wang
2026-02-12 23:06:00 +01:00
parent ad5ed46b4c
commit 58d36c8927
26 changed files with 3903 additions and 2551 deletions

103
scripts/analyze_v3.py Normal file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""Analyze batch inference v3 results (Round 2 fixes)."""
import json
from collections import Counter

# Load the per-PDF results written by the batch inference run.
with open("scripts/inference_results_v3.json") as f:
    results = json.load(f)

total = len(results)
success = sum(1 for r in results if r["status"] == 200)
print(f"Total PDFs: {total}, Successful: {success}")
print()

# Summary table
print(f"{'PDF':<40} {'Det':<4} {'Fld':<4} {'Time':<7} Fields")
print("-" * 140)
# One summary line per PDF: name, detection count, field count, elapsed
# seconds, and a compact "key=value" dump of every extracted field.
for r in results:
    name = r["filename"][:39]
    result_data = r.get("data", {}).get("result", {})
    fields = result_data.get("fields", {})
    det_count = len(result_data.get("detections", []))
    elapsed = r["time_seconds"]
    pieces = []
    for key, value in fields.items():
        text = str(value)
        if len(text) > 30:
            # Truncate long values so the table stays readable.
            text = text[:27] + "..."
        pieces.append(f"{key}={text}")
    summary = ", ".join(pieces)
    print(f"{name:<40} {det_count:<4} {len(fields):<4} {elapsed:<7} {summary}")
print()
# Field coverage (successful responses only)
successful = [r for r in results if r["status"] == 200]
ok_count = len(successful)
field_counts: Counter = Counter()
conf_sums: Counter = Counter()
for r in successful:
    payload = r["data"]["result"]
    # Count each extracted field name once per PDF.
    field_counts.update(payload.get("fields", {}).keys())
    # Accumulate confidences so we can compute a per-field average below.
    for key, conf in (payload.get("confidence") or {}).items():
        conf_sums[key] += conf

print(f"Field Coverage ({ok_count} successful PDFs):")
hdr = f"{'Field':<35} {'Present':<10} {'Rate':<10} {'Avg Conf':<10}"
print(hdr)
print("-" * 65)
for field in [
    "InvoiceNumber", "InvoiceDate", "InvoiceDueDate", "OCR",
    "Amount", "Bankgiro", "Plusgiro",
    "supplier_organisation_number", "customer_number", "payment_line",
]:
    cnt = field_counts.get(field, 0)
    rate = cnt / ok_count * 100 if ok_count else 0
    avg_conf = conf_sums.get(field, 0) / cnt if cnt else 0
    # Flag poorly covered fields: <30% is critical, <60% is suspect.
    flag = " <<<" if rate < 30 else (" !!" if rate < 60 else "")
    print(f"{field:<35} {cnt:<10} {rate:<10.1f} {avg_conf:<10.3f}{flag}")
# Fallback count: how many successful runs needed the fallback extractor.
fb_count = sum(
    1
    for r in results
    if r["status"] == 200 and r["data"]["result"].get("fallback_used")
)
print(f"\nFallback used: {fb_count}/{ok_count}")

# Low-confidence fields
print("\nLow-confidence extractions (< 0.7):")
for r in results:
    if r["status"] != 200:
        continue
    result_data = r["data"]["result"]
    for k, v in (result_data.get("confidence") or {}).items():
        if v < 0.7:
            fv = result_data.get("fields", {}).get(k, "?")
            print(f" [{v:.3f}] {k:<25} = {str(fv)[:40]:<40} ({r['filename'][:36]})")

# PDFs with very few fields (possible issues)
print("\nPDFs with <= 2 fields extracted:")
for r in results:
    if r["status"] != 200:
        continue
    result_data = r["data"]["result"]
    fields = result_data.get("fields", {})
    if len(fields) <= 2:
        print(f" {r['filename']}: {len(fields)} fields - {list(fields.keys())}")

# Avg time -- guard against an empty results list (the original raised
# ZeroDivisionError when inference_results_v3.json contained no entries).
if results:
    avg_time = sum(r["time_seconds"] for r in results) / len(results)
    print(f"\nAverage processing time: {avg_time:.2f}s")
else:
    print("\nNo results; skipping average processing time.")

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""Batch inference v3 - 30 random PDFs for Round 2 validation."""
import json
import os
import random
import time
import requests

# Directory of candidate invoice PDFs to sample from.
PDF_DIR = "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document/Billo.Platform.Document.AdminAPI/downloads/to_check"
# Inference endpoint of the locally running extraction API.
API_URL = "http://localhost:8000/api/v1/infer"
# Per-PDF results (status, timing, payload) are written here as JSON.
OUTPUT_FILE = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/scripts/inference_results_v3.json"
# Number of PDFs to sample per run.
SAMPLE_SIZE = 30
def main():
    """Run batch inference over a random sample of PDFs and save results.

    Posts each sampled PDF from PDF_DIR to the local inference API,
    records status code, timing, file size and (on success) the parsed
    response payload, then writes everything to OUTPUT_FILE as JSON.
    """
    # Fixed seed so the "random" sample is reproducible between runs.
    # NOTE(review): this comment said "Round 3" while the module docstring
    # says Round 2 / v3 -- confirm which round this seed belongs to.
    random.seed(99_2026)
    all_pdfs = [f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf")]
    selected = random.sample(all_pdfs, min(SAMPLE_SIZE, len(all_pdfs)))
    print(f"Selected {len(selected)} random PDFs for inference")
    results = []
    for i, filename in enumerate(selected, 1):
        filepath = os.path.join(PDF_DIR, filename)
        filesize = os.path.getsize(filepath)
        # BUG FIX: the progress line printed a literal "(unknown)"
        # placeholder instead of the file actually being processed.
        print(f"[{i}/{len(selected)}] Processing {filename}...", end=" ", flush=True)
        start = time.time()
        try:
            with open(filepath, "rb") as f:
                resp = requests.post(
                    API_URL,
                    files={"file": (filename, f, "application/pdf")},
                    timeout=120,
                )
            elapsed = round(time.time() - start, 2)
            record = {
                "filename": filename,
                "status": resp.status_code,
                "time_seconds": elapsed,
                "filesize": filesize,
            }
            if resp.status_code == 200:
                data = resp.json()
                # Count populated extraction fields, ignoring metadata keys.
                field_count = sum(
                    1 for k, v in data.items()
                    if k not in (
                        "DocumentId", "confidence", "success", "fallback_used",
                        "bboxes", "cross_validation", "processing_time_ms",
                        "line_items", "vat_summary", "vat_validation",
                        "raw_detections", "detection_classes", "detection_count",
                    )
                    and v is not None
                )
                det_count = data.get("detection_count", "?")
                print(f"OK ({elapsed}s) - {field_count} fields, {det_count} detections")
                record["data"] = data
            else:
                print(f"HTTP {resp.status_code} ({elapsed}s)")
                record["error"] = resp.text[:200]
            results.append(record)
        except Exception as e:
            # Network errors, timeouts, unreadable files: record and move on
            # so one bad PDF does not abort the whole batch.
            elapsed = round(time.time() - start, 2)
            print(f"FAILED ({elapsed}s) - {e}")
            results.append({
                "filename": filename,
                "status": -1,
                "time_seconds": elapsed,
                "filesize": filesize,
                "error": str(e),
            })
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to {OUTPUT_FILE}")
    success = sum(1 for r in results if r["status"] == 200)
    failed = len(results) - success
    print(f"Total: {len(results)}, Success: {success}, Failed: {failed}")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large. Load Diff

View File

@@ -1,387 +0,0 @@
#!/usr/bin/env python3
"""
PP-StructureV3 Line Items Extraction POC
Tests line items extraction from Swedish invoices using PP-StructureV3.
Parses HTML table structure to extract structured line item data.
Run with invoice-sm120 conda environment.
"""
import sys
import re
from pathlib import Path
from html.parser import HTMLParser
from dataclasses import dataclass

# Add project root to path so the backend package resolves when this
# script is run directly from the scripts/ directory.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "packages" / "backend"))

from paddleocr import PPStructureV3
import fitz  # PyMuPDF
@dataclass
class LineItem:
    """Single line item (one table row) extracted from an invoice.

    Values are kept as raw OCR strings (no numeric parsing); ``None``
    means the column was absent or empty in the source table.
    """
    row_index: int  # 0-based position among the data rows of the table
    article_number: str | None
    description: str | None
    quantity: str | None
    unit: str | None
    unit_price: str | None
    amount: str | None
    vat_rate: str | None
    confidence: float = 0.9  # fixed default; not derived from OCR scores
class TableHTMLParser(HTMLParser):
    """Collect an HTML table into a header row plus a list of body rows.

    Rows found inside <thead> populate ``header_row``; all other rows are
    appended to ``rows``.  Cell text is whitespace-stripped.
    """

    def __init__(self):
        super().__init__()
        self.rows: list[list[str]] = []
        self.current_row: list[str] = []
        self.current_cell: str = ""
        self.in_td = False
        self.in_thead = False
        self.header_row: list[str] = []

    def handle_starttag(self, tag, attrs):
        if tag == "thead":
            self.in_thead = True
        elif tag == "tr":
            self.current_row = []
        elif tag in ("td", "th"):
            self.current_cell = ""
            self.in_td = True

    def handle_endtag(self, tag):
        if tag in ("td", "th"):
            self.current_row.append(self.current_cell.strip())
            self.in_td = False
        elif tag == "tr" and self.current_row:
            # A row inside <thead> becomes the header; everything else is data.
            if self.in_thead:
                self.header_row = self.current_row
            else:
                self.rows.append(self.current_row)
        elif tag == "thead":
            self.in_thead = False

    def handle_data(self, data):
        # Accumulate text only while inside a cell; nested inline tags
        # (e.g. <b>) keep in_td set, so their text is collected too.
        if self.in_td:
            self.current_cell += data
# Swedish column name mappings
# Note: Some headers may contain multiple column names merged together
# Keys are LineItem field names; values are candidate header substrings in
# *normalized* form (lowercased, '.' removed, '-' replaced by a space --
# see normalize_header()).  map_columns() prefers exact matches, then the
# longest substring of length >= 3.
# NOTE(review): 'art.nr' and 'á-pris' contain '.'/'-' which
# normalize_header strips from headers, so those two patterns can never
# match -- 'art nr' and 'à pris' appear to cover them; confirm and prune.
COLUMN_MAPPINGS = {
    'article_number': ['art nummer', 'artikelnummer', 'artikel', 'artnr', 'art.nr', 'art nr'],
    'description': ['beskrivning', 'produktbeskrivning', 'produkt', 'tjänst', 'text', 'benämning', 'vara/tjänst', 'vara'],
    'quantity': ['antal', 'qty', 'st', 'pcs', 'kvantitet'],
    'unit': ['enhet', 'unit'],
    'unit_price': ['á-pris', 'a-pris', 'pris', 'styckpris', 'enhetspris', 'à pris'],
    'amount': ['belopp', 'summa', 'total', 'netto', 'rad summa'],
    'vat_rate': ['moms', 'moms%', 'vat', 'skatt', 'moms %'],
}
def normalize_header(header: str) -> str:
    """Normalize a header cell: lowercase, strip, drop '.', map '-' to ' '."""
    cleaned = header.lower().strip()
    return cleaned.translate(str.maketrans({".": "", "-": " "}))
def map_columns(headers: list[str]) -> dict[int, str]:
    """Map column indices to LineItem field names.

    An exact match against a COLUMN_MAPPINGS pattern wins outright;
    otherwise the longest substring match (minimum 3 characters) is kept.
    Unmatched or empty headers are simply omitted from the result.
    """
    EXACT_BONUS = 100  # lifts exact matches above any substring match
    mapping: dict[int, str] = {}
    for idx, raw in enumerate(headers):
        normalized = normalize_header(raw)
        if not normalized.strip():
            continue  # skip empty headers
        best_field = None
        best_score = 0
        for field, patterns in COLUMN_MAPPINGS.items():
            exact = False
            for pattern in patterns:
                if pattern == normalized:
                    best_field = field
                    best_score = len(pattern) + EXACT_BONUS
                    exact = True
                    break
                # Substring match: require >= 3 chars and prefer longer hits.
                if len(pattern) >= 3 and pattern in normalized and len(pattern) > best_score:
                    best_field = field
                    best_score = len(pattern)
            if exact:
                break  # nothing can beat an exact match
        if best_field:
            mapping[idx] = best_field
    return mapping
def parse_table_html(html: str) -> tuple[list[str], list[list[str]]]:
    """Feed *html* through TableHTMLParser; return (header_row, body_rows)."""
    table = TableHTMLParser()
    table.feed(html)
    return table.header_row, table.rows
def detect_header_row(rows: list[list[str]]) -> tuple[int, list[str], bool]:
    """
    Detect which row is the header based on content patterns.

    Returns (header_row_index, header_row, is_at_end).  is_at_end means
    the header sits on the last row or in the lower half, i.e. the table
    is reversed.  Returns (-1, [], False) when no row matches at least two
    known column keywords.
    """
    # Flatten every known column pattern into one keyword set.
    keywords = {p.lower() for patterns in COLUMN_MAPPINGS.values() for p in patterns}

    best_idx, best_row, best_hits = -1, [], 0
    for idx, row in enumerate(rows):
        # Ignore rows that are entirely blank.
        if not any(cell.strip() for cell in row):
            continue
        haystack = " ".join(cell.lower() for cell in row)
        hits = sum(kw in haystack for kw in keywords)
        if hits > best_hits:
            best_idx, best_row, best_hits = idx, row, hits

    if best_hits < 2:
        return -1, [], False
    at_end = best_idx == len(rows) - 1 or best_idx > len(rows) // 2
    return best_idx, best_row, at_end
def extract_line_items(html: str) -> list[LineItem]:
    """Extract line items from an HTML table.

    Locates the header row (from <thead>, by keyword detection, or by
    falling back to the first non-empty row), maps its columns to
    LineItem fields, and builds one LineItem per data row.  Rows lacking
    both a description and an amount are dropped as noise.
    """
    header, rows = parse_table_html(html)
    if not header:
        # No <thead>: try to detect a header row from the cell contents.
        header_idx, detected_header, is_at_end = detect_header_row(rows)
        if header_idx >= 0:
            header = detected_header
            if is_at_end:
                # Header sits at the bottom -- the data rows precede it.
                # (Removed a dead `is_reversed` local that was assigned
                # here but never read.)
                rows = rows[:header_idx]
            else:
                rows = rows[header_idx + 1:]  # data rows start after header
        elif rows:
            # Fall back to the first non-empty row.
            for i, row in enumerate(rows):
                if any(cell.strip() for cell in row):
                    header = row
                    rows = rows[i + 1:]
                    break
    column_map = map_columns(header)
    items = []
    for row_idx, row in enumerate(rows):
        item_data = {
            'row_index': row_idx,
            'article_number': None,
            'description': None,
            'quantity': None,
            'unit': None,
            'unit_price': None,
            'amount': None,
            'vat_rate': None,
        }
        for col_idx, cell in enumerate(row):
            if col_idx in column_map:
                field = column_map[col_idx]
                item_data[field] = cell if cell else None
        # Only add if we have at least description or amount
        if item_data['description'] or item_data['amount']:
            items.append(LineItem(**item_data))
    return items
def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
    """Render the first page of *pdf_path* to PNG bytes at the given DPI."""
    doc = fitz.open(pdf_path)
    try:
        page = doc[0]
        # 72 is PDF's native resolution; scale the transform to target DPI.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)
        return pix.tobytes("png")
    finally:
        # Always close the document, even if rendering raises
        # (the original leaked the handle on error).
        doc.close()
def test_line_items_extraction(pdf_path: str) -> dict:
    """Test line items extraction on a PDF.

    Renders page 1, runs PP-StructureV3 table detection, then parses each
    detected table's HTML, printing extensive debug output along the way.
    Returns {"pdf": path, "tables": per-table metadata, "line_items": all
    extracted LineItem objects}.
    """
    print(f"\n{'='*70}")
    print(f"Testing Line Items Extraction: {Path(pdf_path).name}")
    print(f"{'='*70}")
    # Render PDF to image
    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)
    # Save temp image (PP-StructureV3 predicts from a file path)
    temp_img_path = "/tmp/test_invoice.png"
    with open(temp_img_path, "wb") as f:
        f.write(img_bytes)
    # Initialize PP-StructureV3
    print("Initializing PP-StructureV3...")
    pipeline = PPStructureV3(
        device="gpu:0",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
    )
    # Run detection
    print("Running table detection...")
    results = pipeline.predict(temp_img_path)
    all_line_items = []
    table_details = []
    for result in results if results else []:
        # Results may be dict-like; guard with hasattr before .get().
        table_res_list = result.get("table_res_list") if hasattr(result, "get") else None
        if table_res_list:
            print(f"\nFound {len(table_res_list)} tables")
            for i, table_res in enumerate(table_res_list):
                html = table_res.get("pred_html", "")
                ocr_pred = table_res.get("table_ocr_pred", {})
                print(f"\n--- Table {i+1} ---")
                # Debug: show full HTML for first table
                if i == 0:
                    print(f" Full HTML:\n{html}")
                # Debug: inspect table_ocr_pred structure
                if isinstance(ocr_pred, dict):
                    print(f" table_ocr_pred keys: {list(ocr_pred.keys())}")
                    # Check if rec_texts exists (actual OCR text)
                    if "rec_texts" in ocr_pred:
                        texts = ocr_pred["rec_texts"]
                        print(f" OCR texts count: {len(texts)}")
                        print(f" Sample OCR texts: {texts[:5]}")
                elif isinstance(ocr_pred, list):
                    print(f" table_ocr_pred is list with {len(ocr_pred)} items")
                    if ocr_pred:
                        print(f" First item type: {type(ocr_pred[0])}")
                        print(f" First few items: {ocr_pred[:3]}")
                # Parse HTML
                header, rows = parse_table_html(html)
                print(f" HTML Header (from thead): {header}")
                print(f" HTML Rows: {len(rows)}")
                # Try to detect header if not in thead
                detected_header = None
                is_reversed = False
                if not header and rows:
                    header_idx, detected_header, is_at_end = detect_header_row(rows)
                    if header_idx >= 0:
                        is_reversed = is_at_end
                        print(f" Detected header at row {header_idx}: {detected_header}")
                        print(f" Table is {'REVERSED (header at bottom)' if is_reversed else 'normal'}")
                        header = detected_header
                if rows:
                    print(f" First row: {rows[0]}")
                    if len(rows) > 1:
                        print(f" Second row: {rows[1]}")
                # Check if this looks like a line items table
                column_map = map_columns(header) if header else {}
                print(f" Column mapping: {column_map}")
                is_line_items_table = (
                    'description' in column_map.values() or
                    'amount' in column_map.values() or
                    'article_number' in column_map.values()
                )
                if is_line_items_table:
                    print(f" >>> This appears to be a LINE ITEMS table!")
                    items = extract_line_items(html)
                    print(f" Extracted {len(items)} line items:")
                    for item in items:
                        print(f" - {item.description}: {item.quantity} x {item.unit_price} = {item.amount}")
                    all_line_items.extend(items)
                else:
                    print(f" >>> This is NOT a line items table (summary/payment)")
                table_details.append({
                    "index": i,
                    "header": header,
                    "row_count": len(rows),
                    "is_line_items": is_line_items_table,
                    "column_map": column_map,
                })
    print(f"\n{'='*70}")
    print(f"EXTRACTION SUMMARY")
    print(f"{'='*70}")
    print(f"Total tables: {len(table_details)}")
    print(f"Line items tables: {sum(1 for t in table_details if t['is_line_items'])}")
    print(f"Total line items: {len(all_line_items)}")
    return {
        "pdf": pdf_path,
        "tables": table_details,
        "line_items": all_line_items,
    }
def main():
    """CLI entry point: run the extraction POC on --pdf or a default file."""
    import argparse

    parser = argparse.ArgumentParser(description="Test line items extraction")
    parser.add_argument("--pdf", type=str, help="Path to PDF file")
    args = parser.parse_args()

    if not args.pdf:
        # No argument: fall back to the bundled sample invoice.
        default_pdf = project_root / "exampl" / "Faktura54011.pdf"
        if default_pdf.exists():
            test_line_items_extraction(str(default_pdf))
        else:
            print(f"Default PDF not found: {default_pdf}")
            print("Usage: python ppstructure_line_items_poc.py --pdf <path>")
        return

    pdf_path = Path(args.pdf)
    if not pdf_path.exists():
        # Retry relative to the project root before giving up.
        pdf_path = project_root / args.pdf
    if not pdf_path.exists():
        print(f"PDF not found: {args.pdf}")
        return
    test_line_items_extraction(str(pdf_path))


if __name__ == "__main__":
    main()

View File

@@ -1,154 +0,0 @@
#!/usr/bin/env python3
"""
PP-StructureV3 POC Script
Tests table detection on real Swedish invoices using PP-StructureV3.
Run with invoice-sm120 conda environment.
"""
import sys
from pathlib import Path

# Add project root to path so the backend package resolves when this
# script is run directly from the scripts/ directory.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / "packages" / "backend"))

from paddleocr import PPStructureV3
import fitz  # PyMuPDF
def render_pdf_to_image(pdf_path: str, dpi: int = 200) -> bytes:
    """Render the first page of *pdf_path* to PNG bytes at the given DPI."""
    doc = fitz.open(pdf_path)
    try:
        page = doc[0]
        # 72 is PDF's native resolution; scale the transform to target DPI.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)
        return pix.tobytes("png")
    finally:
        # Always close the document, even if rendering raises
        # (the original leaked the handle on error).
        doc.close()
def test_table_detection(pdf_path: str) -> dict:
    """Test PP-StructureV3 table detection on a PDF.

    Renders page 1, runs the pipeline, prints debug info for every
    detected table and layout element, and returns
    {"pdf": path, "tables": table summaries, "elements": layout labels}.
    """
    print(f"\n{'='*60}")
    print(f"Testing: {Path(pdf_path).name}")
    print(f"{'='*60}")
    # Render PDF to image
    print("Rendering PDF to image...")
    img_bytes = render_pdf_to_image(pdf_path)
    # Save temp image (the pipeline predicts from a file path)
    temp_img_path = "/tmp/test_invoice.png"
    with open(temp_img_path, "wb") as f:
        f.write(img_bytes)
    print(f"Saved temp image: {temp_img_path}")
    # Initialize PP-StructureV3
    print("Initializing PP-StructureV3...")
    pipeline = PPStructureV3(
        device="gpu:0",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
    )
    # Run detection
    print("Running table detection...")
    results = pipeline.predict(temp_img_path)
    # Parse results - PaddleX 3.x returns dict-like LayoutParsingResultV2
    tables_found = []
    all_elements = []
    for result in results if results else []:
        # Get table results from the new API
        table_res_list = result.get("table_res_list") if hasattr(result, "get") else None
        if table_res_list:
            print(f" Found {len(table_res_list)} tables in table_res_list")
            for i, table_res in enumerate(table_res_list):
                # Debug: show all keys in table_res
                if isinstance(table_res, dict):
                    print(f" Table {i+1} keys: {list(table_res.keys())}")
                else:
                    print(f" Table {i+1} attrs: {[a for a in dir(table_res) if not a.startswith('_')]}")
                # Extract table info - use correct key names from PaddleX 3.x
                cell_boxes = table_res.get("cell_box_list", [])
                html = table_res.get("pred_html", "")  # HTML is in pred_html
                ocr_text = table_res.get("table_ocr_pred", [])
                region_id = table_res.get("table_region_id", -1)
                bbox = []  # bbox is stored elsewhere in parsing_res_list
                print(f" Table {i+1}:")
                print(f" - Cells: {len(cell_boxes) if cell_boxes is not None else 0}")
                print(f" - Region ID: {region_id}")
                print(f" - HTML length: {len(html) if html else 0}")
                print(f" - OCR texts: {len(ocr_text) if ocr_text else 0}")
                if html:
                    print(f" - HTML preview: {html[:300]}...")
                if ocr_text and len(ocr_text) > 0:
                    print(f" - First few OCR texts: {ocr_text[:3]}")
                tables_found.append({
                    "index": i,
                    "cell_count": len(cell_boxes) if cell_boxes is not None else 0,
                    "region_id": region_id,
                    "html": html[:1000] if html else "",  # truncate for the summary
                    "ocr_count": len(ocr_text) if ocr_text else 0,
                })
        # Get parsing results for all layout elements
        parsing_res_list = result.get("parsing_res_list") if hasattr(result, "get") else None
        if parsing_res_list:
            print(f"\n Layout elements from parsing_res_list:")
            for elem in parsing_res_list[:10]:  # Show first 10
                # Elements may be dicts or objects; handle both shapes.
                label = elem.get("label", "unknown") if isinstance(elem, dict) else getattr(elem, "label", "unknown")
                bbox = elem.get("bbox", []) if isinstance(elem, dict) else getattr(elem, "bbox", [])
                print(f" - {label}: {bbox}")
                all_elements.append({"label": label, "bbox": bbox})
    print(f"\nSummary:")
    print(f" Tables detected: {len(tables_found)}")
    print(f" Layout elements: {len(all_elements)}")
    return {"pdf": pdf_path, "tables": tables_found, "elements": all_elements}
def main():
    """Run table detection over the first few PDFs in admin_uploads."""
    pdf_dir = Path("/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/data/admin_uploads")
    pdf_files = list(pdf_dir.glob("*.pdf"))[:5]  # Test first 5
    if not pdf_files:
        print("No PDF files found in admin_uploads directory")
        return
    print(f"Found {len(pdf_files)} PDF files")

    all_results = [test_table_detection(str(pdf_file)) for pdf_file in pdf_files]

    # Summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    total_tables = sum(len(r["tables"]) for r in all_results)
    print(f"Total PDFs tested: {len(all_results)}")
    print(f"Total tables detected: {total_tables}")
    for r in all_results:
        pdf_name = Path(r["pdf"]).name
        table_count = len(r["tables"])
        print(f" {pdf_name}: {table_count} tables")
        for t in r["tables"]:
            print(f" - Table {t['index']+1}: {t['cell_count']} cells")


if __name__ == "__main__":
    main()

54
scripts/render_pdfs_v3.py Normal file
View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Render selected PDFs from v3 batch for visual comparison."""
import os
import fitz  # PyMuPDF

# Source directory of the batch PDFs and destination for the PNG renders.
PDF_DIR = "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document/Billo.Platform.Document.AdminAPI/downloads/to_check"
OUTPUT_DIR = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/scripts/pdf_renders_v3"

# Select 10 PDFs covering different scenarios:
# (each entry is annotated with why it was hand-picked from the v3 results)
SELECTED = [
    # Potentially wrong Amount (81648164.00 - too high?)
    "b84c7d70-821d-4a1a-9be7-d7bb2392bd91.pdf",
    # Only 2 fields extracted
    "072571e2-da5f-4268-b1a8-f0e5a85a3ec4.pdf",
    # InvoiceNumber=5085 (suspiciously short, same as BG prefix?)
    "6a83ba35-afdf-4c13-ade1-25513e213637.pdf",
    # InvoiceNumber=450 (very short, might be wrong)
    "8551b540-d93d-459d-b7eb-e9ee086f9f16.pdf",
    # InvoiceNumber=134 (very short, same as BG prefix)
    "cb1bd3b1-63d0-4140-930f-e4a7ae2b6cd5.pdf",
    # Large Amount=172904.52, InvoiceNumber=89902
    "d121a5ee-7382-41d8-8010-63880def1f96.pdf",
    # Good 9-field PDF for positive check
    "6cb90895-e52b-4831-b57b-7cb968bcdd54.pdf",
    # Amount=2026.00 (same as year - could be confused?)
    "d376c5b5-0dc5-4ccf-b787-0d481eef8577.pdf",
    # 8 fields, good coverage
    "f3f5da6f-7552-4ec6-8625-3629042fbfd0.pdf",
    # Low confidence Amount=596.49
    "5783e4af-eef3-411c-84b1-3a8f4694fed8.pdf",
]
# Render page 1 of each selected PDF to a PNG for visual inspection.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for pdf_name in SELECTED:
    pdf_path = os.path.join(PDF_DIR, pdf_name)
    if not os.path.exists(pdf_path):
        print(f"SKIP {pdf_name} - not found")
        continue
    doc = fitz.open(pdf_path)
    try:
        page = doc[0]
        # Render at 150 DPI (PDF native resolution is 72 DPI).
        mat = fitz.Matrix(150 / 72, 150 / 72)
        pix = page.get_pixmap(matrix=mat)
        out_name = pdf_name.replace(".pdf", ".png")
        out_path = os.path.join(OUTPUT_DIR, out_name)
        pix.save(out_path)
        print(f"Rendered {pdf_name} -> {out_name} ({pix.width}x{pix.height})")
    finally:
        # Close even when rendering/saving raises -- the original only
        # closed on the success path and leaked the handle on error.
        doc.close()
print(f"\nAll renders saved to {OUTPUT_DIR}")