#!/usr/bin/env python3
"""Batch inference v3 - 30 random PDFs for Round 2 validation.

Picks a seeded random sample of PDFs from ``PDF_DIR``, uploads each one to
the inference endpoint at ``API_URL`` and stores one result record per file
in ``OUTPUT_FILE`` as JSON.
"""

import json
import os
import random
import time
from typing import Any, Dict, Iterable, List

PDF_DIR = "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document/Billo.Platform.Document.AdminAPI/downloads/to_check"
API_URL = "http://localhost:8000/api/v1/infer"
OUTPUT_FILE = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/scripts/inference_results_v3.json"
SAMPLE_SIZE = 30

# New seed for Round 3
# NOTE(review): the module docstring says "Round 2" while this comment says
# "Round 3" - confirm which round this script belongs to.
RANDOM_SEED = 99_2026

# Seconds to wait for the inference endpoint before giving up on one file.
REQUEST_TIMEOUT_SECONDS = 120

# Response keys that are not counted as extracted fields in the progress
# output. Kept as a frozenset so the membership test is O(1) and the
# collection is built once instead of once per response key.
NON_FIELD_KEYS = frozenset({
    "DocumentId",
    "confidence",
    "success",
    "fallback_used",
    "bboxes",
    "cross_validation",
    "processing_time_ms",
    "line_items",
    "vat_summary",
    "vat_validation",
    "raw_detections",
    "detection_classes",
    "detection_count",
})


def select_pdfs(
    filenames: Iterable[str],
    sample_size: int = SAMPLE_SIZE,
    seed: int = RANDOM_SEED,
) -> List[str]:
    """Return a reproducible random sample of the PDF names in *filenames*.

    Args:
        filenames: Candidate file names (any order, any extension).
        sample_size: Maximum number of names to return.
        seed: Seed for the private random number generator.

    Returns:
        Up to *sample_size* names ending in ``.pdf`` (case-insensitive).
    """
    # os.listdir() makes no ordering promise, so the population is sorted
    # first; otherwise the same seed could select different files from one
    # run or machine to the next.
    pdfs = sorted(name for name in filenames if name.lower().endswith(".pdf"))
    # A private generator keeps the selection independent of any other use
    # of the global ``random`` state.
    rng = random.Random(seed)
    return rng.sample(pdfs, min(sample_size, len(pdfs)))


def count_extracted_fields(data: Dict[str, Any]) -> int:
    """Count the keys of *data* that are not in ``NON_FIELD_KEYS`` and whose value is not ``None``."""
    return sum(
        1
        for key, value in data.items()
        if key not in NON_FIELD_KEYS and value is not None
    )


def describe_result(record: Dict[str, Any]) -> str:
    """Return the one-line console summary for a record built by ``infer_pdf``."""
    elapsed = record["time_seconds"]
    status = record["status"]
    if status == 200:
        data = record["data"]
        field_count = count_extracted_fields(data)
        det_count = data.get("detection_count", "?")
        return f"OK ({elapsed}s) - {field_count} fields, {det_count} detections"
    if status == -1:
        return f"FAILED ({elapsed}s) - {record['error']}"
    return f"HTTP {status} ({elapsed}s)"


def infer_pdf(session: Any, filename: str) -> Dict[str, Any]:
    """Upload one PDF from ``PDF_DIR`` to ``API_URL`` and return its result record.

    Args:
        session: Object with a ``requests``-compatible ``post`` method
            (``main`` passes a ``requests.Session``).
        filename: Name of the file inside ``PDF_DIR``.

    Returns:
        A dict with ``filename``, ``status``, ``time_seconds`` and
        ``filesize``, plus ``data`` (parsed JSON) when the endpoint answered
        200, or ``error`` otherwise. ``status`` is the HTTP status code, or
        ``-1`` when the file could not be read, the request failed, or the
        200 body was not a JSON object. ``filesize`` is ``None`` when the
        size could not be determined.
    """
    filepath = os.path.join(PDF_DIR, filename)
    filesize = None
    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by wall-clock adjustments during the request.
    start = time.perf_counter()
    try:
        # Inside the try block so that a file which vanished or is
        # unreadable is recorded as a failure instead of aborting the run.
        filesize = os.path.getsize(filepath)
        with open(filepath, "rb") as pdf:
            resp = session.post(
                API_URL,
                files={"file": (filename, pdf, "application/pdf")},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        elapsed = round(time.perf_counter() - start, 2)
        status = resp.status_code
        if status == 200:
            data = resp.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(data).__name__}"
                )
            outcome = {"data": data}
        else:
            outcome = {"error": resp.text[:200]}
    except Exception as exc:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch, so any failure is recorded and the remaining files still run.
        elapsed = round(time.perf_counter() - start, 2)
        status = -1
        outcome = {"error": str(exc)}

    record = {
        "filename": filename,
        "status": status,
        "time_seconds": elapsed,
        "filesize": filesize,
    }
    record.update(outcome)
    return record


def main() -> None:
    """Run inference on the sampled PDFs, save the records and print a summary."""
    # Imported here rather than at module level so the pure helpers above
    # stay importable without the third-party HTTP client installed.
    import requests

    selected = select_pdfs(os.listdir(PDF_DIR))
    total = len(selected)
    print(f"Selected {total} random PDFs for inference")

    results = []
    # One session for the whole batch lets the underlying connection to the
    # endpoint be reused across uploads.
    with requests.Session() as session:
        for i, filename in enumerate(selected, 1):
            print(f"[{i}/{total}] Processing {filename}...", end=" ", flush=True)
            record = infer_pdf(session, filename)
            print(describe_result(record))
            results.append(record)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nResults saved to {OUTPUT_FILE}")
    success = sum(1 for r in results if r["status"] == 200)
    failed = len(results) - success
    print(f"Total: {len(results)}, Success: {success}, Failed: {failed}")


if __name__ == "__main__":
    main()