Files
invoice-master-poc-v2/scripts/batch_inference_v3.py
Yaojia Wang 58d36c8927 WIP
2026-02-12 23:06:00 +01:00

93 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""Batch inference v3 - 30 random PDFs for Round 2 validation."""
import json
import os
import random
import time
import requests
# Directory whose *.pdf files form the sampling population.
PDF_DIR = (
    "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document"
    "/Billo.Platform.Document.AdminAPI/downloads/to_check"
)
# Endpoint each selected PDF is POSTed to.
API_URL = "http://localhost:8000/api/v1/infer"
# JSON file that receives the list of per-file result records.
OUTPUT_FILE = (
    "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2"
    "/scripts/inference_results_v3.json"
)
# Maximum number of PDFs drawn per run (fewer if the directory holds fewer).
SAMPLE_SIZE = 30
# Response keys that are NOT counted as "fields" in the per-file progress line.
# Built once instead of re-creating the collection for every response.
_NON_FIELD_KEYS = frozenset({
    "DocumentId", "confidence", "success", "fallback_used",
    "bboxes", "cross_validation", "processing_time_ms",
    "line_items", "vat_summary", "vat_validation",
    "raw_detections", "detection_classes", "detection_count",
})

# Seconds to wait for the inference endpoint before giving up on a file.
_REQUEST_TIMEOUT_SECONDS = 120


def _count_fields(data):
    """Return the number of non-None entries in *data* that count as fields.

    Keys listed in ``_NON_FIELD_KEYS`` are ignored.
    """
    return sum(
        1
        for key, value in data.items()
        if key not in _NON_FIELD_KEYS and value is not None
    )


def _process_pdf(filename):
    """POST one PDF from PDF_DIR to API_URL and return its result record.

    Prints the outcome (OK / HTTP <code> / FAILED) to finish the progress line
    started by the caller.

    Returns:
        A dict with ``filename``, ``status``, ``time_seconds`` and ``filesize``,
        plus ``data`` (parsed JSON body) on HTTP 200 or ``error`` otherwise.
        ``status`` is the HTTP status code, or -1 when the request or response
        handling raised. ``filesize`` is None if the size could not be read.
    """
    filepath = os.path.join(PDF_DIR, filename)

    # Reading the size must not abort the whole batch: a file that vanished or
    # is unreadable is reported as a normal per-file failure by the open() below.
    try:
        filesize = os.path.getsize(filepath)
    except OSError:
        filesize = None

    start = time.time()
    try:
        with open(filepath, "rb") as f:
            resp = requests.post(
                API_URL,
                files={"file": (filename, f, "application/pdf")},
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
        elapsed = round(time.time() - start, 2)

        if resp.status_code != 200:
            print(f"HTTP {resp.status_code} ({elapsed}s)")
            return {
                "filename": filename,
                "status": resp.status_code,
                "time_seconds": elapsed,
                "filesize": filesize,
                # Keep the results file small: only the start of the body.
                "error": resp.text[:200],
            }

        data = resp.json()
        field_count = _count_fields(data)
        det_count = data.get("detection_count", "?")
        print(f"OK ({elapsed}s) - {field_count} fields, {det_count} detections")
        return {
            "filename": filename,
            "status": resp.status_code,
            "time_seconds": elapsed,
            "filesize": filesize,
            "data": data,
        }
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch, so any failure (I/O, network, bad JSON, unexpected payload
        # shape) is recorded and the run continues with the next file.
        elapsed = round(time.time() - start, 2)
        print(f"FAILED ({elapsed}s) - {e}")
        return {
            "filename": filename,
            "status": -1,
            "time_seconds": elapsed,
            "filesize": filesize,
            "error": str(e),
        }


def main():
    """Run inference on a seeded random sample of PDFs and save the results.

    Picks up to SAMPLE_SIZE ``.pdf`` files from PDF_DIR, posts each one to
    API_URL, writes all per-file records to OUTPUT_FILE as JSON, and prints a
    success/failure summary.
    """
    random.seed(99_2026)  # New seed for Round 3

    # os.listdir() returns entries in arbitrary order, and a seeded sample only
    # repeats if the population order repeats. Sorting makes the seed
    # meaningful. NOTE: this changes which files the seed selects compared with
    # runs made before the listing was sorted.
    all_pdfs = sorted(f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf"))
    selected = random.sample(all_pdfs, min(SAMPLE_SIZE, len(all_pdfs)))
    print(f"Selected {len(selected)} random PDFs for inference")

    results = []
    for i, filename in enumerate(selected, 1):
        # No newline: _process_pdf() prints the outcome on the same line.
        print(f"[{i}/{len(selected)}] Processing {filename}...", end=" ", flush=True)
        results.append(_process_pdf(filename))

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to {OUTPUT_FILE}")

    success = sum(1 for r in results if r["status"] == 200)
    failed = len(results) - success
    print(f"Total: {len(results)}, Success: {success}, Failed: {failed}")


if __name__ == "__main__":
    main()