93 lines
3.3 KiB
Python
93 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Batch inference v3 - 30 random PDFs for Round 2 validation."""
|
|
|
|
import json
|
|
import os
|
|
import random
|
|
import time
|
|
|
|
import requests
|
|
|
|
# Directory that main() lists for candidate ``.pdf`` files.
PDF_DIR = "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document/Billo.Platform.Document.AdminAPI/downloads/to_check"

# Endpoint each selected PDF is POSTed to as a multipart upload.
API_URL = "http://localhost:8000/api/v1/infer"

# JSON file that receives one result record per processed PDF.
OUTPUT_FILE = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/scripts/inference_results_v3.json"

# Upper bound on how many PDFs are sampled; fewer are used when the
# directory holds fewer matching files.
SAMPLE_SIZE = 30
|
|
|
|
|
|
# Response keys that are excluded when counting the fields of a successful
# response (the remaining keys with a non-None value are counted).
_NON_FIELD_KEYS = frozenset({
    "DocumentId", "confidence", "success", "fallback_used",
    "bboxes", "cross_validation", "processing_time_ms",
    "line_items", "vat_summary", "vat_validation",
    "raw_detections", "detection_classes", "detection_count",
})


def _count_fields(data):
    """Count entries of *data* whose key is not excluded and whose value is not None."""
    return sum(
        1 for key, value in data.items()
        if key not in _NON_FIELD_KEYS and value is not None
    )


def _process_pdf(filename):
    """POST one PDF from PDF_DIR to API_URL and return its result record.

    Prints the outcome for the file (the caller prints the progress prefix).

    Returns:
        A dict with ``filename``, ``status``, ``time_seconds`` and
        ``filesize``, plus ``data`` (parsed JSON) for HTTP 200 or ``error``
        otherwise. ``status`` is the HTTP status code, or ``-1`` when any
        exception was raised while handling the file. ``filesize`` is
        ``None`` if the file size could not be read.
    """
    filepath = os.path.join(PDF_DIR, filename)
    filesize = None  # stays None if the file cannot be stat'ed
    start = time.time()
    try:
        # Inside the try so that a vanished or unreadable file is recorded
        # as a failure instead of aborting the whole batch.
        filesize = os.path.getsize(filepath)
        with open(filepath, "rb") as f:
            resp = requests.post(
                API_URL,
                files={"file": (filename, f, "application/pdf")},
                timeout=120,
            )
        elapsed = round(time.time() - start, 2)

        if resp.status_code == 200:
            data = resp.json()
            field_count = _count_fields(data)
            det_count = data.get("detection_count", "?")
            print(f"OK ({elapsed}s) - {field_count} fields, {det_count} detections")
            return {
                "filename": filename,
                "status": resp.status_code,
                "time_seconds": elapsed,
                "filesize": filesize,
                "data": data,
            }

        print(f"HTTP {resp.status_code} ({elapsed}s)")
        return {
            "filename": filename,
            "status": resp.status_code,
            "time_seconds": elapsed,
            "filesize": filesize,
            # Only the first 200 characters of the body are kept.
            "error": resp.text[:200],
        }
    except Exception as e:
        # Deliberately broad: one bad file or request must not stop the
        # batch; the failure is recorded with status -1 instead.
        elapsed = round(time.time() - start, 2)
        print(f"FAILED ({elapsed}s) - {e}")
        return {
            "filename": filename,
            "status": -1,
            "time_seconds": elapsed,
            "filesize": filesize,
            "error": str(e),
        }


def main():
    """Run inference on a seeded random sample of PDFs and save the results.

    Lists the ``.pdf`` files in PDF_DIR, samples up to SAMPLE_SIZE of them,
    POSTs each one to API_URL, writes all result records to OUTPUT_FILE as
    JSON, and prints a success/failure summary.

    Raises:
        OSError: if PDF_DIR cannot be listed or OUTPUT_FILE cannot be written.
    """
    random.seed(99_2026)  # New seed for Round 3

    # os.listdir() returns entries in arbitrary order, so sort first;
    # otherwise the fixed seed would not select the same files on every run.
    all_pdfs = sorted(f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf"))
    selected = random.sample(all_pdfs, min(SAMPLE_SIZE, len(all_pdfs)))

    print(f"Selected {len(selected)} random PDFs for inference")

    results = []
    for i, filename in enumerate(selected, 1):
        print(f"[{i}/{len(selected)}] Processing {filename}...", end=" ", flush=True)
        results.append(_process_pdf(filename))

    # Create the output directory if needed so that a missing folder does
    # not discard the results after all requests have already been made.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to {OUTPUT_FILE}")

    success = sum(1 for r in results if r["status"] == 200)
    failed = len(results) - success
    print(f"Total: {len(results)}, Success: {success}, Failed: {failed}")
|
|
|
|
|
|
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|