WIP
This commit is contained in:
92
scripts/batch_inference_v3.py
Normal file
92
scripts/batch_inference_v3.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Batch inference v3 - 30 random PDFs for Round 2 validation."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
# Directory whose *.pdf files form the sampling pool (listed with os.listdir
# in main()). Absolute, machine-specific path.
# NOTE(review): looks like a WSL mount of a Windows drive - confirm before
# running on another machine.
PDF_DIR = "/mnt/c/Users/yaoji/git/Billo/Billo.Platform.Document/Billo.Platform.Document.AdminAPI/downloads/to_check"
|
||||
# Endpoint each PDF is POSTed to as a multipart "file" upload.
API_URL = "http://localhost:8000/api/v1/infer"
|
||||
# JSON file that receives the list of per-PDF result records; it is
# overwritten on every run. Absolute, machine-specific path.
OUTPUT_FILE = "/mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/scripts/inference_results_v3.json"
|
||||
# Upper bound on how many PDFs are sampled; fewer are used when the
# directory holds fewer PDFs.
SAMPLE_SIZE = 30
|
||||
|
||||
|
||||
# Response keys that are excluded from the per-document extracted-field count.
_NON_FIELD_KEYS = frozenset({
    "DocumentId", "confidence", "success", "fallback_used",
    "bboxes", "cross_validation", "processing_time_ms",
    "line_items", "vat_summary", "vat_validation",
    "raw_detections", "detection_classes", "detection_count",
})


def _count_fields(data):
    """Return how many non-excluded keys of *data* hold a non-None value."""
    return sum(
        1 for key, value in data.items()
        if key not in _NON_FIELD_KEYS and value is not None
    )


def main():
    """Run inference on a random sample of PDFs and save the results as JSON.

    Picks up to SAMPLE_SIZE ``.pdf`` files (case-insensitive extension) from
    PDF_DIR using a fixed random seed, POSTs each one to API_URL as a
    multipart upload, prints a one-line progress report per file, and writes
    the list of per-file records to OUTPUT_FILE. A final summary line reports
    how many requests returned HTTP 200.

    Each record contains ``filename``, ``status`` (the HTTP status code, or
    -1 when the request or response handling raised), ``time_seconds``,
    ``filesize`` (``None`` if the size could not be read) and either ``data``
    (the decoded JSON body on HTTP 200) or ``error``.

    Raises:
        OSError: if PDF_DIR cannot be listed or OUTPUT_FILE cannot be
            written. Per-file failures are recorded, not raised.
    """
    random.seed(99_2026)  # New seed for Round 3

    # os.listdir() returns entries in arbitrary order, so the listing is
    # sorted first; otherwise the fixed seed would not select the same files
    # from one run (or machine) to the next.
    all_pdfs = sorted(
        f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf")
    )
    selected = random.sample(all_pdfs, min(SAMPLE_SIZE, len(all_pdfs)))

    print(f"Selected {len(selected)} random PDFs for inference")

    results = []
    for i, filename in enumerate(selected, 1):
        filepath = os.path.join(PDF_DIR, filename)
        # A file that vanished or is unreadable must not abort the batch;
        # the open() below will fail as well and be recorded as status -1.
        try:
            filesize = os.path.getsize(filepath)
        except OSError:
            filesize = None
        print(f"[{i}/{len(selected)}] Processing {filename}...", end=" ", flush=True)

        record = {"filename": filename}
        start = time.time()
        try:
            with open(filepath, "rb") as f:
                resp = requests.post(
                    API_URL,
                    files={"file": (filename, f, "application/pdf")},
                    timeout=120,
                )
            elapsed = round(time.time() - start, 2)
            record.update(
                status=resp.status_code,
                time_seconds=elapsed,
                filesize=filesize,
            )

            if resp.status_code == 200:
                data = resp.json()
                field_count = _count_fields(data)
                det_count = data.get("detection_count", "?")
                print(f"OK ({elapsed}s) - {field_count} fields, {det_count} detections")
                record["data"] = data
            else:
                print(f"HTTP {resp.status_code} ({elapsed}s)")
                # Keep only the head of the body so the report stays small.
                record["error"] = resp.text[:200]
        except Exception as e:
            # Deliberately broad: this is the per-file boundary of a
            # best-effort batch, so any failure (I/O, network, malformed
            # JSON) is recorded and the loop moves on to the next file.
            elapsed = round(time.time() - start, 2)
            print(f"FAILED ({elapsed}s) - {e}")
            record = {
                "filename": filename,
                "status": -1,
                "time_seconds": elapsed,
                "filesize": filesize,
                "error": str(e),
            }
        results.append(record)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to {OUTPUT_FILE}")

    success = sum(1 for r in results if r["status"] == 200)
    failed = len(results) - success
    print(f"Total: {len(results)}, Success: {success}, Failed: {failed}")
|
||||
|
||||
|
||||
# Script entry point: run the batch only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user