This commit is contained in:
Yaojia Wang
2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions

435
src/cli/analyze_report.py Normal file
View File

@@ -0,0 +1,435 @@
#!/usr/bin/env python3
"""
Analyze Auto-Label Report
Generates statistics and insights from database or autolabel_report.jsonl
"""
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from config import get_db_connection_string
def load_reports_from_db() -> dict:
    """Load statistics directly from database using SQL aggregation.

    Returns:
        dict with overall document counts, per-PDF-type counts, per-field
        match statistics, and (when timing data exists) processing-time
        aggregates under 'processing_time_stats'.
    """
    from ..data.db import DocumentDB
    db = DocumentDB()
    # Connect exactly once and reuse the connection (the original called
    # connect() twice); close it even if one of the queries raises.
    conn = db.connect()
    stats = {
        'total': 0,
        'successful': 0,
        'failed': 0,
        'by_pdf_type': defaultdict(lambda: {'total': 0, 'successful': 0}),
        'by_field': defaultdict(lambda: {
            'total': 0,
            'matched': 0,
            'exact_match': 0,
            'flexible_match': 0,
            'scores': [],
            'by_pdf_type': defaultdict(lambda: {'total': 0, 'matched': 0})
        }),
        'errors': defaultdict(int),
        'processing_times': [],
    }
    try:
        with conn.cursor() as cursor:
            # Overall stats
            cursor.execute("""
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN success THEN 1 ELSE 0 END) as successful,
                    SUM(CASE WHEN NOT success THEN 1 ELSE 0 END) as failed
                FROM documents
            """)
            row = cursor.fetchone()
            # SUM over zero rows yields NULL -> coalesce to 0.
            stats['total'] = row[0] or 0
            stats['successful'] = row[1] or 0
            stats['failed'] = row[2] or 0
            # By PDF type
            cursor.execute("""
                SELECT
                    pdf_type,
                    COUNT(*) as total,
                    SUM(CASE WHEN success THEN 1 ELSE 0 END) as successful
                FROM documents
                GROUP BY pdf_type
            """)
            for row in cursor.fetchall():
                pdf_type = row[0] or 'unknown'
                stats['by_pdf_type'][pdf_type] = {
                    'total': row[1] or 0,
                    'successful': row[2] or 0
                }
            # Processing times (aggregated in SQL; individual values not loaded)
            cursor.execute("""
                SELECT AVG(processing_time_ms), MIN(processing_time_ms), MAX(processing_time_ms)
                FROM documents
                WHERE processing_time_ms > 0
            """)
            row = cursor.fetchone()
            if row[0]:
                stats['processing_time_stats'] = {
                    'avg_ms': float(row[0]),
                    'min_ms': float(row[1]),
                    'max_ms': float(row[2])
                }
            # Field stats; score >= 0.99 counts as an exact match, below as flexible
            cursor.execute("""
                SELECT
                    field_name,
                    COUNT(*) as total,
                    SUM(CASE WHEN matched THEN 1 ELSE 0 END) as matched,
                    SUM(CASE WHEN matched AND score >= 0.99 THEN 1 ELSE 0 END) as exact_match,
                    SUM(CASE WHEN matched AND score < 0.99 THEN 1 ELSE 0 END) as flexible_match,
                    AVG(CASE WHEN matched THEN score END) as avg_score
                FROM field_results
                GROUP BY field_name
                ORDER BY field_name
            """)
            for row in cursor.fetchall():
                field_name = row[0]
                stats['by_field'][field_name] = {
                    'total': row[1] or 0,
                    'matched': row[2] or 0,
                    'exact_match': row[3] or 0,
                    'flexible_match': row[4] or 0,
                    'avg_score': float(row[5]) if row[5] else 0,
                    'scores': [],  # Not loading individual scores for efficiency
                    'by_pdf_type': defaultdict(lambda: {'total': 0, 'matched': 0})
                }
            # Field stats by PDF type
            cursor.execute("""
                SELECT
                    fr.field_name,
                    d.pdf_type,
                    COUNT(*) as total,
                    SUM(CASE WHEN fr.matched THEN 1 ELSE 0 END) as matched
                FROM field_results fr
                JOIN documents d ON fr.document_id = d.document_id
                GROUP BY fr.field_name, d.pdf_type
            """)
            for row in cursor.fetchall():
                field_name = row[0]
                pdf_type = row[1] or 'unknown'
                # Only attach per-type data to fields seen in the previous query.
                if field_name in stats['by_field']:
                    stats['by_field'][field_name]['by_pdf_type'][pdf_type] = {
                        'total': row[2] or 0,
                        'matched': row[3] or 0
                    }
    finally:
        db.close()
    return stats
def load_reports_from_file(report_path: str) -> list[dict]:
    """Load all reports from JSONL file(s). Supports glob patterns."""
    path = Path(report_path)
    # A wildcard anywhere in the path triggers glob expansion of the
    # final component within its parent directory.
    has_wildcard = '*' in str(path) or '?' in str(path)
    if has_wildcard:
        report_files = sorted(path.parent.glob(path.name))
    else:
        report_files = [path]
    if not report_files:
        return []
    print(f"Reading {len(report_files)} report file(s):")
    for report_file in report_files:
        print(f" - {report_file.name}")
    reports: list[dict] = []
    for report_file in report_files:
        # Silently skip paths that do not exist (e.g. a literal path
        # passed without a wildcard).
        if not report_file.exists():
            continue
        with open(report_file, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                record = raw_line.strip()
                if record:
                    reports.append(json.loads(record))
    return reports
def analyze_reports(reports: list[dict]) -> dict:
    """Aggregate per-document auto-label reports into summary statistics."""
    stats = {
        'total': len(reports),
        'successful': 0,
        'failed': 0,
        'by_pdf_type': defaultdict(lambda: {'total': 0, 'successful': 0}),
        'by_field': defaultdict(lambda: {
            'total': 0,
            'matched': 0,
            'exact_match': 0,     # score >= 0.99
            'flexible_match': 0,  # matched but score < 0.99
            'scores': [],
            'by_pdf_type': defaultdict(lambda: {'total': 0, 'matched': 0})
        }),
        'errors': defaultdict(int),
        'processing_times': [],
    }
    for record in reports:
        pdf_kind = record.get('pdf_type') or 'unknown'
        succeeded = record.get('success', False)
        # Overall success/failure tally
        stats['successful' if succeeded else 'failed'] += 1
        # Per-PDF-type tally
        type_bucket = stats['by_pdf_type'][pdf_kind]
        type_bucket['total'] += 1
        if succeeded:
            type_bucket['successful'] += 1
        # Only positive processing times are recorded
        elapsed = record.get('processing_time_ms', 0)
        if elapsed > 0:
            stats['processing_times'].append(elapsed)
        # Error message frequency
        for message in record.get('errors', []):
            stats['errors'][message] += 1
        # Per-field match accounting
        for result in record.get('field_results', []):
            field_bucket = stats['by_field'][result['field_name']]
            per_type = field_bucket['by_pdf_type'][pdf_kind]
            field_bucket['total'] += 1
            per_type['total'] += 1
            if result.get('matched', False):
                match_score = result.get('score', 0.0)
                field_bucket['matched'] += 1
                field_bucket['scores'].append(match_score)
                per_type['matched'] += 1
                if match_score >= 0.99:
                    field_bucket['exact_match'] += 1
                else:
                    field_bucket['flexible_match'] += 1
    return stats
def print_report(stats: dict, verbose: bool = False):
    """Print a human-readable analysis report to stdout.

    Args:
        stats: Statistics dict produced by either analyze_reports() or
            load_reports_from_db(); both shapes are handled below.
        verbose: When True, also print the top error messages.
    """
    print("\n" + "=" * 60)
    print("AUTO-LABEL REPORT ANALYSIS")
    print("=" * 60)
    # Overall stats
    print(f"\n{'OVERALL STATISTICS':^60}")
    print("-" * 60)
    total = stats['total']
    successful = stats['successful']
    failed = stats['failed']
    # Guard against division by zero for an empty run.
    success_rate = successful / total * 100 if total > 0 else 0
    print(f"Total documents: {total:>8}")
    print(f"Successful: {successful:>8} ({success_rate:.1f}%)")
    print(f"Failed: {failed:>8} ({100-success_rate:.1f}%)")
    # Processing time: DB path provides pre-aggregated 'processing_time_stats';
    # the file path provides raw 'processing_times' to aggregate here.
    if 'processing_time_stats' in stats:
        pts = stats['processing_time_stats']
        print(f"\nProcessing time (ms):")
        print(f" Average: {pts['avg_ms']:>8.1f}")
        print(f" Min: {pts['min_ms']:>8.1f}")
        print(f" Max: {pts['max_ms']:>8.1f}")
    elif stats.get('processing_times'):
        times = stats['processing_times']
        avg_time = sum(times) / len(times)
        min_time = min(times)
        max_time = max(times)
        print(f"\nProcessing time (ms):")
        print(f" Average: {avg_time:>8.1f}")
        print(f" Min: {min_time:>8.1f}")
        print(f" Max: {max_time:>8.1f}")
    # By PDF type
    print(f"\n{'BY PDF TYPE':^60}")
    print("-" * 60)
    print(f"{'Type':<15} {'Total':>10} {'Success':>10} {'Rate':>10}")
    print("-" * 60)
    for pdf_type, type_stats in sorted(stats['by_pdf_type'].items()):
        type_total = type_stats['total']
        type_success = type_stats['successful']
        type_rate = type_success / type_total * 100 if type_total > 0 else 0
        print(f"{pdf_type:<15} {type_total:>10} {type_success:>10} {type_rate:>9.1f}%")
    # By field, in a fixed display order (fields absent from stats are skipped)
    print(f"\n{'FIELD MATCH STATISTICS':^60}")
    print("-" * 60)
    print(f"{'Field':<18} {'Total':>7} {'Match':>7} {'Rate':>7} {'Exact':>7} {'Flex':>7} {'AvgScore':>8}")
    print("-" * 60)
    for field_name in ['InvoiceNumber', 'InvoiceDate', 'InvoiceDueDate', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount']:
        if field_name not in stats['by_field']:
            continue
        field_stats = stats['by_field'][field_name]
        total = field_stats['total']
        matched = field_stats['matched']
        exact = field_stats['exact_match']
        flex = field_stats['flexible_match']
        rate = matched / total * 100 if total > 0 else 0
        # Handle avg_score from either DB (precomputed) or file analysis (raw scores)
        if 'avg_score' in field_stats:
            avg_score = field_stats['avg_score']
        elif field_stats['scores']:
            avg_score = sum(field_stats['scores']) / len(field_stats['scores'])
        else:
            avg_score = 0
        print(f"{field_name:<18} {total:>7} {matched:>7} {rate:>6.1f}% {exact:>7} {flex:>7} {avg_score:>8.3f}")
    # Field match broken down by PDF type
    print(f"\n{'FIELD MATCH BY PDF TYPE':^60}")
    print("-" * 60)
    for pdf_type in sorted(stats['by_pdf_type'].keys()):
        print(f"\n[{pdf_type.upper()}]")
        print(f"{'Field':<18} {'Total':>10} {'Matched':>10} {'Rate':>10}")
        print("-" * 50)
        for field_name in ['InvoiceNumber', 'InvoiceDate', 'InvoiceDueDate', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount']:
            if field_name not in stats['by_field']:
                continue
            # Fall back to zeros when this field has no data for this PDF type.
            type_stats = stats['by_field'][field_name]['by_pdf_type'].get(pdf_type, {'total': 0, 'matched': 0})
            total = type_stats['total']
            matched = type_stats['matched']
            rate = matched / total * 100 if total > 0 else 0
            print(f"{field_name:<18} {total:>10} {matched:>10} {rate:>9.1f}%")
    # Errors: only shown in verbose mode; top 20 by frequency, truncated to 50 chars
    if stats.get('errors') and verbose:
        print(f"\n{'ERRORS':^60}")
        print("-" * 60)
        for error, count in sorted(stats['errors'].items(), key=lambda x: -x[1])[:20]:
            print(f"{count:>5}x {error[:50]}")
    print("\n" + "=" * 60)
def export_json(stats: dict, output_path: str):
    """Write the statistics dict to *output_path* as pretty-printed JSON.

    Converts the internal defaultdicts to plain dicts so the standard
    json encoder can serialize them.
    """
    payload = {
        'total': stats['total'],
        'successful': stats['successful'],
        'failed': stats['failed'],
        'by_pdf_type': dict(stats['by_pdf_type']),
        'by_field': {},
        'errors': dict(stats.get('errors', {})),
    }
    # Processing-time aggregates: pre-computed by the DB loader, or derived
    # here from the raw sample list produced by the file loader.
    if 'processing_time_stats' in stats:
        payload['processing_time_stats'] = stats['processing_time_stats']
    elif stats.get('processing_times'):
        samples = stats['processing_times']
        payload['processing_time_stats'] = {
            'avg_ms': sum(samples) / len(samples),
            'min_ms': min(samples),
            'max_ms': max(samples),
            'count': len(samples)
        }
    # Per-field summary with a derived match rate.
    for name, field_stats in stats['by_field'].items():
        mean_score = field_stats.get('avg_score', 0)
        if not mean_score and field_stats.get('scores'):
            mean_score = sum(field_stats['scores']) / len(field_stats['scores'])
        payload['by_field'][name] = {
            'total': field_stats['total'],
            'matched': field_stats['matched'],
            'exact_match': field_stats['exact_match'],
            'flexible_match': field_stats['flexible_match'],
            'match_rate': field_stats['matched'] / field_stats['total'] if field_stats['total'] > 0 else 0,
            'avg_score': mean_score,
            'by_pdf_type': dict(field_stats['by_pdf_type'])
        }
    with open(output_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, indent=2, ensure_ascii=False)
    print(f"\nStatistics exported to: {output_path}")
def main():
    """CLI entry point: load stats from DB or JSONL, print, optionally export.

    Returns:
        int process exit code: 0 on success, 1 if the report file is missing.
    """
    parser = argparse.ArgumentParser(
        description='Analyze auto-label report'
    )
    parser.add_argument(
        '--report', '-r',
        default=None,
        help='Path to autolabel report JSONL file (uses database if not specified)'
    )
    parser.add_argument(
        '--output', '-o',
        help='Export statistics to JSON file'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed error messages'
    )
    parser.add_argument(
        '--from-file',
        action='store_true',
        help='Force reading from JSONL file instead of database'
    )
    args = parser.parse_args()
    # Decide source: DB is the default unless a file was requested explicitly.
    use_db = not args.from_file and args.report is None
    if use_db:
        print("Loading statistics from database...")
        stats = load_reports_from_db()
        print(f"Loaded stats for {stats['total']} documents")
    else:
        report_path = args.report or 'reports/autolabel_report.jsonl'
        path = Path(report_path)
        # Check existence only for literal paths; glob patterns are resolved
        # later by load_reports_from_file().
        if '*' not in str(path) and '?' not in str(path) and not path.exists():
            print(f"Error: Report file not found: {path}")
            return 1
        print(f"Loading reports from: {report_path}")
        reports = load_reports_from_file(report_path)
        print(f"Loaded {len(reports)} reports")
        stats = analyze_reports(reports)
    print_report(stats, verbose=args.verbose)
    if args.output:
        export_json(stats, args.output)
    return 0


if __name__ == '__main__':
    # sys.exit instead of the site-injected exit(): works under `python -S`
    # and is the conventional way to propagate the exit code.
    sys.exit(main())