This commit is contained in:
Yaojia Wang
2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions

435
src/cli/analyze_report.py Normal file
View File

@@ -0,0 +1,435 @@
#!/usr/bin/env python3
"""
Analyze Auto-Label Report
Generates statistics and insights from database or autolabel_report.jsonl
"""
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from config import get_db_connection_string
def load_reports_from_db() -> dict:
    """Load statistics directly from database using SQL aggregation.

    Returns:
        dict with overall document counts, per-PDF-type counts, per-field
        match statistics, and (when timing data exists) processing-time
        aggregates under 'processing_time_stats'.
    """
    from ..data.db import DocumentDB
    db = DocumentDB()
    # Connect exactly once and reuse the connection (the original called
    # connect() twice); close it even if one of the queries raises.
    conn = db.connect()
    stats = {
        'total': 0,
        'successful': 0,
        'failed': 0,
        'by_pdf_type': defaultdict(lambda: {'total': 0, 'successful': 0}),
        'by_field': defaultdict(lambda: {
            'total': 0,
            'matched': 0,
            'exact_match': 0,
            'flexible_match': 0,
            'scores': [],
            'by_pdf_type': defaultdict(lambda: {'total': 0, 'matched': 0})
        }),
        'errors': defaultdict(int),
        'processing_times': [],
    }
    try:
        with conn.cursor() as cursor:
            # Overall stats
            cursor.execute("""
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN success THEN 1 ELSE 0 END) as successful,
                    SUM(CASE WHEN NOT success THEN 1 ELSE 0 END) as failed
                FROM documents
            """)
            row = cursor.fetchone()
            # SUM over zero rows yields NULL -> coalesce to 0.
            stats['total'] = row[0] or 0
            stats['successful'] = row[1] or 0
            stats['failed'] = row[2] or 0
            # By PDF type
            cursor.execute("""
                SELECT
                    pdf_type,
                    COUNT(*) as total,
                    SUM(CASE WHEN success THEN 1 ELSE 0 END) as successful
                FROM documents
                GROUP BY pdf_type
            """)
            for row in cursor.fetchall():
                pdf_type = row[0] or 'unknown'
                stats['by_pdf_type'][pdf_type] = {
                    'total': row[1] or 0,
                    'successful': row[2] or 0
                }
            # Processing times (aggregated in SQL; individual values not loaded)
            cursor.execute("""
                SELECT AVG(processing_time_ms), MIN(processing_time_ms), MAX(processing_time_ms)
                FROM documents
                WHERE processing_time_ms > 0
            """)
            row = cursor.fetchone()
            if row[0]:
                stats['processing_time_stats'] = {
                    'avg_ms': float(row[0]),
                    'min_ms': float(row[1]),
                    'max_ms': float(row[2])
                }
            # Field stats; score >= 0.99 counts as an exact match, below as flexible
            cursor.execute("""
                SELECT
                    field_name,
                    COUNT(*) as total,
                    SUM(CASE WHEN matched THEN 1 ELSE 0 END) as matched,
                    SUM(CASE WHEN matched AND score >= 0.99 THEN 1 ELSE 0 END) as exact_match,
                    SUM(CASE WHEN matched AND score < 0.99 THEN 1 ELSE 0 END) as flexible_match,
                    AVG(CASE WHEN matched THEN score END) as avg_score
                FROM field_results
                GROUP BY field_name
                ORDER BY field_name
            """)
            for row in cursor.fetchall():
                field_name = row[0]
                stats['by_field'][field_name] = {
                    'total': row[1] or 0,
                    'matched': row[2] or 0,
                    'exact_match': row[3] or 0,
                    'flexible_match': row[4] or 0,
                    'avg_score': float(row[5]) if row[5] else 0,
                    'scores': [],  # Not loading individual scores for efficiency
                    'by_pdf_type': defaultdict(lambda: {'total': 0, 'matched': 0})
                }
            # Field stats by PDF type
            cursor.execute("""
                SELECT
                    fr.field_name,
                    d.pdf_type,
                    COUNT(*) as total,
                    SUM(CASE WHEN fr.matched THEN 1 ELSE 0 END) as matched
                FROM field_results fr
                JOIN documents d ON fr.document_id = d.document_id
                GROUP BY fr.field_name, d.pdf_type
            """)
            for row in cursor.fetchall():
                field_name = row[0]
                pdf_type = row[1] or 'unknown'
                # Only attach per-type data to fields seen in the previous query.
                if field_name in stats['by_field']:
                    stats['by_field'][field_name]['by_pdf_type'][pdf_type] = {
                        'total': row[2] or 0,
                        'matched': row[3] or 0
                    }
    finally:
        db.close()
    return stats
def load_reports_from_file(report_path: str) -> list[dict]:
    """Load all reports from JSONL file(s). Supports glob patterns."""
    path = Path(report_path)
    # A wildcard anywhere in the path triggers glob expansion of the
    # final component within its parent directory.
    has_wildcard = '*' in str(path) or '?' in str(path)
    if has_wildcard:
        report_files = sorted(path.parent.glob(path.name))
    else:
        report_files = [path]
    if not report_files:
        return []
    print(f"Reading {len(report_files)} report file(s):")
    for report_file in report_files:
        print(f" - {report_file.name}")
    reports: list[dict] = []
    for report_file in report_files:
        # Silently skip paths that do not exist (e.g. a literal path
        # passed without a wildcard).
        if not report_file.exists():
            continue
        with open(report_file, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                record = raw_line.strip()
                if record:
                    reports.append(json.loads(record))
    return reports
def analyze_reports(reports: list[dict]) -> dict:
    """Aggregate per-document auto-label reports into summary statistics."""
    stats = {
        'total': len(reports),
        'successful': 0,
        'failed': 0,
        'by_pdf_type': defaultdict(lambda: {'total': 0, 'successful': 0}),
        'by_field': defaultdict(lambda: {
            'total': 0,
            'matched': 0,
            'exact_match': 0,     # score >= 0.99
            'flexible_match': 0,  # matched but score < 0.99
            'scores': [],
            'by_pdf_type': defaultdict(lambda: {'total': 0, 'matched': 0})
        }),
        'errors': defaultdict(int),
        'processing_times': [],
    }
    for record in reports:
        pdf_kind = record.get('pdf_type') or 'unknown'
        succeeded = record.get('success', False)
        # Overall success/failure tally
        stats['successful' if succeeded else 'failed'] += 1
        # Per-PDF-type tally
        type_bucket = stats['by_pdf_type'][pdf_kind]
        type_bucket['total'] += 1
        if succeeded:
            type_bucket['successful'] += 1
        # Only positive processing times are recorded
        elapsed = record.get('processing_time_ms', 0)
        if elapsed > 0:
            stats['processing_times'].append(elapsed)
        # Error message frequency
        for message in record.get('errors', []):
            stats['errors'][message] += 1
        # Per-field match accounting
        for result in record.get('field_results', []):
            field_bucket = stats['by_field'][result['field_name']]
            per_type = field_bucket['by_pdf_type'][pdf_kind]
            field_bucket['total'] += 1
            per_type['total'] += 1
            if result.get('matched', False):
                match_score = result.get('score', 0.0)
                field_bucket['matched'] += 1
                field_bucket['scores'].append(match_score)
                per_type['matched'] += 1
                if match_score >= 0.99:
                    field_bucket['exact_match'] += 1
                else:
                    field_bucket['flexible_match'] += 1
    return stats
def print_report(stats: dict, verbose: bool = False):
    """Print a human-readable analysis report to stdout.

    Args:
        stats: Statistics dict produced by either analyze_reports() or
            load_reports_from_db(); both shapes are handled below.
        verbose: When True, also print the top error messages.
    """
    print("\n" + "=" * 60)
    print("AUTO-LABEL REPORT ANALYSIS")
    print("=" * 60)
    # Overall stats
    print(f"\n{'OVERALL STATISTICS':^60}")
    print("-" * 60)
    total = stats['total']
    successful = stats['successful']
    failed = stats['failed']
    # Guard against division by zero for an empty run.
    success_rate = successful / total * 100 if total > 0 else 0
    print(f"Total documents: {total:>8}")
    print(f"Successful: {successful:>8} ({success_rate:.1f}%)")
    print(f"Failed: {failed:>8} ({100-success_rate:.1f}%)")
    # Processing time: DB path provides pre-aggregated 'processing_time_stats';
    # the file path provides raw 'processing_times' to aggregate here.
    if 'processing_time_stats' in stats:
        pts = stats['processing_time_stats']
        print(f"\nProcessing time (ms):")
        print(f" Average: {pts['avg_ms']:>8.1f}")
        print(f" Min: {pts['min_ms']:>8.1f}")
        print(f" Max: {pts['max_ms']:>8.1f}")
    elif stats.get('processing_times'):
        times = stats['processing_times']
        avg_time = sum(times) / len(times)
        min_time = min(times)
        max_time = max(times)
        print(f"\nProcessing time (ms):")
        print(f" Average: {avg_time:>8.1f}")
        print(f" Min: {min_time:>8.1f}")
        print(f" Max: {max_time:>8.1f}")
    # By PDF type
    print(f"\n{'BY PDF TYPE':^60}")
    print("-" * 60)
    print(f"{'Type':<15} {'Total':>10} {'Success':>10} {'Rate':>10}")
    print("-" * 60)
    for pdf_type, type_stats in sorted(stats['by_pdf_type'].items()):
        type_total = type_stats['total']
        type_success = type_stats['successful']
        type_rate = type_success / type_total * 100 if type_total > 0 else 0
        print(f"{pdf_type:<15} {type_total:>10} {type_success:>10} {type_rate:>9.1f}%")
    # By field, in a fixed display order (fields absent from stats are skipped)
    print(f"\n{'FIELD MATCH STATISTICS':^60}")
    print("-" * 60)
    print(f"{'Field':<18} {'Total':>7} {'Match':>7} {'Rate':>7} {'Exact':>7} {'Flex':>7} {'AvgScore':>8}")
    print("-" * 60)
    for field_name in ['InvoiceNumber', 'InvoiceDate', 'InvoiceDueDate', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount']:
        if field_name not in stats['by_field']:
            continue
        field_stats = stats['by_field'][field_name]
        total = field_stats['total']
        matched = field_stats['matched']
        exact = field_stats['exact_match']
        flex = field_stats['flexible_match']
        rate = matched / total * 100 if total > 0 else 0
        # Handle avg_score from either DB (precomputed) or file analysis (raw scores)
        if 'avg_score' in field_stats:
            avg_score = field_stats['avg_score']
        elif field_stats['scores']:
            avg_score = sum(field_stats['scores']) / len(field_stats['scores'])
        else:
            avg_score = 0
        print(f"{field_name:<18} {total:>7} {matched:>7} {rate:>6.1f}% {exact:>7} {flex:>7} {avg_score:>8.3f}")
    # Field match broken down by PDF type
    print(f"\n{'FIELD MATCH BY PDF TYPE':^60}")
    print("-" * 60)
    for pdf_type in sorted(stats['by_pdf_type'].keys()):
        print(f"\n[{pdf_type.upper()}]")
        print(f"{'Field':<18} {'Total':>10} {'Matched':>10} {'Rate':>10}")
        print("-" * 50)
        for field_name in ['InvoiceNumber', 'InvoiceDate', 'InvoiceDueDate', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount']:
            if field_name not in stats['by_field']:
                continue
            # Fall back to zeros when this field has no data for this PDF type.
            type_stats = stats['by_field'][field_name]['by_pdf_type'].get(pdf_type, {'total': 0, 'matched': 0})
            total = type_stats['total']
            matched = type_stats['matched']
            rate = matched / total * 100 if total > 0 else 0
            print(f"{field_name:<18} {total:>10} {matched:>10} {rate:>9.1f}%")
    # Errors: only shown in verbose mode; top 20 by frequency, truncated to 50 chars
    if stats.get('errors') and verbose:
        print(f"\n{'ERRORS':^60}")
        print("-" * 60)
        for error, count in sorted(stats['errors'].items(), key=lambda x: -x[1])[:20]:
            print(f"{count:>5}x {error[:50]}")
    print("\n" + "=" * 60)
def export_json(stats: dict, output_path: str):
    """Write the statistics dict to *output_path* as pretty-printed JSON.

    Converts the internal defaultdicts to plain dicts so the standard
    json encoder can serialize them.
    """
    payload = {
        'total': stats['total'],
        'successful': stats['successful'],
        'failed': stats['failed'],
        'by_pdf_type': dict(stats['by_pdf_type']),
        'by_field': {},
        'errors': dict(stats.get('errors', {})),
    }
    # Processing-time aggregates: pre-computed by the DB loader, or derived
    # here from the raw sample list produced by the file loader.
    if 'processing_time_stats' in stats:
        payload['processing_time_stats'] = stats['processing_time_stats']
    elif stats.get('processing_times'):
        samples = stats['processing_times']
        payload['processing_time_stats'] = {
            'avg_ms': sum(samples) / len(samples),
            'min_ms': min(samples),
            'max_ms': max(samples),
            'count': len(samples)
        }
    # Per-field summary with a derived match rate.
    for name, field_stats in stats['by_field'].items():
        mean_score = field_stats.get('avg_score', 0)
        if not mean_score and field_stats.get('scores'):
            mean_score = sum(field_stats['scores']) / len(field_stats['scores'])
        payload['by_field'][name] = {
            'total': field_stats['total'],
            'matched': field_stats['matched'],
            'exact_match': field_stats['exact_match'],
            'flexible_match': field_stats['flexible_match'],
            'match_rate': field_stats['matched'] / field_stats['total'] if field_stats['total'] > 0 else 0,
            'avg_score': mean_score,
            'by_pdf_type': dict(field_stats['by_pdf_type'])
        }
    with open(output_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, indent=2, ensure_ascii=False)
    print(f"\nStatistics exported to: {output_path}")
def main():
    """CLI entry point: load stats from DB or JSONL, print, optionally export.

    Returns:
        int process exit code: 0 on success, 1 if the report file is missing.
    """
    parser = argparse.ArgumentParser(
        description='Analyze auto-label report'
    )
    parser.add_argument(
        '--report', '-r',
        default=None,
        help='Path to autolabel report JSONL file (uses database if not specified)'
    )
    parser.add_argument(
        '--output', '-o',
        help='Export statistics to JSON file'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed error messages'
    )
    parser.add_argument(
        '--from-file',
        action='store_true',
        help='Force reading from JSONL file instead of database'
    )
    args = parser.parse_args()
    # Decide source: DB is the default unless a file was requested explicitly.
    use_db = not args.from_file and args.report is None
    if use_db:
        print("Loading statistics from database...")
        stats = load_reports_from_db()
        print(f"Loaded stats for {stats['total']} documents")
    else:
        report_path = args.report or 'reports/autolabel_report.jsonl'
        path = Path(report_path)
        # Check existence only for literal paths; glob patterns are resolved
        # later by load_reports_from_file().
        if '*' not in str(path) and '?' not in str(path) and not path.exists():
            print(f"Error: Report file not found: {path}")
            return 1
        print(f"Loading reports from: {report_path}")
        reports = load_reports_from_file(report_path)
        print(f"Loaded {len(reports)} reports")
        stats = analyze_reports(reports)
    print_report(stats, verbose=args.verbose)
    if args.output:
        export_json(stats, args.output)
    return 0


if __name__ == '__main__':
    # sys.exit instead of the site-injected exit(): works under `python -S`
    # and is the conventional way to propagate the exit code.
    sys.exit(main())