Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions
--- a/src/data/autolabel_report.py
+++ b/src/data/autolabel_report.py
@@ -0,0 +1,252 @@
+"""
+Auto-Label Report Generator
+
+Generates quality control reports for auto-labeling process.
+"""
+
+import json
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class FieldMatchResult:
+    """Outcome of matching a single CSV field value against a document.
+
+    ``to_dict`` produces a view of the record built from plain Python
+    values so it can be passed to ``json.dumps``.
+    """
+    field_name: str
+    csv_value: str | None
+    matched: bool
+    score: float = 0.0
+    matched_text: str | None = None
+    candidate_used: str | None = None  # Which normalized variant matched
+    bbox: tuple[float, float, float, float] | None = None
+    page_no: int = 0
+    context_keywords: list[str] = field(default_factory=list)
+    error: str | None = None
+
+    def to_dict(self) -> dict:
+        """Convert to a dictionary of plain Python values.
+
+        Returns:
+            A dict with one key per dataclass field. ``bbox`` becomes a
+            list of floats (``None`` when unset or empty), ``score`` a
+            float and ``page_no`` an int; a falsy ``score`` or ``page_no``
+            is reported as 0.
+        """
+        # Check for None and length explicitly instead of relying on
+        # truthiness: ``if self.bbox:`` raises ValueError when bbox is a
+        # multi-element numpy array, which is the very input the float()
+        # conversion below exists to handle.
+        bbox_list = None
+        if self.bbox is not None and len(self.bbox) > 0:
+            # Native floats avoid numpy scalar serialization issues.
+            bbox_list = [float(x) for x in self.bbox]
+
+        return {
+            'field_name': self.field_name,
+            'csv_value': self.csv_value,
+            'matched': self.matched,
+            'score': float(self.score) if self.score else 0.0,
+            'matched_text': self.matched_text,
+            'candidate_used': self.candidate_used,
+            'bbox': bbox_list,
+            'page_no': int(self.page_no) if self.page_no else 0,
+            'context_keywords': self.context_keywords,
+            'error': self.error
+        }
+
+
+@dataclass
+class AutoLabelReport:
+    """Per-document record of one auto-labeling run.
+
+    Collects the individual field match results together with counters
+    that ``add_field_result`` updates as results are appended.
+    """
+    document_id: str
+    pdf_path: str | None = None
+    pdf_type: str | None = None  # 'text' | 'scanned' | 'mixed'
+    success: bool = False
+    total_pages: int = 0
+    fields_matched: int = 0
+    fields_total: int = 0
+    field_results: list[FieldMatchResult] = field(default_factory=list)
+    annotations_generated: int = 0
+    image_paths: list[str] = field(default_factory=list)
+    label_paths: list[str] = field(default_factory=list)
+    processing_time_ms: float = 0.0
+    # Defaults to the local time at construction, in ISO 8601 form.
+    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+    errors: list[str] = field(default_factory=list)
+
+    def add_field_result(self, result: FieldMatchResult) -> None:
+        """Record one field result and bump the total/matched counters."""
+        self.field_results.append(result)
+        self.fields_total += 1
+        self.fields_matched += 1 if result.matched else 0
+
+    def to_dict(self) -> dict:
+        """Return the report as a dict, with field results converted too.
+
+        Keys appear in a fixed order; ``field_results`` holds the output of
+        each result's own ``to_dict``.
+        """
+        key_order = (
+            'document_id',
+            'pdf_path',
+            'pdf_type',
+            'success',
+            'total_pages',
+            'fields_matched',
+            'fields_total',
+            'field_results',
+            'annotations_generated',
+            'image_paths',
+            'label_paths',
+            'processing_time_ms',
+            'timestamp',
+            'errors',
+        )
+        payload = {}
+        for key in key_order:
+            value = getattr(self, key)
+            if key == 'field_results':
+                # Nested results are objects; serialize each one.
+                value = [r.to_dict() for r in value]
+            payload[key] = value
+        return payload
+
+    def to_json(self, indent: int | None = None) -> str:
+        """Serialize the report to a JSON string (non-ASCII kept as-is)."""
+        payload = self.to_dict()
+        return json.dumps(payload, indent=indent, ensure_ascii=False)
+
+    @property
+    def match_rate(self) -> float:
+        """Fraction of recorded fields that matched; 0.0 when none recorded."""
+        return 0.0 if self.fields_total == 0 else self.fields_matched / self.fields_total
+
+    def get_summary(self) -> dict:
+        """Return a compact, display-oriented overview of the report."""
+        rate_text = f"{self.match_rate:.1%}"
+        fields_text = f"{self.fields_matched}/{self.fields_total}"
+        error_count = len(self.errors)
+        return {
+            'document_id': self.document_id,
+            'success': self.success,
+            'match_rate': rate_text,
+            'fields': fields_text,
+            'annotations': self.annotations_generated,
+            'errors': error_count
+        }
+
+
+class ReportWriter:
+    """Appends auto-label reports to a JSONL file, one JSON object per line."""
+
+    def __init__(self, output_path: str | Path):
+        """
+        Initialize report writer.
+
+        Creates the parent directory of ``output_path`` if it is missing.
+
+        Args:
+            output_path: Path to output JSONL file
+        """
+        self.output_path = Path(output_path)
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _append_lines(self, lines) -> None:
+        """Append already-serialized JSON strings, each followed by a newline."""
+        # Leaving the ``with`` block closes the file, which also flushes it,
+        # so no explicit flush() is needed.
+        with open(self.output_path, 'a', encoding='utf-8') as f:
+            f.writelines(line + '\n' for line in lines)
+
+    def write(self, report: AutoLabelReport) -> None:
+        """Append a report to the output file."""
+        self._append_lines([report.to_json()])
+
+    def write_dict(self, report_dict: dict) -> None:
+        """Append a report dict to the output file (for parallel processing)."""
+        self._append_lines([json.dumps(report_dict, ensure_ascii=False)])
+
+    def write_batch(self, reports: list[AutoLabelReport]) -> None:
+        """Append multiple reports, opening the output file only once."""
+        self._append_lines(report.to_json() for report in reports)
+
+
+class ReportReader:
+    """Reads auto-label reports from a JSONL file and summarizes them."""
+
+    def __init__(self, input_path: str | Path):
+        """
+        Initialize report reader.
+
+        Args:
+            input_path: Path to input JSONL file
+        """
+        self.input_path = Path(input_path)
+
+    def read_all(self) -> list[AutoLabelReport]:
+        """Read all reports from file.
+
+        Blank lines are skipped. A missing file yields an empty list.
+
+        Raises:
+            json.JSONDecodeError: If a non-blank line is not valid JSON. The
+                message names the file and the 1-based line number.
+        """
+        reports = []
+
+        if not self.input_path.exists():
+            return reports
+
+        with open(self.input_path, 'r', encoding='utf-8') as f:
+            for line_no, line in enumerate(f, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError as err:
+                    # Same exception type as before, but say where in the
+                    # JSONL file the bad record is; the decoder's own
+                    # position only refers to the single line it was given.
+                    raise json.JSONDecodeError(
+                        f"{err.msg} (file {self.input_path}, line {line_no})",
+                        err.doc,
+                        err.pos,
+                    ) from err
+                reports.append(self._dict_to_report(data))
+
+        return reports
+
+    @staticmethod
+    def _value_or(source: dict, key: str, default):
+        """Return ``source[key]``, or ``default`` when missing or null."""
+        value = source.get(key)
+        return default if value is None else value
+
+    def _dict_to_report(self, data: dict) -> AutoLabelReport:
+        """Convert dictionary to AutoLabelReport.
+
+        ``document_id`` and each field result's ``field_name`` are required.
+        List and numeric keys that are missing or explicitly null fall back
+        to an empty list or zero so later aggregation does not meet ``None``.
+        """
+        value_or = self._value_or
+
+        field_results = []
+        for fr_data in value_or(data, 'field_results', []):
+            bbox = tuple(fr_data['bbox']) if fr_data.get('bbox') else None
+            field_results.append(FieldMatchResult(
+                field_name=fr_data['field_name'],
+                csv_value=fr_data.get('csv_value'),
+                matched=fr_data.get('matched', False),
+                score=value_or(fr_data, 'score', 0.0),
+                matched_text=fr_data.get('matched_text'),
+                candidate_used=fr_data.get('candidate_used'),
+                bbox=bbox,
+                page_no=value_or(fr_data, 'page_no', 0),
+                context_keywords=value_or(fr_data, 'context_keywords', []),
+                error=fr_data.get('error')
+            ))
+
+        return AutoLabelReport(
+            document_id=data['document_id'],
+            pdf_path=data.get('pdf_path'),
+            pdf_type=data.get('pdf_type'),
+            success=data.get('success', False),
+            total_pages=value_or(data, 'total_pages', 0),
+            fields_matched=value_or(data, 'fields_matched', 0),
+            fields_total=value_or(data, 'fields_total', 0),
+            field_results=field_results,
+            annotations_generated=value_or(data, 'annotations_generated', 0),
+            image_paths=value_or(data, 'image_paths', []),
+            label_paths=value_or(data, 'label_paths', []),
+            processing_time_ms=value_or(data, 'processing_time_ms', 0.0),
+            timestamp=data.get('timestamp', ''),
+            errors=value_or(data, 'errors', [])
+        )
+
+    def get_statistics(self) -> dict:
+        """Calculate statistics from all reports.
+
+        Returns:
+            ``{'total': 0}`` when there are no reports. Otherwise a dict with
+            document counts, overall field match totals and rate, the total
+            annotation count, and ``field_statistics`` keyed by field name
+            (``matched``, ``total``, ``avg_score`` over matched results only,
+            and ``match_rate``).
+        """
+        reports = self.read_all()
+
+        if not reports:
+            return {'total': 0}
+
+        successful = sum(1 for r in reports if r.success)
+        total_fields_matched = sum(r.fields_matched for r in reports)
+        total_fields = sum(r.fields_total for r in reports)
+        total_annotations = sum(r.annotations_generated for r in reports)
+
+        # Per-field statistics
+        field_stats = {}
+        for report in reports:
+            for fr in report.field_results:
+                stats = field_stats.setdefault(
+                    fr.field_name, {'matched': 0, 'total': 0, 'avg_score': 0.0}
+                )
+                stats['total'] += 1
+                if fr.matched:
+                    stats['matched'] += 1
+                    # Holds the running sum of scores until averaged below.
+                    stats['avg_score'] += fr.score
+
+        # Calculate averages
+        for stats in field_stats.values():
+            if stats['matched'] > 0:
+                stats['avg_score'] /= stats['matched']
+            stats['match_rate'] = stats['matched'] / stats['total'] if stats['total'] > 0 else 0
+
+        return {
+            'total': len(reports),
+            'successful': successful,
+            'success_rate': successful / len(reports),
+            'total_fields_matched': total_fields_matched,
+            'total_fields': total_fields,
+            'overall_match_rate': total_fields_matched / total_fields if total_fields > 0 else 0,
+            'total_annotations': total_annotations,
+            'field_statistics': field_stats
+        }