Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Yaojia Wang
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions

src/matcher/__init__.py Normal file

@@ -0,0 +1,3 @@
from .field_matcher import FieldMatcher, Match, find_field_matches
__all__ = ['FieldMatcher', 'Match', 'find_field_matches']

src/matcher/field_matcher.py Normal file

@@ -0,0 +1,618 @@
"""
Field Matching Module
Matches normalized field values to tokens extracted from documents.
"""
import re
from dataclasses import dataclass
from typing import Protocol


class TokenLike(Protocol):
    """Protocol for token objects."""
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int


@dataclass
class Match:
    """Represents a matched field in the document."""
    field: str
    value: str
    bbox: tuple[float, float, float, float]  # (x0, y0, x1, y1)
    page_no: int
    score: float  # 0-1 confidence score
    matched_text: str  # Actual text that matched
    context_keywords: list[str]  # Nearby keywords that boosted confidence

    def to_yolo_format(self, image_width: float, image_height: float, class_id: int) -> str:
        """Convert to YOLO annotation format."""
        x0, y0, x1, y1 = self.bbox
        x_center = (x0 + x1) / 2 / image_width
        y_center = (y0 + y1) / 2 / image_height
        width = (x1 - x0) / image_width
        height = (y1 - y0) / image_height
        return f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"
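        # Illustrative example (hypothetical numbers): bbox (100, 50, 200, 80)
        # on a 1000x800 px page with class_id 0 yields
        #   x_center = 150/1000 = 0.15, y_center = 65/800 = 0.08125,
        #   width = 100/1000 = 0.1, height = 30/800 = 0.0375,
        # i.e. the annotation line "0 0.150000 0.081250 0.100000 0.037500".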
# Context keywords for each field type (Swedish invoice terms)
CONTEXT_KEYWORDS = {
    'InvoiceNumber': ['fakturanr', 'fakturanummer', 'invoice', 'inv.nr', 'inv nr', 'nr'],
    'InvoiceDate': ['fakturadatum', 'datum', 'date', 'utfärdad', 'utskriftsdatum', 'dokumentdatum'],
    'InvoiceDueDate': ['förfallodatum', 'förfaller', 'due date', 'betalas senast', 'att betala senast',
                       'förfallodag', 'oss tillhanda senast', 'senast'],
    'OCR': ['ocr', 'referens', 'betalningsreferens', 'ref'],
    'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
    'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
    'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
}


class FieldMatcher:
    """Matches field values to document tokens."""

    def __init__(
        self,
        context_radius: float = 100.0,  # pixels
        min_score_threshold: float = 0.5
    ):
        """
        Initialize the matcher.

        Args:
            context_radius: Distance within which to search for context keywords
            min_score_threshold: Minimum score for a match to be considered valid
        """
        self.context_radius = context_radius
        self.min_score_threshold = min_score_threshold

    def find_matches(
        self,
        tokens: list[TokenLike],
        field_name: str,
        normalized_values: list[str],
        page_no: int = 0
    ) -> list[Match]:
        """
        Find all matches for a field in the token list.

        Args:
            tokens: List of tokens from the document
            field_name: Name of the field to match
            normalized_values: List of normalized value variants to search for
            page_no: Page number to filter tokens

        Returns:
            List of Match objects sorted by score (descending)
        """
        matches = []
        page_tokens = [t for t in tokens if t.page_no == page_no]

        for value in normalized_values:
            # Strategy 1: Exact token match
            exact_matches = self._find_exact_matches(page_tokens, value, field_name)
            matches.extend(exact_matches)

            # Strategy 2: Multi-token concatenation
            concat_matches = self._find_concatenated_matches(page_tokens, value, field_name)
            matches.extend(concat_matches)

            # Strategy 3: Fuzzy match (numeric tolerance; only amounts produce matches)
            if field_name == 'Amount':
                fuzzy_matches = self._find_fuzzy_matches(page_tokens, value, field_name)
                matches.extend(fuzzy_matches)

            # Strategy 4: Substring match (for dates embedded in longer text)
            if field_name in ('InvoiceDate', 'InvoiceDueDate'):
                substring_matches = self._find_substring_matches(page_tokens, value, field_name)
                matches.extend(substring_matches)

        # Strategy 5: Flexible date matching (year-month match, nearby dates,
        # heuristic selection). Only runs when the strategies above found
        # nothing for a date field.
        if field_name in ('InvoiceDate', 'InvoiceDueDate') and not matches:
            flexible_matches = self._find_flexible_date_matches(
                page_tokens, normalized_values, field_name
            )
            matches.extend(flexible_matches)

        # Deduplicate and sort by score
        matches = self._deduplicate_matches(matches)
        matches.sort(key=lambda m: m.score, reverse=True)
        return [m for m in matches if m.score >= self.min_score_threshold]
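        # Illustrative example: for InvoiceDate '2026-01-09', an exact token
        # '2026-01-09' (score 1.0) and a longer token 'Fakturadatum: 2026-01-09'
        # (substring match, base score 0.85 with the inline keyword) occupy
        # different boxes, so both survive deduplication and the exact match
        # is returned first.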
    def _find_exact_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find tokens that exactly match the value."""
        matches = []
        for token in tokens:
            token_text = token.text.strip()
            # Exact match
            if token_text == value:
                score = 1.0
            # Case-insensitive match
            elif token_text.lower() == value.lower():
                score = 0.95
            # Digits-only match for numeric fields
            elif field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro'):
                token_digits = re.sub(r'\D', '', token_text)
                value_digits = re.sub(r'\D', '', value)
                if token_digits and token_digits == value_digits:
                    score = 0.9
                else:
                    continue
            else:
                continue

            # Boost the score if context keywords are nearby
            context_keywords, context_boost = self._find_context_keywords(
                tokens, token, field_name
            )
            score = min(1.0, score + context_boost)
            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=score,
                matched_text=token_text,
                context_keywords=context_keywords
            ))
        return matches
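        # Illustrative example: for 'Bankgiro', a token '5393-9484' matches the
        # value '53939484' through the digits-only branch (base score 0.9), and
        # a 'bankgiro' keyword within context_radius lifts it by 0.05 to 0.95.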
    def _find_concatenated_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find the value by concatenating adjacent tokens on the same line."""
        matches = []
        value_clean = re.sub(r'\s+', '', value)
        # Sort tokens by position (top-to-bottom, left-to-right)
        sorted_tokens = sorted(tokens, key=lambda t: (t.bbox[1], t.bbox[0]))

        for i, start_token in enumerate(sorted_tokens):
            # Try to build the value by concatenating nearby tokens
            concat_text = start_token.text.strip()
            concat_bbox = list(start_token.bbox)
            used_tokens = [start_token]

            for j in range(i + 1, min(i + 5, len(sorted_tokens))):  # Max 5 tokens
                next_token = sorted_tokens[j]
                # Stop if the tokens are not on the same line (no y overlap)
                if not self._tokens_on_same_line(start_token, next_token):
                    break
                # Stop if the horizontal gap is too large
                if next_token.bbox[0] - concat_bbox[2] > 50:  # Max 50px gap
                    break

                concat_text += next_token.text.strip()
                used_tokens.append(next_token)
                # Grow the bounding box to cover the new token
                concat_bbox[0] = min(concat_bbox[0], next_token.bbox[0])
                concat_bbox[1] = min(concat_bbox[1], next_token.bbox[1])
                concat_bbox[2] = max(concat_bbox[2], next_token.bbox[2])
                concat_bbox[3] = max(concat_bbox[3], next_token.bbox[3])

                # Check for a match
                concat_clean = re.sub(r'\s+', '', concat_text)
                if concat_clean == value_clean:
                    context_keywords, context_boost = self._find_context_keywords(
                        tokens, start_token, field_name
                    )
                    matches.append(Match(
                        field=field_name,
                        value=value,
                        bbox=tuple(concat_bbox),
                        page_no=start_token.page_no,
                        score=min(1.0, 0.85 + context_boost),  # Slightly lower base score
                        matched_text=concat_text,
                        context_keywords=context_keywords
                    ))
                    break
        return matches

    def _find_substring_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """
        Find the value as a substring within longer tokens.

        Handles cases like 'Fakturadatum: 2026-01-09' where the date
        is embedded in a longer text string.

        Uses a lower base score (0.75) than an exact match so that exact
        matches are preferred. Only matches if the value appears as a
        distinct segment (not as part of a longer number).
        """
        matches = []
        # Only used for date fields - other fields risk false positives
        if field_name not in ('InvoiceDate', 'InvoiceDueDate'):
            return matches

        for token in tokens:
            token_text = token.text.strip()
            # Skip tokens no longer than the value (exact matches are handled elsewhere)
            if len(token_text) <= len(value):
                continue
            # Check if the value appears as a substring
            if value in token_text:
                # Verify it is a proper boundary match (not part of a larger number)
                idx = token_text.find(value)
                # Check the character before (if any)
                if idx > 0:
                    char_before = token_text[idx - 1]
                    # Must be a non-digit (':', ' ', '-', etc. are allowed)
                    if char_before.isdigit():
                        continue
                # Check the character after (if any)
                end_idx = idx + len(value)
                if end_idx < len(token_text):
                    char_after = token_text[end_idx]
                    # Must be a non-digit
                    if char_after.isdigit():
                        continue

                # Found a valid substring match
                context_keywords, context_boost = self._find_context_keywords(
                    tokens, token, field_name
                )
                # Check if a context keyword is inside the same token (like 'Fakturadatum:')
                token_lower = token_text.lower()
                inline_context = []
                for keyword in CONTEXT_KEYWORDS.get(field_name, []):
                    if keyword in token_lower:
                        inline_context.append(keyword)
                # Boost the score if a keyword is inline
                inline_boost = 0.1 if inline_context else 0
                matches.append(Match(
                    field=field_name,
                    value=value,
                    bbox=token.bbox,  # Use the full token bbox
                    page_no=token.page_no,
                    score=min(1.0, 0.75 + context_boost + inline_boost),  # Lower than exact match
                    matched_text=token_text,
                    context_keywords=context_keywords + inline_context
                ))
        return matches
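        # Illustrative example: token 'Fakturadatum: 2026-01-09' with value
        # '2026-01-09' passes both boundary checks (a space before, nothing
        # after), and the inline 'fakturadatum' keyword raises the base score
        # to 0.75 + 0.1 = 0.85.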
    def _find_fuzzy_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find approximate numeric matches for amounts (date fields are
        handled by the flexible date matcher instead)."""
        matches = []
        for token in tokens:
            token_text = token.text.strip()
            if field_name == 'Amount':
                # Try to parse both as numbers
                try:
                    token_num = self._parse_amount(token_text)
                    value_num = self._parse_amount(value)
                    if token_num is not None and value_num is not None:
                        if abs(token_num - value_num) < 0.01:  # Within 1 cent
                            context_keywords, context_boost = self._find_context_keywords(
                                tokens, token, field_name
                            )
                            matches.append(Match(
                                field=field_name,
                                value=value,
                                bbox=token.bbox,
                                page_no=token.page_no,
                                score=min(1.0, 0.8 + context_boost),
                                matched_text=token_text,
                                context_keywords=context_keywords
                            ))
                except (ValueError, TypeError):
                    pass
        return matches

    def _find_flexible_date_matches(
        self,
        tokens: list[TokenLike],
        normalized_values: list[str],
        field_name: str
    ) -> list[Match]:
        """
        Flexible date matching when exact matching fails.

        Strategies:
        1. Year-month match: if the CSV has 2026-01-15, match any 2026-01-XX date
        2. Nearby date match: match dates within a few days of the CSV value
        3. Heuristic selection: use context keywords to select the best date

        This handles cases where the CSV InvoiceDate does not exactly match
        the PDF, but a reasonable date can still be found to label.
        """
        from datetime import datetime

        matches = []
        # Parse the target date from the normalized values
        target_date = None
        for value in normalized_values:
            # Try to parse the YYYY-MM-DD format
            date_match = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', value)
            if date_match:
                try:
                    target_date = datetime(
                        int(date_match.group(1)),
                        int(date_match.group(2)),
                        int(date_match.group(3))
                    )
                    break
                except ValueError:
                    continue
        if not target_date:
            return matches

        # Find all date-like tokens in the document
        date_candidates = []
        date_pattern = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
        for token in tokens:
            token_text = token.text.strip()
            # Search for the date pattern in the token
            for match in date_pattern.finditer(token_text):
                try:
                    found_date = datetime(
                        int(match.group(1)),
                        int(match.group(2)),
                        int(match.group(3))
                    )
                    date_str = match.group(0)
                    # Calculate the date difference
                    days_diff = abs((found_date - target_date).days)
                    # Check for nearby context keywords
                    context_keywords, context_boost = self._find_context_keywords(
                        tokens, token, field_name
                    )
                    # Check if a keyword is inside the same token
                    token_lower = token_text.lower()
                    inline_keywords = []
                    for keyword in CONTEXT_KEYWORDS.get(field_name, []):
                        if keyword in token_lower:
                            inline_keywords.append(keyword)
                    date_candidates.append({
                        'token': token,
                        'date': found_date,
                        'date_str': date_str,
                        'matched_text': token_text,
                        'days_diff': days_diff,
                        'context_keywords': context_keywords + inline_keywords,
                        'context_boost': context_boost + (0.1 if inline_keywords else 0),
                        'same_year_month': (found_date.year == target_date.year and
                                            found_date.month == target_date.month),
                    })
                except ValueError:
                    continue
        if not date_candidates:
            return matches

        # Score and rank the candidates
        for candidate in date_candidates:
            score = 0.0
            # Strategy 1: Same year-month gets a higher score
            if candidate['same_year_month']:
                score = 0.7
                # Bonus if the day is close
                if candidate['days_diff'] <= 7:
                    score = 0.75
                if candidate['days_diff'] <= 3:
                    score = 0.8
            # Strategy 2: Nearby dates (within 14 days)
            elif candidate['days_diff'] <= 14:
                score = 0.6
            elif candidate['days_diff'] <= 30:
                score = 0.55
            else:
                # Too far apart; skip unless there is strong context
                if not candidate['context_keywords']:
                    continue
                score = 0.5
            # Strategy 3: Boost with context keywords (invoice-date keywords
            # for InvoiceDate, due-date keywords for InvoiceDueDate)
            score = min(1.0, score + candidate['context_boost'])
            if candidate['context_keywords']:
                score = min(1.0, score + 0.05)
            if score >= self.min_score_threshold:
                matches.append(Match(
                    field=field_name,
                    value=candidate['date_str'],
                    bbox=candidate['token'].bbox,
                    page_no=candidate['token'].page_no,
                    score=score,
                    matched_text=candidate['matched_text'],
                    context_keywords=candidate['context_keywords']
                ))

        # Sort by score and return only the best match, to avoid
        # emitting multiple labels for the same field
        matches.sort(key=lambda m: m.score, reverse=True)
        return matches[:1]
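        # Illustrative example: with a CSV target of 2026-01-15, a PDF token
        # '2026-01-12' shares the year-month and is 3 days off, so it starts
        # at 0.8; keyword boosts can add up to 0.3 more (capped at 1.0), and
        # only the single best-scoring date is returned.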
    def _find_context_keywords(
        self,
        tokens: list[TokenLike],
        target_token: TokenLike,
        field_name: str
    ) -> tuple[list[str], float]:
        """Find context keywords near the target token."""
        keywords = CONTEXT_KEYWORDS.get(field_name, [])
        found_keywords = []
        target_center = (
            (target_token.bbox[0] + target_token.bbox[2]) / 2,
            (target_token.bbox[1] + target_token.bbox[3]) / 2
        )
        for token in tokens:
            if token is target_token:
                continue
            token_center = (
                (token.bbox[0] + token.bbox[2]) / 2,
                (token.bbox[1] + token.bbox[3]) / 2
            )
            # Euclidean distance between the token centers
            distance = (
                (target_center[0] - token_center[0]) ** 2 +
                (target_center[1] - token_center[1]) ** 2
            ) ** 0.5
            if distance <= self.context_radius:
                token_lower = token.text.lower()
                for keyword in keywords:
                    if keyword in token_lower:
                        found_keywords.append(keyword)
        # The boost grows with the number of keywords found, capped at 0.15
        boost = min(0.15, len(found_keywords) * 0.05)
        return found_keywords, boost

    def _tokens_on_same_line(self, token1: TokenLike, token2: TokenLike) -> bool:
        """Check if two tokens are on the same line."""
        # Require vertical overlap of more than half the smaller token's height
        y_overlap = min(token1.bbox[3], token2.bbox[3]) - max(token1.bbox[1], token2.bbox[1])
        min_height = min(token1.bbox[3] - token1.bbox[1], token2.bbox[3] - token2.bbox[1])
        return y_overlap > min_height * 0.5

    def _parse_amount(self, text: str) -> float | None:
        """Try to parse text as a monetary amount."""
        # Strip currency markers ('SEK', 'kr') and the Swedish ':-' notation
        text = re.sub(r'SEK|kr|[:-]', '', text, flags=re.IGNORECASE)
        text = text.replace(' ', '').replace('\xa0', '')
        # Treat a lone comma as the decimal separator (Swedish convention)
        if ',' in text and '.' not in text:
            text = text.replace(',', '.')
        try:
            return float(text)
        except ValueError:
            return None
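        # Illustrative example: '1 234,56 kr' -> strip 'kr' -> '1 234,56'
        # -> drop spaces -> '1234,56' -> comma to dot -> 1234.56.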
    def _deduplicate_matches(self, matches: list[Match]) -> list[Match]:
        """Remove duplicate matches based on bbox overlap."""
        if not matches:
            return []
        # Sort by score descending so the best match wins
        matches.sort(key=lambda m: m.score, reverse=True)
        unique = []
        for match in matches:
            is_duplicate = False
            for existing in unique:
                if self._bbox_overlap(match.bbox, existing.bbox) > 0.7:
                    is_duplicate = True
                    break
            if not is_duplicate:
                unique.append(match)
        return unique

    def _bbox_overlap(
        self,
        bbox1: tuple[float, float, float, float],
        bbox2: tuple[float, float, float, float]
    ) -> float:
        """Calculate IoU (Intersection over Union) of two bounding boxes."""
        x1 = max(bbox1[0], bbox2[0])
        y1 = max(bbox1[1], bbox2[1])
        x2 = min(bbox1[2], bbox2[2])
        y2 = min(bbox1[3], bbox2[3])
        if x2 <= x1 or y2 <= y1:
            return 0.0
        intersection = (x2 - x1) * (y2 - y1)
        area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
        union = area1 + area2 - intersection
        return intersection / union if union > 0 else 0.0
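        # Illustrative example: bbox1 = (0, 0, 10, 10) and bbox2 = (5, 5, 15, 15)
        # intersect in a 5x5 square, so IoU = 25 / (100 + 100 - 25) ~= 0.143,
        # well below the 0.7 deduplication threshold used above.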
def find_field_matches(
    tokens: list[TokenLike],
    field_values: dict[str, str],
    page_no: int = 0
) -> dict[str, list[Match]]:
    """
    Convenience function to find matches for multiple fields.

    Args:
        tokens: List of tokens from the document
        field_values: Dict of field_name -> value to search for
        page_no: Page number

    Returns:
        Dict of field_name -> list of matches
    """
    from ..normalize import normalize_field

    matcher = FieldMatcher()
    results = {}
    for field_name, value in field_values.items():
        if value is None or str(value).strip() == '':
            continue
        normalized_values = normalize_field(field_name, str(value))
        matches = matcher.find_matches(tokens, field_name, normalized_values, page_no)
        results[field_name] = matches
    return results
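
A minimal end-to-end sketch of the module (illustrative only: the Token class, coordinates, page size, and class_id below are made up, and it assumes the sibling normalize module from this commit is importable):

from dataclasses import dataclass
from matcher import find_field_matches

@dataclass
class Token:  # any object with these attributes satisfies TokenLike
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int = 0

tokens = [
    Token('Fakturanr:', (40.0, 30.0, 110.0, 45.0)),
    Token('10023', (120.0, 30.0, 160.0, 45.0)),
]
results = find_field_matches(tokens, {'InvoiceNumber': '10023'})
for field, field_matches in results.items():
    if field_matches:  # best match first; one YOLO line per field
        print(field_matches[0].to_yolo_format(image_width=595, image_height=842, class_id=0))

Here the exact token '10023' scores 1.0, and the nearby 'Fakturanr:' token adds a context-keyword boost before the match is converted to a YOLO annotation line.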