Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Yaojia Wang
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions

src/matcher/__init__.py Normal file

@@ -0,0 +1,3 @@
from .field_matcher import FieldMatcher, Match, find_field_matches
__all__ = ['FieldMatcher', 'Match', 'find_field_matches']

src/matcher/field_matcher.py Normal file

@@ -0,0 +1,618 @@
"""
Field Matching Module
Matches normalized field values to tokens extracted from documents.
"""
import re
from dataclasses import dataclass
from typing import Protocol


class TokenLike(Protocol):
    """Protocol for token objects."""
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int


@dataclass
class Match:
    """Represents a matched field in the document."""
    field: str
    value: str
    bbox: tuple[float, float, float, float]  # (x0, y0, x1, y1)
    page_no: int
    score: float  # 0-1 confidence score
    matched_text: str  # Actual text that matched
    context_keywords: list[str]  # Nearby keywords that boosted confidence

    def to_yolo_format(self, image_width: float, image_height: float, class_id: int) -> str:
        """Convert to YOLO annotation format."""
        x0, y0, x1, y1 = self.bbox
        x_center = (x0 + x1) / 2 / image_width
        y_center = (y0 + y1) / 2 / image_height
        width = (x1 - x0) / image_width
        height = (y1 - y0) / image_height
        return f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"
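        # Illustrative example (hypothetical numbers): bbox (100, 50, 200, 80)
        # on a 1000x800 px page with class_id 0 yields
        #   x_center = 150/1000 = 0.15, y_center = 65/800 = 0.08125,
        #   width = 100/1000 = 0.1, height = 30/800 = 0.0375,
        # i.e. the annotation line "0 0.150000 0.081250 0.100000 0.037500".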
# Context keywords for each field type (Swedish invoice terms)
CONTEXT_KEYWORDS = {
    'InvoiceNumber': ['fakturanr', 'fakturanummer', 'invoice', 'inv.nr', 'inv nr', 'nr'],
    'InvoiceDate': ['fakturadatum', 'datum', 'date', 'utfärdad', 'utskriftsdatum', 'dokumentdatum'],
    'InvoiceDueDate': ['förfallodatum', 'förfaller', 'due date', 'betalas senast', 'att betala senast',
                       'förfallodag', 'oss tillhanda senast', 'senast'],
    'OCR': ['ocr', 'referens', 'betalningsreferens', 'ref'],
    'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
    'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
    'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
}


class FieldMatcher:
    """Matches field values to document tokens."""

    def __init__(
        self,
        context_radius: float = 100.0,  # pixels
        min_score_threshold: float = 0.5
    ):
        """
        Initialize the matcher.

        Args:
            context_radius: Distance within which to search for context keywords
            min_score_threshold: Minimum score for a match to be considered valid
        """
        self.context_radius = context_radius
        self.min_score_threshold = min_score_threshold

    def find_matches(
        self,
        tokens: list[TokenLike],
        field_name: str,
        normalized_values: list[str],
        page_no: int = 0
    ) -> list[Match]:
        """
        Find all matches for a field in the token list.

        Args:
            tokens: List of tokens from the document
            field_name: Name of the field to match
            normalized_values: List of normalized value variants to search for
            page_no: Page number to filter tokens

        Returns:
            List of Match objects sorted by score (descending)
        """
        matches = []
        page_tokens = [t for t in tokens if t.page_no == page_no]

        for value in normalized_values:
            # Strategy 1: Exact token match
            exact_matches = self._find_exact_matches(page_tokens, value, field_name)
            matches.extend(exact_matches)

            # Strategy 2: Multi-token concatenation
            concat_matches = self._find_concatenated_matches(page_tokens, value, field_name)
            matches.extend(concat_matches)

            # Strategy 3: Fuzzy match (numeric tolerance; only amounts produce matches)
            if field_name == 'Amount':
                fuzzy_matches = self._find_fuzzy_matches(page_tokens, value, field_name)
                matches.extend(fuzzy_matches)

            # Strategy 4: Substring match (for dates embedded in longer text)
            if field_name in ('InvoiceDate', 'InvoiceDueDate'):
                substring_matches = self._find_substring_matches(page_tokens, value, field_name)
                matches.extend(substring_matches)

        # Strategy 5: Flexible date matching (year-month match, nearby dates,
        # heuristic selection). Only runs when the strategies above found
        # nothing for a date field.
        if field_name in ('InvoiceDate', 'InvoiceDueDate') and not matches:
            flexible_matches = self._find_flexible_date_matches(
                page_tokens, normalized_values, field_name
            )
            matches.extend(flexible_matches)

        # Deduplicate and sort by score
        matches = self._deduplicate_matches(matches)
        matches.sort(key=lambda m: m.score, reverse=True)
        return [m for m in matches if m.score >= self.min_score_threshold]
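        # Illustrative example: for InvoiceDate '2026-01-09', an exact token
        # '2026-01-09' (score 1.0) and a longer token 'Fakturadatum: 2026-01-09'
        # (substring match, base score 0.85 with the inline keyword) occupy
        # different boxes, so both survive deduplication and the exact match
        # is returned first.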
    def _find_exact_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find tokens that exactly match the value."""
        matches = []
        for token in tokens:
            token_text = token.text.strip()
            # Exact match
            if token_text == value:
                score = 1.0
            # Case-insensitive match
            elif token_text.lower() == value.lower():
                score = 0.95
            # Digits-only match for numeric fields
            elif field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro'):
                token_digits = re.sub(r'\D', '', token_text)
                value_digits = re.sub(r'\D', '', value)
                if token_digits and token_digits == value_digits:
                    score = 0.9
                else:
                    continue
            else:
                continue

            # Boost the score if context keywords are nearby
            context_keywords, context_boost = self._find_context_keywords(
                tokens, token, field_name
            )
            score = min(1.0, score + context_boost)
            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=score,
                matched_text=token_text,
                context_keywords=context_keywords
            ))
        return matches
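        # Illustrative example: for 'Bankgiro', a token '5393-9484' matches the
        # value '53939484' through the digits-only branch (base score 0.9), and
        # a 'bankgiro' keyword within context_radius lifts it by 0.05 to 0.95.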
    def _find_concatenated_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find the value by concatenating adjacent tokens on the same line."""
        matches = []
        value_clean = re.sub(r'\s+', '', value)
        # Sort tokens by position (top-to-bottom, left-to-right)
        sorted_tokens = sorted(tokens, key=lambda t: (t.bbox[1], t.bbox[0]))

        for i, start_token in enumerate(sorted_tokens):
            # Try to build the value by concatenating nearby tokens
            concat_text = start_token.text.strip()
            concat_bbox = list(start_token.bbox)
            used_tokens = [start_token]

            for j in range(i + 1, min(i + 5, len(sorted_tokens))):  # Max 5 tokens
                next_token = sorted_tokens[j]
                # Stop if the tokens are not on the same line (no y overlap)
                if not self._tokens_on_same_line(start_token, next_token):
                    break
                # Stop if the horizontal gap is too large
                if next_token.bbox[0] - concat_bbox[2] > 50:  # Max 50px gap
                    break

                concat_text += next_token.text.strip()
                used_tokens.append(next_token)
                # Grow the bounding box to cover the new token
                concat_bbox[0] = min(concat_bbox[0], next_token.bbox[0])
                concat_bbox[1] = min(concat_bbox[1], next_token.bbox[1])
                concat_bbox[2] = max(concat_bbox[2], next_token.bbox[2])
                concat_bbox[3] = max(concat_bbox[3], next_token.bbox[3])

                # Check for a match
                concat_clean = re.sub(r'\s+', '', concat_text)
                if concat_clean == value_clean:
                    context_keywords, context_boost = self._find_context_keywords(
                        tokens, start_token, field_name
                    )
                    matches.append(Match(
                        field=field_name,
                        value=value,
                        bbox=tuple(concat_bbox),
                        page_no=start_token.page_no,
                        score=min(1.0, 0.85 + context_boost),  # Slightly lower base score
                        matched_text=concat_text,
                        context_keywords=context_keywords
                    ))
                    break
        return matches

    def _find_substring_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """
        Find the value as a substring within longer tokens.

        Handles cases like 'Fakturadatum: 2026-01-09' where the date
        is embedded in a longer text string.

        Uses a lower base score (0.75) than an exact match so that exact
        matches are preferred. Only matches if the value appears as a
        distinct segment (not as part of a longer number).
        """
        matches = []
        # Only used for date fields - other fields risk false positives
        if field_name not in ('InvoiceDate', 'InvoiceDueDate'):
            return matches

        for token in tokens:
            token_text = token.text.strip()
            # Skip tokens no longer than the value (exact matches are handled elsewhere)
            if len(token_text) <= len(value):
                continue
            # Check if the value appears as a substring
            if value in token_text:
                # Verify it is a proper boundary match (not part of a larger number)
                idx = token_text.find(value)
                # Check the character before (if any)
                if idx > 0:
                    char_before = token_text[idx - 1]
                    # Must be a non-digit (':', ' ', '-', etc. are allowed)
                    if char_before.isdigit():
                        continue
                # Check the character after (if any)
                end_idx = idx + len(value)
                if end_idx < len(token_text):
                    char_after = token_text[end_idx]
                    # Must be a non-digit
                    if char_after.isdigit():
                        continue

                # Found a valid substring match
                context_keywords, context_boost = self._find_context_keywords(
                    tokens, token, field_name
                )
                # Check if a context keyword is inside the same token (like 'Fakturadatum:')
                token_lower = token_text.lower()
                inline_context = []
                for keyword in CONTEXT_KEYWORDS.get(field_name, []):
                    if keyword in token_lower:
                        inline_context.append(keyword)
                # Boost the score if a keyword is inline
                inline_boost = 0.1 if inline_context else 0
                matches.append(Match(
                    field=field_name,
                    value=value,
                    bbox=token.bbox,  # Use the full token bbox
                    page_no=token.page_no,
                    score=min(1.0, 0.75 + context_boost + inline_boost),  # Lower than exact match
                    matched_text=token_text,
                    context_keywords=context_keywords + inline_context
                ))
        return matches
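        # Illustrative example: token 'Fakturadatum: 2026-01-09' with value
        # '2026-01-09' passes both boundary checks (a space before, nothing
        # after), and the inline 'fakturadatum' keyword raises the base score
        # to 0.75 + 0.1 = 0.85.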
    def _find_fuzzy_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find approximate numeric matches for amounts (date fields are
        handled by the flexible date matcher instead)."""
        matches = []
        for token in tokens:
            token_text = token.text.strip()
            if field_name == 'Amount':
                # Try to parse both as numbers
                try:
                    token_num = self._parse_amount(token_text)
                    value_num = self._parse_amount(value)
                    if token_num is not None and value_num is not None:
                        if abs(token_num - value_num) < 0.01:  # Within 1 cent
                            context_keywords, context_boost = self._find_context_keywords(
                                tokens, token, field_name
                            )
                            matches.append(Match(
                                field=field_name,
                                value=value,
                                bbox=token.bbox,
                                page_no=token.page_no,
                                score=min(1.0, 0.8 + context_boost),
                                matched_text=token_text,
                                context_keywords=context_keywords
                            ))
                except (ValueError, TypeError):
                    pass
        return matches

    def _find_flexible_date_matches(
        self,
        tokens: list[TokenLike],
        normalized_values: list[str],
        field_name: str
    ) -> list[Match]:
        """
        Flexible date matching when exact matching fails.

        Strategies:
        1. Year-month match: if the CSV has 2026-01-15, match any 2026-01-XX date
        2. Nearby date match: match dates within a few days of the CSV value
        3. Heuristic selection: use context keywords to select the best date

        This handles cases where the CSV InvoiceDate does not exactly match
        the PDF, but a reasonable date can still be found to label.
        """
        from datetime import datetime

        matches = []
        # Parse the target date from the normalized values
        target_date = None
        for value in normalized_values:
            # Try to parse the YYYY-MM-DD format
            date_match = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', value)
            if date_match:
                try:
                    target_date = datetime(
                        int(date_match.group(1)),
                        int(date_match.group(2)),
                        int(date_match.group(3))
                    )
                    break
                except ValueError:
                    continue
        if not target_date:
            return matches

        # Find all date-like tokens in the document
        date_candidates = []
        date_pattern = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
        for token in tokens:
            token_text = token.text.strip()
            # Search for the date pattern in the token
            for match in date_pattern.finditer(token_text):
                try:
                    found_date = datetime(
                        int(match.group(1)),
                        int(match.group(2)),
                        int(match.group(3))
                    )
                    date_str = match.group(0)
                    # Calculate the date difference
                    days_diff = abs((found_date - target_date).days)
                    # Check for nearby context keywords
                    context_keywords, context_boost = self._find_context_keywords(
                        tokens, token, field_name
                    )
                    # Check if a keyword is inside the same token
                    token_lower = token_text.lower()
                    inline_keywords = []
                    for keyword in CONTEXT_KEYWORDS.get(field_name, []):
                        if keyword in token_lower:
                            inline_keywords.append(keyword)
                    date_candidates.append({
                        'token': token,
                        'date': found_date,
                        'date_str': date_str,
                        'matched_text': token_text,
                        'days_diff': days_diff,
                        'context_keywords': context_keywords + inline_keywords,
                        'context_boost': context_boost + (0.1 if inline_keywords else 0),
                        'same_year_month': (found_date.year == target_date.year and
                                            found_date.month == target_date.month),
                    })
                except ValueError:
                    continue
        if not date_candidates:
            return matches

        # Score and rank the candidates
        for candidate in date_candidates:
            score = 0.0
            # Strategy 1: Same year-month gets a higher score
            if candidate['same_year_month']:
                score = 0.7
                # Bonus if the day is close
                if candidate['days_diff'] <= 7:
                    score = 0.75
                if candidate['days_diff'] <= 3:
                    score = 0.8
            # Strategy 2: Nearby dates (within 14 days)
            elif candidate['days_diff'] <= 14:
                score = 0.6
            elif candidate['days_diff'] <= 30:
                score = 0.55
            else:
                # Too far apart; skip unless there is strong context
                if not candidate['context_keywords']:
                    continue
                score = 0.5
            # Strategy 3: Boost with context keywords (invoice-date keywords
            # for InvoiceDate, due-date keywords for InvoiceDueDate)
            score = min(1.0, score + candidate['context_boost'])
            if candidate['context_keywords']:
                score = min(1.0, score + 0.05)
            if score >= self.min_score_threshold:
                matches.append(Match(
                    field=field_name,
                    value=candidate['date_str'],
                    bbox=candidate['token'].bbox,
                    page_no=candidate['token'].page_no,
                    score=score,
                    matched_text=candidate['matched_text'],
                    context_keywords=candidate['context_keywords']
                ))

        # Sort by score and return only the best match, to avoid
        # emitting multiple labels for the same field
        matches.sort(key=lambda m: m.score, reverse=True)
        return matches[:1]
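        # Illustrative example: with a CSV target of 2026-01-15, a PDF token
        # '2026-01-12' shares the year-month and is 3 days off, so it starts
        # at 0.8; keyword boosts can add up to 0.3 more (capped at 1.0), and
        # only the single best-scoring date is returned.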
    def _find_context_keywords(
        self,
        tokens: list[TokenLike],
        target_token: TokenLike,
        field_name: str
    ) -> tuple[list[str], float]:
        """Find context keywords near the target token."""
        keywords = CONTEXT_KEYWORDS.get(field_name, [])
        found_keywords = []
        target_center = (
            (target_token.bbox[0] + target_token.bbox[2]) / 2,
            (target_token.bbox[1] + target_token.bbox[3]) / 2
        )
        for token in tokens:
            if token is target_token:
                continue
            token_center = (
                (token.bbox[0] + token.bbox[2]) / 2,
                (token.bbox[1] + token.bbox[3]) / 2
            )
            # Euclidean distance between the token centers
            distance = (
                (target_center[0] - token_center[0]) ** 2 +
                (target_center[1] - token_center[1]) ** 2
            ) ** 0.5
            if distance <= self.context_radius:
                token_lower = token.text.lower()
                for keyword in keywords:
                    if keyword in token_lower:
                        found_keywords.append(keyword)
        # The boost grows with the number of keywords found, capped at 0.15
        boost = min(0.15, len(found_keywords) * 0.05)
        return found_keywords, boost

    def _tokens_on_same_line(self, token1: TokenLike, token2: TokenLike) -> bool:
        """Check if two tokens are on the same line."""
        # Require vertical overlap of more than half the smaller token's height
        y_overlap = min(token1.bbox[3], token2.bbox[3]) - max(token1.bbox[1], token2.bbox[1])
        min_height = min(token1.bbox[3] - token1.bbox[1], token2.bbox[3] - token2.bbox[1])
        return y_overlap > min_height * 0.5

    def _parse_amount(self, text: str) -> float | None:
        """Try to parse text as a monetary amount."""
        # Strip currency markers ('SEK', 'kr') and the Swedish ':-' notation
        text = re.sub(r'SEK|kr|[:-]', '', text, flags=re.IGNORECASE)
        text = text.replace(' ', '').replace('\xa0', '')
        # Treat a lone comma as the decimal separator (Swedish convention)
        if ',' in text and '.' not in text:
            text = text.replace(',', '.')
        try:
            return float(text)
        except ValueError:
            return None
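        # Illustrative example: '1 234,56 kr' -> strip 'kr' -> '1 234,56'
        # -> drop spaces -> '1234,56' -> comma to dot -> 1234.56.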
    def _deduplicate_matches(self, matches: list[Match]) -> list[Match]:
        """Remove duplicate matches based on bbox overlap."""
        if not matches:
            return []
        # Sort by score descending so the best match wins
        matches.sort(key=lambda m: m.score, reverse=True)
        unique = []
        for match in matches:
            is_duplicate = False
            for existing in unique:
                if self._bbox_overlap(match.bbox, existing.bbox) > 0.7:
                    is_duplicate = True
                    break
            if not is_duplicate:
                unique.append(match)
        return unique

    def _bbox_overlap(
        self,
        bbox1: tuple[float, float, float, float],
        bbox2: tuple[float, float, float, float]
    ) -> float:
        """Calculate IoU (Intersection over Union) of two bounding boxes."""
        x1 = max(bbox1[0], bbox2[0])
        y1 = max(bbox1[1], bbox2[1])
        x2 = min(bbox1[2], bbox2[2])
        y2 = min(bbox1[3], bbox2[3])
        if x2 <= x1 or y2 <= y1:
            return 0.0
        intersection = (x2 - x1) * (y2 - y1)
        area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
        union = area1 + area2 - intersection
        return intersection / union if union > 0 else 0.0
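        # Illustrative example: bbox1 = (0, 0, 10, 10) and bbox2 = (5, 5, 15, 15)
        # intersect in a 5x5 square, so IoU = 25 / (100 + 100 - 25) ~= 0.143,
        # well below the 0.7 deduplication threshold used above.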
def find_field_matches(
    tokens: list[TokenLike],
    field_values: dict[str, str],
    page_no: int = 0
) -> dict[str, list[Match]]:
    """
    Convenience function to find matches for multiple fields.

    Args:
        tokens: List of tokens from the document
        field_values: Dict of field_name -> value to search for
        page_no: Page number

    Returns:
        Dict of field_name -> list of matches
    """
    from ..normalize import normalize_field

    matcher = FieldMatcher()
    results = {}
    for field_name, value in field_values.items():
        if value is None or str(value).strip() == '':
            continue
        normalized_values = normalize_field(field_name, str(value))
        matches = matcher.find_matches(tokens, field_name, normalized_values, page_no)
        results[field_name] = matches
    return results
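
A minimal end-to-end sketch of the module (illustrative only: the Token class, coordinates, page size, and class_id below are made up, and it assumes the sibling normalize module from this commit is importable):

from dataclasses import dataclass
from matcher import find_field_matches

@dataclass
class Token:  # any object with these attributes satisfies TokenLike
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int = 0

tokens = [
    Token('Fakturanr:', (40.0, 30.0, 110.0, 45.0)),
    Token('10023', (120.0, 30.0, 160.0, 45.0)),
]
results = find_field_matches(tokens, {'InvoiceNumber': '10023'})
for field, field_matches in results.items():
    if field_matches:  # best match first; one YOLO line per field
        print(field_matches[0].to_yolo_format(image_width=595, image_height=842, class_id=0))

Here the exact token '10023' scores 1.0, and the nearby 'Fakturanr:' token adds a context-keyword boost before the match is converted to a YOLO annotation line.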