Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
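A minimal usage sketch of the matcher added in this commit (the Token dataclass and the coordinates below are illustrative only, not part of the commit):

from dataclasses import dataclass
from src.matcher import FieldMatcher

@dataclass
class Token:  # any object with these attributes satisfies the TokenLike protocol
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int

tokens = [
    Token('Fakturanr', (40.0, 90.0, 110.0, 105.0), 0),   # context keyword token
    Token('12345', (120.0, 90.0, 165.0, 105.0), 0),      # the value to label
]

matcher = FieldMatcher(min_score_threshold=0.5)
matches = matcher.find_matches(tokens, 'InvoiceNumber', ['12345'])
for m in matches:
    # print the confidence, the matched text, and the YOLO label line
    print(m.score, m.matched_text, m.to_yolo_format(image_width=595, image_height=842, class_id=0))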
src/matcher/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .field_matcher import FieldMatcher, Match, find_field_matches

__all__ = ['FieldMatcher', 'Match', 'find_field_matches']
src/matcher/field_matcher.py (new file, 618 lines)
@@ -0,0 +1,618 @@
"""
Field Matching Module

Matches normalized field values to tokens extracted from documents.
"""

from dataclasses import dataclass
from typing import Protocol
import re


class TokenLike(Protocol):
    """Protocol for token objects."""
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int


@dataclass
class Match:
    """Represents a matched field in the document."""
    field: str
    value: str
    bbox: tuple[float, float, float, float]  # (x0, y0, x1, y1)
    page_no: int
    score: float  # 0-1 confidence score
    matched_text: str  # Actual text that matched
    context_keywords: list[str]  # Nearby keywords that boosted confidence

    def to_yolo_format(self, image_width: float, image_height: float, class_id: int) -> str:
        """Convert to YOLO annotation format."""
        x0, y0, x1, y1 = self.bbox

        x_center = (x0 + x1) / 2 / image_width
        y_center = (y0 + y1) / 2 / image_height
        width = (x1 - x0) / image_width
        height = (y1 - y0) / image_height

        return f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"


# Context keywords for each field type (Swedish invoice terms)
CONTEXT_KEYWORDS = {
    'InvoiceNumber': ['fakturanr', 'fakturanummer', 'invoice', 'inv.nr', 'inv nr', 'nr'],
    'InvoiceDate': ['fakturadatum', 'datum', 'date', 'utfärdad', 'utskriftsdatum', 'dokumentdatum'],
    'InvoiceDueDate': ['förfallodatum', 'förfaller', 'due date', 'betalas senast', 'att betala senast',
                       'förfallodag', 'oss tillhanda senast', 'senast'],
    'OCR': ['ocr', 'referens', 'betalningsreferens', 'ref'],
    'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
    'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
    'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
}


class FieldMatcher:
    """Matches field values to document tokens."""

    def __init__(
        self,
        context_radius: float = 100.0,  # pixels
        min_score_threshold: float = 0.5
    ):
        """
        Initialize the matcher.

        Args:
            context_radius: Distance to search for context keywords
            min_score_threshold: Minimum score to consider a match valid
        """
        self.context_radius = context_radius
        self.min_score_threshold = min_score_threshold

    def find_matches(
        self,
        tokens: list[TokenLike],
        field_name: str,
        normalized_values: list[str],
        page_no: int = 0
    ) -> list[Match]:
        """
        Find all matches for a field in the token list.

        Args:
            tokens: List of tokens from the document
            field_name: Name of the field to match
            normalized_values: List of normalized value variants to search for
            page_no: Page number to filter tokens

        Returns:
            List of Match objects sorted by score (descending)
        """
        matches = []
        page_tokens = [t for t in tokens if t.page_no == page_no]

        for value in normalized_values:
            # Strategy 1: Exact token match
            exact_matches = self._find_exact_matches(page_tokens, value, field_name)
            matches.extend(exact_matches)

            # Strategy 2: Multi-token concatenation
            concat_matches = self._find_concatenated_matches(page_tokens, value, field_name)
            matches.extend(concat_matches)

            # Strategy 3: Fuzzy match (for amounts and dates only)
            if field_name in ('Amount', 'InvoiceDate', 'InvoiceDueDate'):
                fuzzy_matches = self._find_fuzzy_matches(page_tokens, value, field_name)
                matches.extend(fuzzy_matches)

            # Strategy 4: Substring match (for dates embedded in longer text)
            if field_name in ('InvoiceDate', 'InvoiceDueDate'):
                substring_matches = self._find_substring_matches(page_tokens, value, field_name)
                matches.extend(substring_matches)

        # Strategy 5: Flexible date matching (year-month match, nearby dates, heuristic selection)
        # Only if no exact matches found for date fields
        if field_name in ('InvoiceDate', 'InvoiceDueDate') and not matches:
            flexible_matches = self._find_flexible_date_matches(
                page_tokens, normalized_values, field_name
            )
            matches.extend(flexible_matches)

        # Deduplicate and sort by score
        matches = self._deduplicate_matches(matches)
        matches.sort(key=lambda m: m.score, reverse=True)

        return [m for m in matches if m.score >= self.min_score_threshold]

    def _find_exact_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find tokens that exactly match the value."""
        matches = []

        for token in tokens:
            token_text = token.text.strip()

            # Exact match
            if token_text == value:
                score = 1.0
            # Case-insensitive match
            elif token_text.lower() == value.lower():
                score = 0.95
            # Digits-only match for numeric fields
            elif field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro'):
                token_digits = re.sub(r'\D', '', token_text)
                value_digits = re.sub(r'\D', '', value)
                if token_digits and token_digits == value_digits:
                    score = 0.9
                else:
                    continue
            else:
                continue

            # Boost score if context keywords are nearby
            context_keywords, context_boost = self._find_context_keywords(
                tokens, token, field_name
            )
            score = min(1.0, score + context_boost)

            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=score,
                matched_text=token_text,
                context_keywords=context_keywords
            ))

        return matches

    def _find_concatenated_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find value by concatenating adjacent tokens."""
        matches = []
        value_clean = re.sub(r'\s+', '', value)

        # Sort tokens by position (top-to-bottom, left-to-right)
        sorted_tokens = sorted(tokens, key=lambda t: (t.bbox[1], t.bbox[0]))

        for i, start_token in enumerate(sorted_tokens):
            # Try to build the value by concatenating nearby tokens
            concat_text = start_token.text.strip()
            concat_bbox = list(start_token.bbox)
            used_tokens = [start_token]

            for j in range(i + 1, min(i + 5, len(sorted_tokens))):  # Max 5 tokens
                next_token = sorted_tokens[j]

                # Check if tokens are on the same line (y overlap)
                if not self._tokens_on_same_line(start_token, next_token):
                    break

                # Check horizontal proximity
                if next_token.bbox[0] - concat_bbox[2] > 50:  # Max 50px gap
                    break

                concat_text += next_token.text.strip()
                used_tokens.append(next_token)

                # Update bounding box
                concat_bbox[0] = min(concat_bbox[0], next_token.bbox[0])
                concat_bbox[1] = min(concat_bbox[1], next_token.bbox[1])
                concat_bbox[2] = max(concat_bbox[2], next_token.bbox[2])
                concat_bbox[3] = max(concat_bbox[3], next_token.bbox[3])

                # Check for match
                concat_clean = re.sub(r'\s+', '', concat_text)
                if concat_clean == value_clean:
                    context_keywords, context_boost = self._find_context_keywords(
                        tokens, start_token, field_name
                    )

                    matches.append(Match(
                        field=field_name,
                        value=value,
                        bbox=tuple(concat_bbox),
                        page_no=start_token.page_no,
                        score=min(1.0, 0.85 + context_boost),  # Slightly lower base score
                        matched_text=concat_text,
                        context_keywords=context_keywords
                    ))
                    break

        return matches

    def _find_substring_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """
        Find value as a substring within longer tokens.

        Handles cases like 'Fakturadatum: 2026-01-09' where the date
        is embedded in a longer text string.

        Uses lower score (0.75) than exact match to prefer exact matches.
        Only matches if the value appears as a distinct segment (not part of a number).
        """
        matches = []

        # Only use for date fields - other fields risk false positives
        if field_name not in ('InvoiceDate', 'InvoiceDueDate'):
            return matches

        for token in tokens:
            token_text = token.text.strip()

            # Skip if token is the same length as value (would be exact match)
            if len(token_text) <= len(value):
                continue

            # Check if value appears as substring
            if value in token_text:
                # Verify it's a proper boundary match (not part of a larger number)
                idx = token_text.find(value)

                # Check character before (if exists)
                if idx > 0:
                    char_before = token_text[idx - 1]
                    # Must be non-digit (allow : space - etc)
                    if char_before.isdigit():
                        continue

                # Check character after (if exists)
                end_idx = idx + len(value)
                if end_idx < len(token_text):
                    char_after = token_text[end_idx]
                    # Must be non-digit
                    if char_after.isdigit():
                        continue

                # Found valid substring match
                context_keywords, context_boost = self._find_context_keywords(
                    tokens, token, field_name
                )

                # Check if context keyword is in the same token (like "Fakturadatum:")
                token_lower = token_text.lower()
                inline_context = []
                for keyword in CONTEXT_KEYWORDS.get(field_name, []):
                    if keyword in token_lower:
                        inline_context.append(keyword)

                # Boost score if keyword is inline
                inline_boost = 0.1 if inline_context else 0

                matches.append(Match(
                    field=field_name,
                    value=value,
                    bbox=token.bbox,  # Use full token bbox
                    page_no=token.page_no,
                    score=min(1.0, 0.75 + context_boost + inline_boost),  # Lower than exact match
                    matched_text=token_text,
                    context_keywords=context_keywords + inline_context
                ))

        return matches

    def _find_fuzzy_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str
    ) -> list[Match]:
        """Find approximate matches for amounts and dates (currently only amounts use numeric tolerance)."""
        matches = []

        for token in tokens:
            token_text = token.text.strip()

            if field_name == 'Amount':
                # Try to parse both as numbers
                try:
                    token_num = self._parse_amount(token_text)
                    value_num = self._parse_amount(value)

                    if token_num is not None and value_num is not None:
                        if abs(token_num - value_num) < 0.01:  # Within 1 cent
                            context_keywords, context_boost = self._find_context_keywords(
                                tokens, token, field_name
                            )

                            matches.append(Match(
                                field=field_name,
                                value=value,
                                bbox=token.bbox,
                                page_no=token.page_no,
                                score=min(1.0, 0.8 + context_boost),
                                matched_text=token_text,
                                context_keywords=context_keywords
                            ))
                except (ValueError, TypeError):
                    pass

        return matches

    def _find_flexible_date_matches(
        self,
        tokens: list[TokenLike],
        normalized_values: list[str],
        field_name: str
    ) -> list[Match]:
        """
        Flexible date matching when exact match fails.

        Strategies:
        1. Year-month match: If CSV has 2026-01-15, match any 2026-01-XX date
        2. Nearby date match: Match dates within 7 days of CSV value
        3. Heuristic selection: Use context keywords to select the best date

        This handles cases where CSV InvoiceDate doesn't exactly match PDF,
        but we can still find a reasonable date to label.
        """
        from datetime import datetime, timedelta

        matches = []

        # Parse the target date from normalized values
        target_date = None
        for value in normalized_values:
            # Try to parse YYYY-MM-DD format
            date_match = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', value)
            if date_match:
                try:
                    target_date = datetime(
                        int(date_match.group(1)),
                        int(date_match.group(2)),
                        int(date_match.group(3))
                    )
                    break
                except ValueError:
                    continue

        if not target_date:
            return matches

        # Find all date-like tokens in the document
        date_candidates = []
        date_pattern = re.compile(r'(\d{4})-(\d{2})-(\d{2})')

        for token in tokens:
            token_text = token.text.strip()

            # Search for date pattern in token
            for match in date_pattern.finditer(token_text):
                try:
                    found_date = datetime(
                        int(match.group(1)),
                        int(match.group(2)),
                        int(match.group(3))
                    )
                    date_str = match.group(0)

                    # Calculate date difference
                    days_diff = abs((found_date - target_date).days)

                    # Check for context keywords
                    context_keywords, context_boost = self._find_context_keywords(
                        tokens, token, field_name
                    )

                    # Check if keyword is in the same token
                    token_lower = token_text.lower()
                    inline_keywords = []
                    for keyword in CONTEXT_KEYWORDS.get(field_name, []):
                        if keyword in token_lower:
                            inline_keywords.append(keyword)

                    date_candidates.append({
                        'token': token,
                        'date': found_date,
                        'date_str': date_str,
                        'matched_text': token_text,
                        'days_diff': days_diff,
                        'context_keywords': context_keywords + inline_keywords,
                        'context_boost': context_boost + (0.1 if inline_keywords else 0),
                        'same_year_month': (found_date.year == target_date.year and
                                            found_date.month == target_date.month),
                    })
                except ValueError:
                    continue

        if not date_candidates:
            return matches

        # Score and rank candidates
        for candidate in date_candidates:
            score = 0.0

            # Strategy 1: Same year-month gets higher score
            if candidate['same_year_month']:
                score = 0.7
                # Bonus if day is close
                if candidate['days_diff'] <= 7:
                    score = 0.75
                if candidate['days_diff'] <= 3:
                    score = 0.8
            # Strategy 2: Nearby dates (within 14 days)
            elif candidate['days_diff'] <= 14:
                score = 0.6
            elif candidate['days_diff'] <= 30:
                score = 0.55
            else:
                # Too far apart, skip unless has strong context
                if not candidate['context_keywords']:
                    continue
                score = 0.5

            # Strategy 3: Boost with context keywords
            score = min(1.0, score + candidate['context_boost'])

            # For InvoiceDate, prefer dates that appear near invoice-related keywords
            # For InvoiceDueDate, prefer dates near due-date keywords
            if candidate['context_keywords']:
                score = min(1.0, score + 0.05)

            if score >= self.min_score_threshold:
                matches.append(Match(
                    field=field_name,
                    value=candidate['date_str'],
                    bbox=candidate['token'].bbox,
                    page_no=candidate['token'].page_no,
                    score=score,
                    matched_text=candidate['matched_text'],
                    context_keywords=candidate['context_keywords']
                ))

        # Sort by score and return best matches
        matches.sort(key=lambda m: m.score, reverse=True)

        # Only return the best match to avoid multiple labels for same field
        return matches[:1] if matches else []

    def _find_context_keywords(
        self,
        tokens: list[TokenLike],
        target_token: TokenLike,
        field_name: str
    ) -> tuple[list[str], float]:
        """Find context keywords near the target token."""
        keywords = CONTEXT_KEYWORDS.get(field_name, [])
        found_keywords = []

        target_center = (
            (target_token.bbox[0] + target_token.bbox[2]) / 2,
            (target_token.bbox[1] + target_token.bbox[3]) / 2
        )

        for token in tokens:
            if token is target_token:
                continue

            token_center = (
                (token.bbox[0] + token.bbox[2]) / 2,
                (token.bbox[1] + token.bbox[3]) / 2
            )

            # Calculate distance
            distance = (
                (target_center[0] - token_center[0]) ** 2 +
                (target_center[1] - token_center[1]) ** 2
            ) ** 0.5

            if distance <= self.context_radius:
                token_lower = token.text.lower()
                for keyword in keywords:
                    if keyword in token_lower:
                        found_keywords.append(keyword)

        # Calculate boost based on keywords found
        boost = min(0.15, len(found_keywords) * 0.05)
        return found_keywords, boost

    def _tokens_on_same_line(self, token1: TokenLike, token2: TokenLike) -> bool:
        """Check if two tokens are on the same line."""
        # Check vertical overlap
        y_overlap = min(token1.bbox[3], token2.bbox[3]) - max(token1.bbox[1], token2.bbox[1])
        min_height = min(token1.bbox[3] - token1.bbox[1], token2.bbox[3] - token2.bbox[1])
        return y_overlap > min_height * 0.5

    def _parse_amount(self, text: str) -> float | None:
        """Try to parse text as a monetary amount."""
        # Remove currency markers and spaces (alternation, not a character class)
        text = re.sub(r'SEK|kr|:-', '', text, flags=re.IGNORECASE)
        text = text.replace(' ', '').replace('\xa0', '')

        # Try comma as decimal separator
        if ',' in text and '.' not in text:
            text = text.replace(',', '.')

        try:
            return float(text)
        except ValueError:
            return None

    def _deduplicate_matches(self, matches: list[Match]) -> list[Match]:
        """Remove duplicate matches based on bbox overlap."""
        if not matches:
            return []

        # Sort by score descending
        matches.sort(key=lambda m: m.score, reverse=True)
        unique = []

        for match in matches:
            is_duplicate = False
            for existing in unique:
                if self._bbox_overlap(match.bbox, existing.bbox) > 0.7:
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique.append(match)

        return unique

    def _bbox_overlap(
        self,
        bbox1: tuple[float, float, float, float],
        bbox2: tuple[float, float, float, float]
    ) -> float:
        """Calculate IoU (Intersection over Union) of two bounding boxes."""
        x1 = max(bbox1[0], bbox2[0])
        y1 = max(bbox1[1], bbox2[1])
        x2 = min(bbox1[2], bbox2[2])
        y2 = min(bbox1[3], bbox2[3])

        if x2 <= x1 or y2 <= y1:
            return 0.0

        intersection = (x2 - x1) * (y2 - y1)
        area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0


def find_field_matches(
    tokens: list[TokenLike],
    field_values: dict[str, str],
    page_no: int = 0
) -> dict[str, list[Match]]:
    """
    Convenience function to find matches for multiple fields.

    Args:
        tokens: List of tokens from the document
        field_values: Dict of field_name -> value to search for
        page_no: Page number

    Returns:
        Dict of field_name -> list of matches
    """
    from ..normalize import normalize_field

    matcher = FieldMatcher()
    results = {}

    for field_name, value in field_values.items():
        if value is None or str(value).strip() == '':
            continue

        normalized_values = normalize_field(field_name, str(value))
        matches = matcher.find_matches(tokens, field_name, normalized_values, page_no)
        results[field_name] = matches

    return results
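For reference, a short worked example of the Match.to_yolo_format conversion defined above, assuming a hypothetical 595 x 842 px page render and class id 0 (values chosen for illustration only):

m = Match(
    field='Amount', value='1234.50',
    bbox=(100.0, 200.0, 300.0, 240.0), page_no=0,
    score=0.9, matched_text='1 234,50 kr', context_keywords=['att betala'],
)
# x_center = (100 + 300) / 2 / 595 ≈ 0.336134, y_center = (200 + 240) / 2 / 842 ≈ 0.261283
# width = 200 / 595 ≈ 0.336134, height = 40 / 842 ≈ 0.047506
print(m.to_yolo_format(image_width=595, image_height=842, class_id=0))
# -> "0 0.336134 0.261283 0.336134 0.047506"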