WIP
This commit is contained in:
@@ -670,5 +670,387 @@ class TestProcessPdfTokenPath:
|
||||
assert call_args[3] == 100 # image height
|
||||
|
||||
|
||||
class TestDpiPassthrough:
|
||||
"""Tests for DPI being passed from pipeline to FieldExtractor (Bug 1)."""
|
||||
|
||||
def test_field_extractor_receives_pipeline_dpi(self):
|
||||
"""FieldExtractor should receive the pipeline's DPI, not default to 300."""
|
||||
with patch('backend.pipeline.pipeline.YOLODetector'):
|
||||
with patch('backend.pipeline.pipeline.FieldExtractor') as mock_fe_cls:
|
||||
InferencePipeline(
|
||||
model_path='/fake/model.pt',
|
||||
dpi=150,
|
||||
use_gpu=False,
|
||||
)
|
||||
mock_fe_cls.assert_called_once_with(
|
||||
ocr_lang='en', use_gpu=False, dpi=150
|
||||
)
|
||||
|
||||
def test_field_extractor_receives_default_dpi(self):
|
||||
"""When dpi=300 (default), FieldExtractor should also get 300."""
|
||||
with patch('backend.pipeline.pipeline.YOLODetector'):
|
||||
with patch('backend.pipeline.pipeline.FieldExtractor') as mock_fe_cls:
|
||||
InferencePipeline(
|
||||
model_path='/fake/model.pt',
|
||||
dpi=300,
|
||||
use_gpu=False,
|
||||
)
|
||||
mock_fe_cls.assert_called_once_with(
|
||||
ocr_lang='en', use_gpu=False, dpi=300
|
||||
)
|
||||
|
||||
|
||||
class TestFallbackPatternExtraction:
|
||||
"""Tests for _extract_with_patterns fallback regex (Bugs 2, 3)."""
|
||||
|
||||
def _make_pipeline_with_patterns(self):
|
||||
"""Create pipeline with mocked internals for pattern testing."""
|
||||
with patch.object(InferencePipeline, '__init__', lambda self, **kw: None):
|
||||
p = InferencePipeline()
|
||||
p.dpi = 150
|
||||
p.enable_fallback = True
|
||||
return p
|
||||
|
||||
def test_bankgiro_no_match_in_org_number(self):
|
||||
"""Bankgiro regex must NOT match digits embedded in an org number."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Org.nr 802546-1610", result)
|
||||
assert 'Bankgiro' not in result.fields
|
||||
|
||||
def test_bankgiro_matches_labeled(self):
|
||||
"""Bankgiro regex should match when preceded by 'Bankgiro' label."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Bankgiro 5393-9484", result)
|
||||
assert result.fields.get('Bankgiro') == '5393-9484'
|
||||
|
||||
def test_bankgiro_matches_standalone(self):
|
||||
"""Bankgiro regex should match a standalone 4-4 digit pattern."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Betala till 5393-9484 senast", result)
|
||||
assert result.fields.get('Bankgiro') == '5393-9484'
|
||||
|
||||
def test_amount_rejects_bare_integer(self):
|
||||
"""Amount regex must NOT match bare integers like 'Summa 1'."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Summa 1 Medlemsavgift", result)
|
||||
assert 'Amount' not in result.fields
|
||||
|
||||
def test_amount_requires_decimal(self):
|
||||
"""Amount regex should require a decimal separator."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Total 5 items", result)
|
||||
assert 'Amount' not in result.fields
|
||||
|
||||
def test_amount_with_decimal_works(self):
|
||||
"""Amount regex should match Swedish decimal amounts."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Att betala 1 234,56 SEK", result)
|
||||
assert 'Amount' in result.fields
|
||||
assert float(result.fields['Amount']) == pytest.approx(1234.56, abs=0.01)
|
||||
|
||||
def test_amount_with_sek_suffix(self):
|
||||
"""Amount regex should match amounts ending with SEK."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("7 500,00 SEK", result)
|
||||
assert 'Amount' in result.fields
|
||||
assert float(result.fields['Amount']) == pytest.approx(7500.00, abs=0.01)
|
||||
|
||||
def test_fallback_extracts_invoice_date(self):
|
||||
"""Fallback should extract InvoiceDate from Swedish text."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Fakturadatum 2025-01-15 Referens ABC", result)
|
||||
assert result.fields.get('InvoiceDate') == '2025-01-15'
|
||||
|
||||
def test_fallback_extracts_due_date(self):
|
||||
"""Fallback should extract InvoiceDueDate from Swedish text."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Forfallodag 2025-02-15 Belopp", result)
|
||||
assert result.fields.get('InvoiceDueDate') == '2025-02-15'
|
||||
|
||||
def test_fallback_extracts_supplier_org(self):
|
||||
"""Fallback should extract supplier_organisation_number."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Org.nr 556123-4567 Stockholm", result)
|
||||
assert result.fields.get('supplier_organisation_number') == '556123-4567'
|
||||
|
||||
def test_fallback_extracts_plusgiro(self):
|
||||
"""Fallback should extract Plusgiro number."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Plusgiro 12 34 56-7 betalning", result)
|
||||
assert 'Plusgiro' in result.fields
|
||||
|
||||
def test_fallback_skips_year_as_invoice_number(self):
|
||||
"""Fallback should NOT extract year-like value as InvoiceNumber."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Fakturanr 2025 Datum 2025-01-15", result)
|
||||
assert 'InvoiceNumber' not in result.fields
|
||||
|
||||
def test_fallback_accepts_valid_invoice_number(self):
|
||||
"""Fallback should extract valid non-year InvoiceNumber."""
|
||||
p = self._make_pipeline_with_patterns()
|
||||
result = InferenceResult()
|
||||
p._extract_with_patterns("Fakturanr 12345 Summa", result)
|
||||
assert result.fields.get('InvoiceNumber') == '12345'
|
||||
|
||||
|
||||
class TestDateValidation:
|
||||
"""Tests for InvoiceDueDate < InvoiceDate validation (Bug 6)."""
|
||||
|
||||
def _make_pipeline_for_merge(self):
|
||||
"""Create pipeline with mocked internals for merge testing."""
|
||||
with patch.object(InferencePipeline, '__init__', lambda self, **kw: None):
|
||||
p = InferencePipeline()
|
||||
p.payment_line_parser = MagicMock()
|
||||
p.payment_line_parser.parse.return_value = MagicMock(is_valid=False)
|
||||
return p
|
||||
|
||||
def test_due_date_before_invoice_date_dropped(self):
|
||||
"""DueDate earlier than InvoiceDate should be removed."""
|
||||
from backend.pipeline.field_extractor import ExtractedField
|
||||
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
ExtractedField(
|
||||
field_name='InvoiceDate', raw_text='2026-01-16',
|
||||
normalized_value='2026-01-16', confidence=0.9,
|
||||
detection_confidence=0.9, ocr_confidence=1.0,
|
||||
bbox=(0, 0, 100, 50), page_no=0,
|
||||
),
|
||||
ExtractedField(
|
||||
field_name='InvoiceDueDate', raw_text='2025-12-01',
|
||||
normalized_value='2025-12-01', confidence=0.9,
|
||||
detection_confidence=0.9, ocr_confidence=1.0,
|
||||
bbox=(0, 60, 100, 110), page_no=0,
|
||||
),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert 'InvoiceDate' in result.fields
|
||||
assert 'InvoiceDueDate' not in result.fields
|
||||
|
||||
def test_valid_dates_preserved(self):
|
||||
"""Both dates kept when DueDate >= InvoiceDate."""
|
||||
from backend.pipeline.field_extractor import ExtractedField
|
||||
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
ExtractedField(
|
||||
field_name='InvoiceDate', raw_text='2026-01-16',
|
||||
normalized_value='2026-01-16', confidence=0.9,
|
||||
detection_confidence=0.9, ocr_confidence=1.0,
|
||||
bbox=(0, 0, 100, 50), page_no=0,
|
||||
),
|
||||
ExtractedField(
|
||||
field_name='InvoiceDueDate', raw_text='2026-02-15',
|
||||
normalized_value='2026-02-15', confidence=0.9,
|
||||
detection_confidence=0.9, ocr_confidence=1.0,
|
||||
bbox=(0, 60, 100, 110), page_no=0,
|
||||
),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert result.fields['InvoiceDate'] == '2026-01-16'
|
||||
assert result.fields['InvoiceDueDate'] == '2026-02-15'
|
||||
|
||||
def test_same_dates_preserved(self):
|
||||
"""Same InvoiceDate and DueDate should both be kept."""
|
||||
from backend.pipeline.field_extractor import ExtractedField
|
||||
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
ExtractedField(
|
||||
field_name='InvoiceDate', raw_text='2026-01-16',
|
||||
normalized_value='2026-01-16', confidence=0.9,
|
||||
detection_confidence=0.9, ocr_confidence=1.0,
|
||||
bbox=(0, 0, 100, 50), page_no=0,
|
||||
),
|
||||
ExtractedField(
|
||||
field_name='InvoiceDueDate', raw_text='2026-01-16',
|
||||
normalized_value='2026-01-16', confidence=0.9,
|
||||
detection_confidence=0.9, ocr_confidence=1.0,
|
||||
bbox=(0, 60, 100, 110), page_no=0,
|
||||
),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert result.fields['InvoiceDate'] == '2026-01-16'
|
||||
assert result.fields['InvoiceDueDate'] == '2026-01-16'
|
||||
|
||||
|
||||
class TestCrossFieldDedup:
|
||||
"""Tests for cross-field deduplication of InvoiceNumber vs OCR/Bankgiro."""
|
||||
|
||||
def _make_pipeline_for_merge(self):
|
||||
"""Create pipeline with mocked internals for merge testing."""
|
||||
with patch.object(InferencePipeline, '__init__', lambda self, **kw: None):
|
||||
p = InferencePipeline()
|
||||
p.payment_line_parser = MagicMock()
|
||||
p.payment_line_parser.parse.return_value = MagicMock(is_valid=False)
|
||||
return p
|
||||
|
||||
def _make_extracted_field(self, field_name, raw_text, normalized, confidence=0.9):
|
||||
from backend.pipeline.field_extractor import ExtractedField
|
||||
return ExtractedField(
|
||||
field_name=field_name,
|
||||
raw_text=raw_text,
|
||||
normalized_value=normalized,
|
||||
confidence=confidence,
|
||||
detection_confidence=confidence,
|
||||
ocr_confidence=1.0,
|
||||
bbox=(0, 0, 100, 50),
|
||||
page_no=0,
|
||||
)
|
||||
|
||||
def test_invoice_number_not_same_as_ocr(self):
|
||||
"""When InvoiceNumber == OCR, InvoiceNumber should be dropped."""
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
self._make_extracted_field('InvoiceNumber', '9179845608', '9179845608'),
|
||||
self._make_extracted_field('OCR', '9179845608', '9179845608'),
|
||||
self._make_extracted_field('Amount', '1234,56', '1234.56'),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert 'OCR' in result.fields
|
||||
assert result.fields['OCR'] == '9179845608'
|
||||
assert 'InvoiceNumber' not in result.fields
|
||||
|
||||
def test_invoice_number_not_same_as_bankgiro_digits(self):
|
||||
"""When InvoiceNumber digits == Bankgiro digits, InvoiceNumber should be dropped."""
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
self._make_extracted_field('InvoiceNumber', '53939484', '53939484'),
|
||||
self._make_extracted_field('Bankgiro', '5393-9484', '5393-9484'),
|
||||
self._make_extracted_field('Amount', '500,00', '500.00'),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert 'Bankgiro' in result.fields
|
||||
assert result.fields['Bankgiro'] == '5393-9484'
|
||||
assert 'InvoiceNumber' not in result.fields
|
||||
|
||||
def test_unrelated_values_kept(self):
|
||||
"""When InvoiceNumber, OCR, and Bankgiro are all different, keep all."""
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
self._make_extracted_field('InvoiceNumber', '19061', '19061'),
|
||||
self._make_extracted_field('OCR', '9179845608', '9179845608'),
|
||||
self._make_extracted_field('Bankgiro', '5393-9484', '5393-9484'),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert result.fields['InvoiceNumber'] == '19061'
|
||||
assert result.fields['OCR'] == '9179845608'
|
||||
assert result.fields['Bankgiro'] == '5393-9484'
|
||||
|
||||
def test_dedup_after_fallback_re_add(self):
|
||||
"""Dedup should remove InvoiceNumber re-added by fallback if it matches OCR."""
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
# Simulate state after fallback re-adds InvoiceNumber = OCR
|
||||
result.fields = {
|
||||
'OCR': '758200602426',
|
||||
'Amount': '164.00',
|
||||
'InvoiceNumber': '758200602426', # re-added by fallback
|
||||
}
|
||||
result.confidence = {
|
||||
'OCR': 0.9,
|
||||
'Amount': 0.9,
|
||||
'InvoiceNumber': 0.5, # fallback confidence
|
||||
}
|
||||
result.bboxes = {}
|
||||
p._dedup_invoice_number(result)
|
||||
assert 'InvoiceNumber' not in result.fields
|
||||
assert 'OCR' in result.fields
|
||||
|
||||
def test_invoice_number_substring_of_bankgiro(self):
|
||||
"""When InvoiceNumber digits are a substring of Bankgiro digits, drop InvoiceNumber."""
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
self._make_extracted_field('InvoiceNumber', '4639', '4639'),
|
||||
self._make_extracted_field('Bankgiro', '134-4639', '134-4639'),
|
||||
self._make_extracted_field('Amount', '500,00', '500.00'),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert 'Bankgiro' in result.fields
|
||||
assert result.fields['Bankgiro'] == '134-4639'
|
||||
assert 'InvoiceNumber' not in result.fields
|
||||
|
||||
def test_invoice_number_not_substring_of_unrelated_bankgiro(self):
|
||||
"""When InvoiceNumber is NOT a substring of Bankgiro, keep both."""
|
||||
p = self._make_pipeline_for_merge()
|
||||
result = InferenceResult()
|
||||
result.extracted_fields = [
|
||||
self._make_extracted_field('InvoiceNumber', '19061', '19061'),
|
||||
self._make_extracted_field('Bankgiro', '5393-9484', '5393-9484'),
|
||||
self._make_extracted_field('Amount', '500,00', '500.00'),
|
||||
]
|
||||
p._merge_fields(result)
|
||||
assert result.fields['InvoiceNumber'] == '19061'
|
||||
assert result.fields['Bankgiro'] == '5393-9484'
|
||||
|
||||
|
||||
class TestFallbackTrigger:
|
||||
"""Tests for _needs_fallback trigger threshold."""
|
||||
|
||||
def _make_pipeline(self):
|
||||
with patch.object(InferencePipeline, '__init__', lambda self, **kw: None):
|
||||
p = InferencePipeline()
|
||||
return p
|
||||
|
||||
def test_fallback_triggers_when_1_key_field_missing(self):
|
||||
"""Should trigger when only 1 key field (e.g. InvoiceNumber) is missing."""
|
||||
p = self._make_pipeline()
|
||||
result = InferenceResult()
|
||||
result.fields = {
|
||||
'Amount': '1234.56',
|
||||
'OCR': '12345678901',
|
||||
'InvoiceDate': '2025-01-15',
|
||||
'InvoiceDueDate': '2025-02-15',
|
||||
'supplier_organisation_number': '556123-4567',
|
||||
}
|
||||
# InvoiceNumber missing -> should trigger
|
||||
assert p._needs_fallback(result) is True
|
||||
|
||||
def test_fallback_triggers_when_dates_missing(self):
|
||||
"""Should trigger when all key fields present but 2+ important fields missing."""
|
||||
p = self._make_pipeline()
|
||||
result = InferenceResult()
|
||||
result.fields = {
|
||||
'Amount': '1234.56',
|
||||
'InvoiceNumber': '12345',
|
||||
'OCR': '12345678901',
|
||||
}
|
||||
# InvoiceDate, InvoiceDueDate, supplier_org all missing -> should trigger
|
||||
assert p._needs_fallback(result) is True
|
||||
|
||||
def test_no_fallback_when_all_fields_present(self):
|
||||
"""Should NOT trigger when all key and important fields present."""
|
||||
p = self._make_pipeline()
|
||||
result = InferenceResult()
|
||||
result.fields = {
|
||||
'Amount': '1234.56',
|
||||
'InvoiceNumber': '12345',
|
||||
'OCR': '12345678901',
|
||||
'InvoiceDate': '2025-01-15',
|
||||
'InvoiceDueDate': '2025-02-15',
|
||||
'supplier_organisation_number': '556123-4567',
|
||||
}
|
||||
assert p._needs_fallback(result) is False
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
|
||||
Reference in New Issue
Block a user