WIP
This commit is contained in:
@@ -400,6 +400,71 @@ class TestAmountNormalizer:
|
||||
result = normalizer.normalize("Reference 12500")
|
||||
assert result.value == "12500.00"
|
||||
|
||||
def test_payment_line_kronor_ore_format(self, normalizer):
    """Treat the space in a kronor/ore pair as the decimal point.

    On Swedish payment lines the kronor and ore parts are separated by a
    space, so the input "590 00" stands for 590.00 SEK and must not be
    read as 59000.
    """
    raw_amount = "590 00"
    normalized = normalizer.normalize(raw_amount)

    assert normalized.value == "590.00"
    assert normalized.is_valid is True
|
||||
|
||||
def test_payment_line_kronor_ore_large_amount(self, normalizer):
    """A five-digit kronor part followed by ore is still kronor/ore."""
    normalized = normalizer.normalize("15658 00")

    assert normalized.value == "15658.00"
    assert normalized.is_valid is True
|
||||
|
||||
def test_payment_line_kronor_ore_with_nonzero_ore(self, normalizer):
    """The ore part is kept when it is something other than 00."""
    raw_amount = "736 50"
    normalized = normalizer.normalize(raw_amount)

    assert normalized.value == "736.50"
    assert normalized.is_valid is True
|
||||
|
||||
def test_kronor_ore_not_confused_with_thousand_separator(self, normalizer):
    """A comma-decimal amount must not be routed through the kronor/ore rule."""
    parsed = normalizer.normalize("1 234,56")

    assert parsed.value is not None
    # The expected reading is 1234.56. Reading it as kronor=1234 / ore=56
    # would give the same number, so only the numeric value is compared.
    assert float(parsed.value) == 1234.56
|
||||
|
||||
def test_european_dot_thousand_separator(self, normalizer):
    """European notation: '.' groups thousands and ',' marks decimals."""
    normalized = normalizer.normalize("2.254,50")

    assert normalized.value == "2254.50"
    assert normalized.is_valid is True
|
||||
|
||||
def test_european_dot_thousand_with_sek(self, normalizer):
    """European notation followed by a trailing SEK currency code."""
    raw_amount = "2.254,50 SEK"
    normalized = normalizer.normalize(raw_amount)

    assert normalized.value == "2254.50"
    assert normalized.is_valid is True
|
||||
|
||||
def test_european_dot_thousand_with_kr(self, normalizer):
    """European notation followed by a trailing kr suffix."""
    raw_amount = "20.485,00 kr"
    normalized = normalizer.normalize(raw_amount)

    assert normalized.value == "20485.00"
    assert normalized.is_valid is True
|
||||
|
||||
def test_european_large_amount(self, normalizer):
    """European notation with more than one thousands group."""
    normalized = normalizer.normalize("1.234.567,89")

    assert normalized.value == "1234567.89"
    assert normalized.is_valid is True
|
||||
|
||||
def test_european_in_label_context(self, normalizer):
    """European amount embedded in label text (the BAUHAUS invoice bug)."""
    labelled_text = "ns Fakturabelopp: 2.254,50 SEK"
    normalized = normalizer.normalize(labelled_text)

    assert normalized.value == "2254.50"
    assert normalized.is_valid is True
|
||||
|
||||
def test_anglo_comma_thousand_separator(self, normalizer):
    """Anglo notation: ',' groups thousands and '.' marks decimals."""
    normalized = normalizer.normalize("1,234.56")

    assert normalized.value == "1234.56"
    assert normalized.is_valid is True
|
||||
|
||||
def test_zero_amount_rejected(self, normalizer):
|
||||
"""Test that zero amounts are rejected."""
|
||||
result = normalizer.normalize("0,00 kr")
|
||||
@@ -450,6 +515,18 @@ class TestEnhancedAmountNormalizer:
|
||||
result = normalizer.normalize("Invoice for 1 234 567,89 kr")
|
||||
assert result.value is not None
|
||||
|
||||
def test_enhanced_kronor_ore_format(self, normalizer):
    """Enhanced normalizer: a kronor/ore pair split by a space."""
    raw_amount = "590 00"
    normalized = normalizer.normalize(raw_amount)

    assert normalized.value == "590.00"
    assert normalized.is_valid is True
|
||||
|
||||
def test_enhanced_kronor_ore_large(self, normalizer):
    """Enhanced normalizer: kronor/ore pair with a five-digit kronor part."""
    normalized = normalizer.normalize("15658 00")

    assert normalized.value == "15658.00"
    assert normalized.is_valid is True
|
||||
|
||||
def test_no_amount_fails(self, normalizer):
|
||||
"""Test failure when no amount found."""
|
||||
result = normalizer.normalize("no amount")
|
||||
@@ -472,6 +549,22 @@ class TestEnhancedAmountNormalizer:
|
||||
result = normalizer.normalize("Price: 1 234 567,89")
|
||||
assert result.value is not None
|
||||
|
||||
def test_enhanced_european_dot_thousand(self, normalizer):
    """Enhanced normalizer: European notation with a SEK suffix."""
    raw_amount = "2.254,50 SEK"
    normalized = normalizer.normalize(raw_amount)

    assert normalized.value == "2254.50"
    assert normalized.is_valid is True
|
||||
|
||||
def test_enhanced_european_with_label(self, normalizer):
    """Enhanced normalizer: European amount after a Swedish label keyword."""
    labelled_text = "Att betala: 2.254,50"
    normalized = normalizer.normalize(labelled_text)

    assert normalized.value == "2254.50"
|
||||
|
||||
def test_enhanced_anglo_format(self, normalizer):
    """Enhanced normalizer: Anglo notation after a 'Total:' label."""
    labelled_text = "Total: 1,234.56"
    normalized = normalizer.normalize(labelled_text)

    assert normalized.value == "1234.56"
|
||||
|
||||
def test_amount_out_of_range_rejected(self, normalizer):
|
||||
"""Test that amounts >= 10,000,000 are rejected."""
|
||||
result = normalizer.normalize("Summa: 99 999 999,00")
|
||||
|
||||
@@ -497,5 +497,178 @@ class TestExtractBusinessFeaturesErrorHandling:
|
||||
assert "NumericException" in result.errors[0]
|
||||
|
||||
|
||||
class TestProcessPdfTokenPath:
    """Tests for PDF text token extraction path in process_pdf().

    Each test builds an InferencePipeline whose collaborators are mocks,
    patches PDFDocument and render_pdf_to_images, runs process_pdf() and
    then checks which extractor method was called.
    """

    def _make_pipeline(self):
        """Create pipeline with mocked internals, bypassing __init__."""
        # Only construction needs the patched __init__; the attributes the
        # tests rely on are filled in by hand afterwards.
        with patch.object(InferencePipeline, '__init__', lambda self, **kw: None):
            p = InferencePipeline()
        p.detector = MagicMock()
        p.extractor = MagicMock()
        p.payment_line_parser = MagicMock()
        p.dpi = 300
        p.enable_fallback = False
        p.enable_business_features = False
        p.vat_tolerance = 0.5
        p.line_items_extractor = None
        p.vat_extractor = None
        p.vat_validator = None
        p._business_ocr_engine = None
        p._table_detector = None
        return p

    def _make_detection(self, class_name='Amount', confidence=0.85, page_no=0):
        """Create a Detection object with a fixed class id and bbox."""
        from backend.pipeline.yolo_detector import Detection

        return Detection(
            class_id=6,
            class_name=class_name,
            confidence=confidence,
            bbox=(100.0, 200.0, 300.0, 250.0),
            page_no=page_no,
        )

    def _make_extracted_field(self, field_name='Amount', raw_text='2.254,50',
                              normalized='2254.50', confidence=0.85):
        """Create an ExtractedField object.

        ``confidence`` is used for both the overall and the detection
        confidence; the OCR confidence is fixed at 1.0.
        """
        from backend.pipeline.field_extractor import ExtractedField

        return ExtractedField(
            field_name=field_name,
            raw_text=raw_text,
            normalized_value=normalized,
            confidence=confidence,
            detection_confidence=confidence,
            ocr_confidence=1.0,
            bbox=(100.0, 200.0, 300.0, 250.0),
            page_no=0,
        )

    def _make_image_bytes(self):
        """Create minimal valid PNG bytes (100x100 white image)."""
        from PIL import Image as PILImage
        import io as _io

        img = PILImage.new('RGB', (100, 100), color='white')
        buf = _io.BytesIO()
        img.save(buf, format='PNG')
        return buf.getvalue()

    def _install_pdf_doc(self, mock_pdf_doc_cls, is_text_pdf, tokens=None):
        """Wire the patched PDFDocument class to yield a mock document.

        The instance returned by ``mock_pdf_doc_cls(...)`` is set up as a
        context manager whose ``__enter__`` returns the mock document.

        Args:
            mock_pdf_doc_cls: The mock that replaces PDFDocument.
            is_text_pdf: Value returned by the document's ``is_text_pdf()``.
            tokens: Optional token list. When given, ``page_count`` is set
                to 1 and ``extract_text_tokens()`` returns an iterator over
                the tokens; when omitted, neither is configured.

        Returns:
            The mock document handed out by ``__enter__``.
        """
        pdf_doc = MagicMock()
        pdf_doc.is_text_pdf.return_value = is_text_pdf
        if tokens is not None:
            pdf_doc.page_count = 1
            pdf_doc.extract_text_tokens.return_value = iter(tokens)

        ctx = mock_pdf_doc_cls.return_value
        ctx.__enter__ = MagicMock(return_value=pdf_doc)
        # A falsy __exit__ result means exceptions are not suppressed.
        ctx.__exit__ = MagicMock(return_value=False)
        return pdf_doc

    @patch('shared.pdf.extractor.PDFDocument')
    @patch('shared.pdf.renderer.render_pdf_to_images')
    def test_text_pdf_uses_pdf_tokens(self, mock_render, mock_pdf_doc_cls):
        """When PDF is text-based, extract_from_detection_with_pdf is used."""
        from shared.pdf.extractor import Token

        pipeline = self._make_pipeline()
        detection = self._make_detection()
        image_bytes = self._make_image_bytes()

        # Setup PDFDocument mock - text PDF with tokens
        tokens = [Token(text="2.254,50", bbox=(100, 200, 200, 220), page_no=0)]
        self._install_pdf_doc(mock_pdf_doc_cls, True, tokens=tokens)

        pipeline.detector.detect.return_value = [detection]
        pipeline.extractor.extract_from_detection_with_pdf.return_value = (
            self._make_extracted_field()
        )

        mock_render.return_value = iter([(0, image_bytes)])
        result = pipeline.process_pdf('/fake/invoice.pdf')

        pipeline.extractor.extract_from_detection_with_pdf.assert_called_once()
        pipeline.extractor.extract_from_detection.assert_not_called()
        assert result.fields.get('Amount') == '2254.50'
        assert result.success is True

    @patch('shared.pdf.extractor.PDFDocument')
    @patch('shared.pdf.renderer.render_pdf_to_images')
    def test_scanned_pdf_uses_ocr(self, mock_render, mock_pdf_doc_cls):
        """When PDF is scanned, extract_from_detection (OCR) is used."""
        pipeline = self._make_pipeline()
        detection = self._make_detection()
        image_bytes = self._make_image_bytes()

        # Scanned PDF: is_text_pdf() is False and no tokens are configured.
        self._install_pdf_doc(mock_pdf_doc_cls, False)

        pipeline.detector.detect.return_value = [detection]
        pipeline.extractor.extract_from_detection.return_value = (
            self._make_extracted_field(raw_text='4.50', normalized='4.50', confidence=0.75)
        )

        mock_render.return_value = iter([(0, image_bytes)])
        pipeline.process_pdf('/fake/invoice.pdf')

        pipeline.extractor.extract_from_detection.assert_called_once()
        pipeline.extractor.extract_from_detection_with_pdf.assert_not_called()

    @patch('shared.pdf.extractor.PDFDocument')
    @patch('shared.pdf.renderer.render_pdf_to_images')
    def test_pdf_detection_error_falls_back_to_ocr(self, mock_render, mock_pdf_doc_cls):
        """When PDF text detection throws, fall back to OCR."""
        pipeline = self._make_pipeline()
        detection = self._make_detection()
        image_bytes = self._make_image_bytes()

        # Entering the PDFDocument context raises, simulating a corrupt file.
        mock_ctx = MagicMock()
        mock_ctx.__enter__ = MagicMock(side_effect=Exception("corrupt PDF"))
        mock_ctx.__exit__ = MagicMock(return_value=False)
        mock_pdf_doc_cls.return_value = mock_ctx

        pipeline.detector.detect.return_value = [detection]
        pipeline.extractor.extract_from_detection.return_value = (
            self._make_extracted_field(raw_text='4.50', normalized='4.50', confidence=0.75)
        )

        mock_render.return_value = iter([(0, image_bytes)])
        pipeline.process_pdf('/fake/invoice.pdf')

        pipeline.extractor.extract_from_detection.assert_called_once()
        pipeline.extractor.extract_from_detection_with_pdf.assert_not_called()

    @patch('shared.pdf.extractor.PDFDocument')
    @patch('shared.pdf.renderer.render_pdf_to_images')
    def test_text_pdf_passes_correct_args(self, mock_render, mock_pdf_doc_cls):
        """Verify correct token list and image dimensions are passed."""
        from shared.pdf.extractor import Token

        pipeline = self._make_pipeline()
        detection = self._make_detection()
        image_bytes = self._make_image_bytes()  # 100x100 PNG

        tokens = [
            Token(text="Fakturabelopp:", bbox=(50, 190, 100, 210), page_no=0),
            Token(text="2.254,50", bbox=(105, 190, 180, 210), page_no=0),
            Token(text="SEK", bbox=(185, 190, 210, 210), page_no=0),
        ]
        self._install_pdf_doc(mock_pdf_doc_cls, True, tokens=tokens)

        pipeline.detector.detect.return_value = [detection]
        pipeline.extractor.extract_from_detection_with_pdf.return_value = (
            self._make_extracted_field()
        )

        mock_render.return_value = iter([(0, image_bytes)])
        pipeline.process_pdf('/fake/invoice.pdf')

        # call_args[0] holds the positional arguments of the last call.
        call_args = pipeline.extractor.extract_from_detection_with_pdf.call_args[0]
        assert call_args[0] == detection
        assert len(call_args[1]) == 3  # 3 tokens passed
        assert call_args[2] == 100  # image width
        assert call_args[3] == 100  # image height
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # pytest.main() returns the run's exit status instead of exiting, so it
    # is handed to SystemExit; otherwise running this file directly would
    # exit with status 0 even when tests fail.
    raise SystemExit(pytest.main([__file__, '-v']))
|
||||
|
||||
Reference in New Issue
Block a user