restructure project

2026-01-27 23:58:17 +01:00
parent 58bf75db68
commit d6550375b0
230 changed files with 5513 additions and 1756 deletions
--- a/tests/pdf/test_detector.py
+++ b/tests/pdf/test_detector.py
@@ -13,7 +13,7 @@ Usage:
 import pytest
 from pathlib import Path
 from unittest.mock import patch, MagicMock
-from src.pdf.detector import (
+from shared.pdf.detector import (
    extract_text_first_page,
    is_text_pdf,
    get_pdf_type,
@@ -54,12 +54,12 @@ class TestIsTextPDF:

    def test_empty_pdf_returns_false(self):
        """Should return False for PDF with no text."""
-        with patch("src.pdf.detector.extract_text_first_page", return_value=""):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=""):
            assert is_text_pdf("test.pdf") is False

    def test_short_text_returns_false(self):
        """Should return False for PDF with very short text."""
-        with patch("src.pdf.detector.extract_text_first_page", return_value="Hello"):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value="Hello"):
            assert is_text_pdf("test.pdf") is False

    def test_readable_text_with_keywords_returns_true(self):
@@ -72,7 +72,7 @@ class TestIsTextPDF:
        Moms: 25%
        """ + "a" * 200  # Ensure > 200 chars

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            assert is_text_pdf("test.pdf") is True

    def test_garbled_text_returns_false(self):
@@ -80,7 +80,7 @@ class TestIsTextPDF:
        # Simulate garbled text (lots of non-printable characters)
        garbled = "\x00\x01\x02" * 100 + "abc" * 20  # Low readable ratio

-        with patch("src.pdf.detector.extract_text_first_page", return_value=garbled):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=garbled):
            assert is_text_pdf("test.pdf") is False

    def test_text_without_keywords_needs_high_readability(self):
@@ -88,7 +88,7 @@ class TestIsTextPDF:
        # Text without invoice keywords
        text = "The quick brown fox jumps over the lazy dog. " * 10

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            # Should pass if readable ratio is high enough
            result = is_text_pdf("test.pdf")
            # Result depends on character ratio - ASCII text should pass
@@ -98,7 +98,7 @@ class TestIsTextPDF:
        """Should respect custom min_chars parameter."""
        text = "Short text here"  # 15 chars

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            # Default min_chars=30 - should fail
            assert is_text_pdf("test.pdf", min_chars=30) is False
            # Custom min_chars=10 - should pass basic length check
@@ -273,7 +273,7 @@ class TestIsTextPDFKeywordDetection:
            # Create text with keyword and enough content
            text = f"Document with {keyword} keyword here" + " more text" * 50

-            with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+            with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
                # Need at least 2 keywords for is_text_pdf to return True
                # So this tests if keyword is recognized when combined with others
                pass
@@ -282,7 +282,7 @@ class TestIsTextPDFKeywordDetection:
        """Should detect English invoice keywords."""
        text = "Invoice document with date and amount information" + " x" * 100

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            # invoice + date = 2 keywords
            result = is_text_pdf("test.pdf")
            assert result is True
@@ -292,7 +292,7 @@ class TestIsTextPDFKeywordDetection:
        # Only one keyword
        text = "This is a faktura document" + " x" * 200

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            # With only 1 keyword, falls back to other checks
            # Should still pass if readability is high
            pass
@@ -306,7 +306,7 @@ class TestReadabilityChecks:
        # Pure ASCII text
        text = "This is a normal document with only ASCII characters. " * 10

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            result = is_text_pdf("test.pdf")
            assert result is True

@@ -314,7 +314,7 @@ class TestReadabilityChecks:
        """Should accept Swedish characters as readable."""
        text = "Fakturadatum för årets moms på öre belopp" + " normal" * 50

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            result = is_text_pdf("test.pdf")
            assert result is True

@@ -326,7 +326,7 @@ class TestReadabilityChecks:
        unreadable = "\x80\x81\x82" * 50  # 150 unreadable chars
        text = readable + unreadable

-        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
+        with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
            result = is_text_pdf("test.pdf")
            assert result is False

--- a/tests/pdf/test_extractor.py
+++ b/tests/pdf/test_extractor.py
@@ -12,7 +12,7 @@ Usage:
 import pytest
 from pathlib import Path
 from unittest.mock import patch, MagicMock
-from src.pdf.extractor import (
+from shared.pdf.extractor import (
    Token,
    PDFDocument,
    extract_text_tokens,
@@ -509,7 +509,7 @@ class TestPDFDocumentIsTextPDF:
        mock_doc = MagicMock()

        with patch("fitz.open", return_value=mock_doc):
-            with patch("src.pdf.extractor._is_text_pdf_standalone", return_value=True) as mock_check:
+            with patch("shared.pdf.extractor._is_text_pdf_standalone", return_value=True) as mock_check:
                with PDFDocument("test.pdf") as pdf:
                    result = pdf.is_text_pdf(min_chars=50)