restructure project
This commit is contained in:
@@ -13,7 +13,7 @@ Usage:
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
from src.pdf.detector import (
|
||||
from shared.pdf.detector import (
|
||||
extract_text_first_page,
|
||||
is_text_pdf,
|
||||
get_pdf_type,
|
||||
@@ -54,12 +54,12 @@ class TestIsTextPDF:
|
||||
|
||||
def test_empty_pdf_returns_false(self):
|
||||
"""Should return False for PDF with no text."""
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=""):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=""):
|
||||
assert is_text_pdf("test.pdf") is False
|
||||
|
||||
def test_short_text_returns_false(self):
|
||||
"""Should return False for PDF with very short text."""
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value="Hello"):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value="Hello"):
|
||||
assert is_text_pdf("test.pdf") is False
|
||||
|
||||
def test_readable_text_with_keywords_returns_true(self):
|
||||
@@ -72,7 +72,7 @@ class TestIsTextPDF:
|
||||
Moms: 25%
|
||||
""" + "a" * 200 # Ensure > 200 chars
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
assert is_text_pdf("test.pdf") is True
|
||||
|
||||
def test_garbled_text_returns_false(self):
|
||||
@@ -80,7 +80,7 @@ class TestIsTextPDF:
|
||||
# Simulate garbled text (lots of non-printable characters)
|
||||
garbled = "\x00\x01\x02" * 100 + "abc" * 20 # Low readable ratio
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=garbled):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=garbled):
|
||||
assert is_text_pdf("test.pdf") is False
|
||||
|
||||
def test_text_without_keywords_needs_high_readability(self):
|
||||
@@ -88,7 +88,7 @@ class TestIsTextPDF:
|
||||
# Text without invoice keywords
|
||||
text = "The quick brown fox jumps over the lazy dog. " * 10
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
# Should pass if readable ratio is high enough
|
||||
result = is_text_pdf("test.pdf")
|
||||
# Result depends on character ratio - ASCII text should pass
|
||||
@@ -98,7 +98,7 @@ class TestIsTextPDF:
|
||||
"""Should respect custom min_chars parameter."""
|
||||
text = "Short text here" # 15 chars
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
# Default min_chars=30 - should fail
|
||||
assert is_text_pdf("test.pdf", min_chars=30) is False
|
||||
# Custom min_chars=10 - should pass basic length check
|
||||
@@ -273,7 +273,7 @@ class TestIsTextPDFKeywordDetection:
|
||||
# Create text with keyword and enough content
|
||||
text = f"Document with {keyword} keyword here" + " more text" * 50
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
# Need at least 2 keywords for is_text_pdf to return True
|
||||
# So this tests if keyword is recognized when combined with others
|
||||
pass
|
||||
@@ -282,7 +282,7 @@ class TestIsTextPDFKeywordDetection:
|
||||
"""Should detect English invoice keywords."""
|
||||
text = "Invoice document with date and amount information" + " x" * 100
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
# invoice + date = 2 keywords
|
||||
result = is_text_pdf("test.pdf")
|
||||
assert result is True
|
||||
@@ -292,7 +292,7 @@ class TestIsTextPDFKeywordDetection:
|
||||
# Only one keyword
|
||||
text = "This is a faktura document" + " x" * 200
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
# With only 1 keyword, falls back to other checks
|
||||
# Should still pass if readability is high
|
||||
pass
|
||||
@@ -306,7 +306,7 @@ class TestReadabilityChecks:
|
||||
# Pure ASCII text
|
||||
text = "This is a normal document with only ASCII characters. " * 10
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
result = is_text_pdf("test.pdf")
|
||||
assert result is True
|
||||
|
||||
@@ -314,7 +314,7 @@ class TestReadabilityChecks:
|
||||
"""Should accept Swedish characters as readable."""
|
||||
text = "Fakturadatum för årets moms på öre belopp" + " normal" * 50
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
result = is_text_pdf("test.pdf")
|
||||
assert result is True
|
||||
|
||||
@@ -326,7 +326,7 @@ class TestReadabilityChecks:
|
||||
unreadable = "\x80\x81\x82" * 50 # 150 unreadable chars
|
||||
text = readable + unreadable
|
||||
|
||||
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
|
||||
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
|
||||
result = is_text_pdf("test.pdf")
|
||||
assert result is False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user