restructure project

This commit is contained in:
Yaojia Wang
2026-01-27 23:58:17 +01:00
parent 58bf75db68
commit d6550375b0
230 changed files with 5513 additions and 1756 deletions

View File

@@ -13,7 +13,7 @@ Usage:
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from src.pdf.detector import (
from shared.pdf.detector import (
extract_text_first_page,
is_text_pdf,
get_pdf_type,
@@ -54,12 +54,12 @@ class TestIsTextPDF:
def test_empty_pdf_returns_false(self):
"""Should return False for PDF with no text."""
with patch("src.pdf.detector.extract_text_first_page", return_value=""):
with patch("shared.pdf.detector.extract_text_first_page", return_value=""):
assert is_text_pdf("test.pdf") is False
def test_short_text_returns_false(self):
"""Should return False for PDF with very short text."""
with patch("src.pdf.detector.extract_text_first_page", return_value="Hello"):
with patch("shared.pdf.detector.extract_text_first_page", return_value="Hello"):
assert is_text_pdf("test.pdf") is False
def test_readable_text_with_keywords_returns_true(self):
@@ -72,7 +72,7 @@ class TestIsTextPDF:
Moms: 25%
""" + "a" * 200 # Ensure > 200 chars
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
assert is_text_pdf("test.pdf") is True
def test_garbled_text_returns_false(self):
@@ -80,7 +80,7 @@ class TestIsTextPDF:
# Simulate garbled text (lots of non-printable characters)
garbled = "\x00\x01\x02" * 100 + "abc" * 20 # Low readable ratio
with patch("src.pdf.detector.extract_text_first_page", return_value=garbled):
with patch("shared.pdf.detector.extract_text_first_page", return_value=garbled):
assert is_text_pdf("test.pdf") is False
def test_text_without_keywords_needs_high_readability(self):
@@ -88,7 +88,7 @@ class TestIsTextPDF:
# Text without invoice keywords
text = "The quick brown fox jumps over the lazy dog. " * 10
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
# Should pass if readable ratio is high enough
result = is_text_pdf("test.pdf")
# Result depends on character ratio - ASCII text should pass
@@ -98,7 +98,7 @@ class TestIsTextPDF:
"""Should respect custom min_chars parameter."""
text = "Short text here" # 15 chars
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
# Default min_chars=30 - should fail
assert is_text_pdf("test.pdf", min_chars=30) is False
# Custom min_chars=10 - should pass basic length check
@@ -273,7 +273,7 @@ class TestIsTextPDFKeywordDetection:
# Create text with keyword and enough content
text = f"Document with {keyword} keyword here" + " more text" * 50
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
# Need at least 2 keywords for is_text_pdf to return True
# So this tests if keyword is recognized when combined with others
pass
@@ -282,7 +282,7 @@ class TestIsTextPDFKeywordDetection:
"""Should detect English invoice keywords."""
text = "Invoice document with date and amount information" + " x" * 100
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
# invoice + date = 2 keywords
result = is_text_pdf("test.pdf")
assert result is True
@@ -292,7 +292,7 @@ class TestIsTextPDFKeywordDetection:
# Only one keyword
text = "This is a faktura document" + " x" * 200
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
# With only 1 keyword, falls back to other checks
# Should still pass if readability is high
pass
@@ -306,7 +306,7 @@ class TestReadabilityChecks:
# Pure ASCII text
text = "This is a normal document with only ASCII characters. " * 10
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
result = is_text_pdf("test.pdf")
assert result is True
@@ -314,7 +314,7 @@ class TestReadabilityChecks:
"""Should accept Swedish characters as readable."""
text = "Fakturadatum för årets moms på öre belopp" + " normal" * 50
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
result = is_text_pdf("test.pdf")
assert result is True
@@ -326,7 +326,7 @@ class TestReadabilityChecks:
unreadable = "\x80\x81\x82" * 50 # 150 unreadable chars
text = readable + unreadable
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
with patch("shared.pdf.detector.extract_text_first_page", return_value=text):
result = is_text_pdf("test.pdf")
assert result is False

View File

@@ -12,7 +12,7 @@ Usage:
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from src.pdf.extractor import (
from shared.pdf.extractor import (
Token,
PDFDocument,
extract_text_tokens,
@@ -509,7 +509,7 @@ class TestPDFDocumentIsTextPDF:
mock_doc = MagicMock()
with patch("fitz.open", return_value=mock_doc):
with patch("src.pdf.extractor._is_text_pdf_standalone", return_value=True) as mock_check:
with patch("shared.pdf.extractor._is_text_pdf_standalone", return_value=True) as mock_check:
with PDFDocument("test.pdf") as pdf:
result = pdf.is_text_pdf(min_chars=50)