"""Tests for PDF magic-bytes validation in document upload.

TDD: These tests are written FIRST, before implementation. They should FAIL
initially until the validation logic is implemented.
"""

from io import BytesIO
from unittest.mock import AsyncMock, MagicMock, patch
from uuid import UUID

import pytest
from fastapi import UploadFile
from fastapi.testclient import TestClient

from backend.web.api.v1.admin.documents import create_documents_router
from backend.web.config import StorageConfig

# Test constants
TEST_DOC_UUID = "550e8400-e29b-41d4-a716-446655440000"
TEST_TOKEN = "test-admin-token-12345"


class TestPDFMagicBytesValidation:
    """Unit tests for ``validate_pdf_magic_bytes`` and ``PDF_MAGIC_BYTES``.

    The names under test are imported inside each test rather than at module
    level, so that a missing implementation fails the individual test instead
    of breaking collection of the whole module.
    """

    @pytest.fixture
    def storage_config(self, tmp_path):
        """Create a StorageConfig rooted in pytest's temporary directory."""
        return StorageConfig(
            upload_dir=tmp_path / "uploads",
            result_dir=tmp_path / "results",
            max_file_size_mb=50,
        )

    @pytest.fixture
    def mock_dependencies(self):
        """Create mock dependencies for document upload."""
        mock_docs = MagicMock()
        mock_docs.create.return_value = TEST_DOC_UUID

        mock_annotations = MagicMock()
        mock_annotations.get_for_document.return_value = []

        return {
            "docs": mock_docs,
            "annotations": mock_annotations,
        }

    @pytest.fixture
    def valid_pdf_content(self) -> bytes:
        """Create minimal content that starts with the %PDF magic bytes."""
        return (
            b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
            b"1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"
        )

    @pytest.fixture
    def invalid_pdf_content_exe(self) -> bytes:
        """Create content that looks like an executable (MZ header)."""
        return b"MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\xff\xff"

    @pytest.fixture
    def invalid_pdf_content_text(self) -> bytes:
        """Create plain text content masquerading as PDF."""
        return b"This is not a PDF file, just plain text."

    @pytest.fixture
    def invalid_pdf_content_html(self) -> bytes:
        """Create HTML content masquerading as PDF."""
        # Real markup, so the HTML test actually feeds HTML to the validator.
        return b"<html><body>Not a PDF</body></html>"

    @pytest.fixture
    def empty_content(self) -> bytes:
        """Create empty file content."""
        return b""

    @pytest.fixture
    def almost_valid_pdf(self) -> bytes:
        """Create content that starts with %PD but not %PDF."""
        return b"%PD-1.4\nNot quite right"

    def test_valid_pdf_passes_validation(self, valid_pdf_content):
        """A payload starting with b'%PDF' must pass without raising."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        # Should not raise any exception
        validate_pdf_magic_bytes(valid_pdf_content)

    def test_invalid_pdf_exe_fails_validation(self, invalid_pdf_content_exe):
        """An executable renamed to .pdf must be rejected.

        This is a security test - attackers might try to upload malicious
        executables by renaming them to .pdf.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        with pytest.raises(ValueError, match="Invalid PDF file") as exc_info:
            validate_pdf_magic_bytes(invalid_pdf_content_exe)

        assert "valid PDF header" in str(exc_info.value)

    def test_invalid_pdf_text_fails_validation(self, invalid_pdf_content_text):
        """A plain text file renamed to .pdf must be rejected."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        with pytest.raises(ValueError, match="Invalid PDF file"):
            validate_pdf_magic_bytes(invalid_pdf_content_text)

    def test_invalid_pdf_html_fails_validation(self, invalid_pdf_content_html):
        """An HTML file renamed to .pdf must be rejected."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        with pytest.raises(ValueError, match="Invalid PDF file"):
            validate_pdf_magic_bytes(invalid_pdf_content_html)

    def test_empty_file_fails_validation(self, empty_content):
        """An empty file must be rejected.

        Empty files cannot be valid PDFs and should be rejected.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        with pytest.raises(ValueError, match="Invalid PDF file"):
            validate_pdf_magic_bytes(empty_content)

    def test_almost_valid_pdf_fails_validation(self, almost_valid_pdf):
        """Content starting with %PD but not %PDF must be rejected.

        The magic bytes must be exactly %PDF (4 bytes).
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        with pytest.raises(ValueError, match="Invalid PDF file"):
            validate_pdf_magic_bytes(almost_valid_pdf)

    def test_pdf_magic_bytes_constant(self):
        """The PDF magic bytes constant must be exactly b'%PDF'."""
        from backend.web.api.v1.admin.documents import PDF_MAGIC_BYTES

        assert PDF_MAGIC_BYTES == b"%PDF"

    def test_validation_is_case_sensitive(self):
        """Magic bytes validation must be case-sensitive.

        %pdf (lowercase) should fail - PDF magic bytes are uppercase.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        lowercase_pdf = b"%pdf-1.4\nfake content"

        with pytest.raises(ValueError, match="Invalid PDF file"):
            validate_pdf_magic_bytes(lowercase_pdf)


class TestDocumentUploadWithMagicBytesValidation:
    """Upload-oriented tests for magic bytes validation.

    These tests build the documents router and call the validator directly;
    they do not send HTTP requests, so status codes are not checked here.
    """

    @pytest.fixture
    def storage_config(self, tmp_path):
        """Create a StorageConfig rooted in pytest's temporary directory."""
        return StorageConfig(
            upload_dir=tmp_path / "uploads",
            result_dir=tmp_path / "results",
            max_file_size_mb=50,
        )

    @pytest.fixture
    def valid_pdf_content(self) -> bytes:
        """Create valid PDF content."""
        return (
            b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
            b"1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"
        )

    @pytest.fixture
    def invalid_pdf_content(self) -> bytes:
        """Create invalid PDF content (executable header)."""
        return b"MZ\x90\x00\x03\x00\x00\x00"

    def test_upload_valid_pdf_succeeds(
        self, storage_config, valid_pdf_content
    ):
        """The upload route must exist and valid PDF content must validate."""
        router = create_documents_router(storage_config)

        # Find the upload endpoint (path includes prefix /admin/documents).
        # ``methods`` may be absent or None on some route types, so fall back
        # to an empty tuple instead of raising on the membership test.
        upload_route_exists = any(
            "POST" in (getattr(route, "methods", None) or ())
            and route.path == "/admin/documents"
            for route in router.routes
        )
        assert upload_route_exists, "Upload route should exist"

        # Validate that valid PDF content passes validation
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        validate_pdf_magic_bytes(valid_pdf_content)  # Should not raise

    def test_upload_invalid_pdf_returns_400(
        self, storage_config, invalid_pdf_content
    ):
        """Invalid PDF content must raise a descriptive ValueError.

        The upload endpoint should convert this error to HTTP 400; this test
        only checks the validator and its message, which should clearly
        indicate the PDF header is invalid.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        # Simulate what the upload endpoint should do
        with pytest.raises(ValueError, match="Invalid PDF file") as exc_info:
            validate_pdf_magic_bytes(invalid_pdf_content)

        assert "valid PDF header" in str(exc_info.value)

    def test_upload_empty_pdf_returns_400(self, storage_config):
        """Empty content must raise ValueError.

        The upload endpoint should convert this error to HTTP 400; this test
        only checks the validator.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        empty_content = b""

        with pytest.raises(ValueError, match="Invalid PDF file"):
            validate_pdf_magic_bytes(empty_content)


class TestNonPDFFileValidation:
    """Tests documenting that non-PDF files must bypass the PDF check.

    NOTE(review): these tests only check the sample data, not the upload
    endpoint's extension-based skip logic. Exercise the endpoint itself once
    that logic exists.
    """

    def test_png_files_skip_pdf_validation(self):
        """PNG files must not go through PDF magic bytes validation.

        Only files with .pdf extension should be validated for PDF magic bytes.
        """
        # PNG magic bytes
        png_content = b"\x89PNG\r\n\x1a\n"
        file_ext = ".png"

        # PNG files should not be validated with PDF magic bytes check
        # The validation should only apply to .pdf files
        assert file_ext != ".pdf"
        # A PNG payload lacks the PDF header, so applying the PDF check to it
        # would wrongly reject a legitimate image.
        assert not png_content.startswith(b"%PDF")

    def test_jpg_files_skip_pdf_validation(self):
        """JPG files must not go through PDF magic bytes validation."""
        # JPEG magic bytes
        jpg_content = b"\xff\xd8\xff\xe0"
        file_ext = ".jpg"

        assert file_ext != ".pdf"
        # Same reasoning as for PNG: JPEG data does not carry the PDF header.
        assert not jpg_content.startswith(b"%PDF")