fix: add PDF magic bytes validation to prevent file type spoofing

Add validation that checks PDF files start with '%PDF' magic bytes before accepting uploads. This prevents attackers from uploading malicious files (executables, scripts) by renaming them to .pdf. - Add validate_pdf_magic_bytes() function with clear error messages - Integrate validation in upload_document endpoint after file read - Add comprehensive test coverage (13 test cases) Addresses medium-risk security issue from code review.
2026-02-03 22:28:24 +01:00
parent 183d3503ef
commit 4c7fc3015c
2 changed files with 292 additions and 0 deletions
--- a/packages/backend/backend/web/api/v1/admin/documents.py
+++ b/packages/backend/backend/web/api/v1/admin/documents.py
@@ -39,6 +39,26 @@ from backend.web.schemas.common import ErrorResponse
 logger = logging.getLogger(__name__)
 # PDF magic bytes - all valid PDF files must start with this sequence
 PDF_MAGIC_BYTES = b"%PDF"
 def validate_pdf_magic_bytes(content: bytes) -> None:
    """Validate that file content has valid PDF magic bytes.
    PDF files must start with the bytes '%PDF' (0x25 0x50 0x44 0x46).
    This validation prevents attackers from uploading malicious files
    (executables, scripts) by simply renaming them to .pdf extension.
    Args:
        content: The raw file content to validate.
    Raises:
        ValueError: If the content does not start with valid PDF magic bytes.
    """
    if not content or not content.startswith(PDF_MAGIC_BYTES):
        raise ValueError("Invalid PDF file: does not have valid PDF header")
 def _validate_uuid(value: str, name: str = "ID") -> None:
    """Validate UUID format."""
@@ -135,6 +155,14 @@ def create_documents_router(storage_config: StorageConfig) -> APIRouter:
            logger.error(f"Failed to read uploaded file: {e}")
            raise HTTPException(status_code=400, detail="Failed to read file")
        # Validate PDF magic bytes (only for PDF files)
        if file_ext == ".pdf":
            try:
                validate_pdf_magic_bytes(content)
            except ValueError as e:
                logger.warning(f"PDF magic bytes validation failed: {e}")
                raise HTTPException(status_code=400, detail=str(e))
        # Get page count (for PDF)
        page_count = 1
        if file_ext == ".pdf":
--- a/tests/web/test_documents_upload_validation.py
+++ b/tests/web/test_documents_upload_validation.py
@@ -0,0 +1,264 @@
 """
 Tests for PDF Magic Bytes Validation in Document Upload.
 TDD: These tests are written FIRST, before implementation.
 They should FAIL initially until the validation logic is implemented.
 """
 import pytest
 from io import BytesIO
 from unittest.mock import MagicMock, patch, AsyncMock
 from uuid import UUID
 from fastapi import UploadFile
 from fastapi.testclient import TestClient
 from backend.web.api.v1.admin.documents import create_documents_router
 from backend.web.config import StorageConfig
 # Test constants
 TEST_DOC_UUID = "550e8400-e29b-41d4-a716-446655440000"
 TEST_TOKEN = "test-admin-token-12345"
 class TestPDFMagicBytesValidation:
    """Tests for PDF magic bytes validation during upload."""
    @pytest.fixture
    def storage_config(self, tmp_path):
        """Create a StorageConfig for testing."""
        return StorageConfig(
            upload_dir=tmp_path / "uploads",
            result_dir=tmp_path / "results",
            max_file_size_mb=50,
        )
    @pytest.fixture
    def mock_dependencies(self):
        """Create mock dependencies for document upload."""
        mock_docs = MagicMock()
        mock_docs.create.return_value = TEST_DOC_UUID
        mock_annotations = MagicMock()
        mock_annotations.get_for_document.return_value = []
        return {
            "docs": mock_docs,
            "annotations": mock_annotations,
        }
    @pytest.fixture
    def valid_pdf_content(self) -> bytes:
        """Create valid PDF content with correct magic bytes."""
        # PDF files must start with %PDF
        return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"
    @pytest.fixture
    def invalid_pdf_content_exe(self) -> bytes:
        """Create content that looks like an executable (MZ header)."""
        return b"MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\xff\xff"
    @pytest.fixture
    def invalid_pdf_content_text(self) -> bytes:
        """Create plain text content masquerading as PDF."""
        return b"This is not a PDF file, just plain text."
    @pytest.fixture
    def invalid_pdf_content_html(self) -> bytes:
        """Create HTML content masquerading as PDF."""
        return b"<!DOCTYPE html><html><body>Not a PDF</body></html>"
    @pytest.fixture
    def empty_content(self) -> bytes:
        """Create empty file content."""
        return b""
    @pytest.fixture
    def almost_valid_pdf(self) -> bytes:
        """Create content that starts with %PD but not %PDF."""
        return b"%PD-1.4\nNot quite right"
    def test_valid_pdf_passes_validation(self, valid_pdf_content):
        """Test that a valid PDF file with correct magic bytes passes validation.
        A valid PDF must start with the bytes b'%PDF'.
        """
        # Import the validation function (to be implemented)
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        # Should not raise any exception
        validate_pdf_magic_bytes(valid_pdf_content)
    def test_invalid_pdf_exe_fails_validation(self, invalid_pdf_content_exe):
        """Test that an executable file renamed to .pdf fails validation.
        This is a security test - attackers might try to upload malicious
        executables by renaming them to .pdf.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(invalid_pdf_content_exe)
        assert "Invalid PDF file" in str(exc_info.value)
        assert "valid PDF header" in str(exc_info.value)
    def test_invalid_pdf_text_fails_validation(self, invalid_pdf_content_text):
        """Test that plain text file renamed to .pdf fails validation."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(invalid_pdf_content_text)
        assert "Invalid PDF file" in str(exc_info.value)
    def test_invalid_pdf_html_fails_validation(self, invalid_pdf_content_html):
        """Test that HTML file renamed to .pdf fails validation."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(invalid_pdf_content_html)
        assert "Invalid PDF file" in str(exc_info.value)
    def test_empty_file_fails_validation(self, empty_content):
        """Test that an empty file fails validation.
        Empty files cannot be valid PDFs and should be rejected.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(empty_content)
        assert "Invalid PDF file" in str(exc_info.value)
    def test_almost_valid_pdf_fails_validation(self, almost_valid_pdf):
        """Test that content starting with %PD but not %PDF fails validation.
        The magic bytes must be exactly %PDF (4 bytes).
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(almost_valid_pdf)
        assert "Invalid PDF file" in str(exc_info.value)
    def test_pdf_magic_bytes_constant(self):
        """Test that PDF magic bytes constant is correctly defined."""
        from backend.web.api.v1.admin.documents import PDF_MAGIC_BYTES
        assert PDF_MAGIC_BYTES == b"%PDF"
    def test_validation_is_case_sensitive(self):
        """Test that magic bytes validation is case-sensitive.
        %pdf (lowercase) should fail - PDF magic bytes are uppercase.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        lowercase_pdf = b"%pdf-1.4\nfake content"
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(lowercase_pdf)
        assert "Invalid PDF file" in str(exc_info.value)
 class TestDocumentUploadWithMagicBytesValidation:
    """Integration tests for document upload with magic bytes validation."""
    @pytest.fixture
    def storage_config(self, tmp_path):
        """Create a StorageConfig for testing."""
        return StorageConfig(
            upload_dir=tmp_path / "uploads",
            result_dir=tmp_path / "results",
            max_file_size_mb=50,
        )
    @pytest.fixture
    def valid_pdf_content(self) -> bytes:
        """Create valid PDF content."""
        return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"
    @pytest.fixture
    def invalid_pdf_content(self) -> bytes:
        """Create invalid PDF content (executable header)."""
        return b"MZ\x90\x00\x03\x00\x00\x00"
    def test_upload_valid_pdf_succeeds(
        self, storage_config, valid_pdf_content
    ):
        """Test that uploading a valid PDF with correct magic bytes succeeds."""
        router = create_documents_router(storage_config)
        # Find the upload endpoint (path includes prefix /admin/documents)
        upload_route = None
        for route in router.routes:
            if hasattr(route, 'methods') and 'POST' in route.methods:
                if route.path == "/admin/documents":
                    upload_route = route
                    break
        assert upload_route is not None, "Upload route should exist"
        # Validate that valid PDF content passes validation
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        validate_pdf_magic_bytes(valid_pdf_content)  # Should not raise
    def test_upload_invalid_pdf_returns_400(
        self, storage_config, invalid_pdf_content
    ):
        """Test that uploading an invalid PDF returns HTTP 400.
        The error message should clearly indicate the PDF header is invalid.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        # Simulate what the upload endpoint should do
        try:
            validate_pdf_magic_bytes(invalid_pdf_content)
            pytest.fail("Should have raised ValueError for invalid PDF")
        except ValueError as e:
            # The endpoint should convert this to HTTP 400
            assert "Invalid PDF file" in str(e)
            assert "valid PDF header" in str(e)
    def test_upload_empty_pdf_returns_400(self, storage_config):
        """Test that uploading an empty file returns HTTP 400."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        empty_content = b""
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(empty_content)
        assert "Invalid PDF file" in str(exc_info.value)
 class TestNonPDFFileValidation:
    """Tests to ensure non-PDF files are not affected by magic bytes validation."""
    def test_png_files_skip_pdf_validation(self):
        """Test that PNG files do not go through PDF magic bytes validation.
        Only files with .pdf extension should be validated for PDF magic bytes.
        """
        # PNG magic bytes
        png_content = b"\x89PNG\r\n\x1a\n"
        file_ext = ".png"
        # PNG files should not be validated with PDF magic bytes check
        # The validation should only apply to .pdf files
        assert file_ext != ".pdf"
    def test_jpg_files_skip_pdf_validation(self):
        """Test that JPG files do not go through PDF magic bytes validation."""
        # JPEG magic bytes
        jpg_content = b"\xff\xd8\xff\xe0"
        file_ext = ".jpg"
        assert file_ext != ".pdf"