# invoice-master-poc-v2/tests/web/test_documents_upload_validation.py

"""
Tests for PDF Magic Bytes Validation in Document Upload.

TDD: These tests are written FIRST, before implementation.
They should FAIL initially until the validation logic is implemented.
"""

import pytest
from io import BytesIO
from unittest.mock import MagicMock, patch, AsyncMock
from uuid import UUID

from fastapi import UploadFile
from fastapi.testclient import TestClient

from backend.web.api.v1.admin.documents import create_documents_router
from backend.web.config import StorageConfig


# Test constants
# Fixed document id returned by the mocked ``docs.create`` call in the
# ``mock_dependencies`` fixture.
TEST_DOC_UUID = "550e8400-e29b-41d4-a716-446655440000"
# Placeholder admin token. Not referenced in the visible part of this file.
# NOTE(review): presumably intended for authenticated requests - confirm or remove.
TEST_TOKEN = "test-admin-token-12345"


class TestPDFMagicBytesValidation:
    """Unit tests for ``validate_pdf_magic_bytes`` and ``PDF_MAGIC_BYTES``."""

    @staticmethod
    def _rejection_message(payload: bytes) -> str:
        """Validate *payload*, require a ``ValueError``, and return its text.

        The validator is imported at call time, exactly as each test in this
        class does (presumably so a not-yet-implemented validator fails the
        individual test instead of breaking collection).
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        with pytest.raises(ValueError) as raised:
            validate_pdf_magic_bytes(payload)

        return str(raised.value)

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def storage_config(self, tmp_path):
        """Build a StorageConfig rooted in pytest's temporary directory."""
        upload_root = tmp_path / "uploads"
        result_root = tmp_path / "results"
        return StorageConfig(
            upload_dir=upload_root,
            result_dir=result_root,
            max_file_size_mb=50,
        )

    @pytest.fixture
    def mock_dependencies(self):
        """Provide mocked ``docs`` and ``annotations`` collaborators."""
        documents_mock = MagicMock()
        documents_mock.create.return_value = TEST_DOC_UUID

        annotations_mock = MagicMock()
        annotations_mock.get_for_document.return_value = []

        return dict(docs=documents_mock, annotations=annotations_mock)

    @pytest.fixture
    def valid_pdf_content(self) -> bytes:
        """Minimal payload that begins with the ``%PDF`` signature."""
        return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"

    @pytest.fixture
    def invalid_pdf_content_exe(self) -> bytes:
        """Payload beginning with an executable-style ``MZ`` header."""
        return b"MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\xff\xff"

    @pytest.fixture
    def invalid_pdf_content_text(self) -> bytes:
        """Plain text pretending to be a PDF."""
        return b"This is not a PDF file, just plain text."

    @pytest.fixture
    def invalid_pdf_content_html(self) -> bytes:
        """HTML markup pretending to be a PDF."""
        return b"<!DOCTYPE html><html><body>Not a PDF</body></html>"

    @pytest.fixture
    def empty_content(self) -> bytes:
        """Zero-length payload."""
        return b""

    @pytest.fixture
    def almost_valid_pdf(self) -> bytes:
        """Payload that begins with ``%PD`` but lacks the final ``F``."""
        return b"%PD-1.4\nNot quite right"

    # ------------------------------------------------------------------
    # Accepting path
    # ------------------------------------------------------------------

    def test_valid_pdf_passes_validation(self, valid_pdf_content):
        """A payload starting with ``b'%PDF'`` is accepted without error."""
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        # Passing means simply returning; any exception fails the test.
        validate_pdf_magic_bytes(valid_pdf_content)

    # ------------------------------------------------------------------
    # Rejecting paths
    # ------------------------------------------------------------------

    def test_invalid_pdf_exe_fails_validation(self, invalid_pdf_content_exe):
        """An executable renamed to .pdf is rejected.

        Security-motivated: a malicious binary could be uploaded under a
        .pdf name, so the header itself must be checked.
        """
        message = self._rejection_message(invalid_pdf_content_exe)

        assert "Invalid PDF file" in message
        assert "valid PDF header" in message

    def test_invalid_pdf_text_fails_validation(self, invalid_pdf_content_text):
        """Plain text renamed to .pdf is rejected."""
        message = self._rejection_message(invalid_pdf_content_text)

        assert "Invalid PDF file" in message

    def test_invalid_pdf_html_fails_validation(self, invalid_pdf_content_html):
        """HTML renamed to .pdf is rejected."""
        message = self._rejection_message(invalid_pdf_content_html)

        assert "Invalid PDF file" in message

    def test_empty_file_fails_validation(self, empty_content):
        """A zero-length file is rejected; it cannot be a valid PDF."""
        message = self._rejection_message(empty_content)

        assert "Invalid PDF file" in message

    def test_almost_valid_pdf_fails_validation(self, almost_valid_pdf):
        """A ``%PD`` prefix is not enough; all four bytes ``%PDF`` are needed."""
        message = self._rejection_message(almost_valid_pdf)

        assert "Invalid PDF file" in message

    def test_pdf_magic_bytes_constant(self):
        """``PDF_MAGIC_BYTES`` is exactly the four bytes ``%PDF``."""
        from backend.web.api.v1.admin.documents import PDF_MAGIC_BYTES

        expected_signature = b"%PDF"
        assert PDF_MAGIC_BYTES == expected_signature

    def test_validation_is_case_sensitive(self):
        """A lowercase ``%pdf`` header is rejected; the check is case-sensitive."""
        message = self._rejection_message(b"%pdf-1.4\nfake content")

        assert "Invalid PDF file" in message


class TestDocumentUploadWithMagicBytesValidation:
    """Upload-oriented tests for PDF magic bytes validation.

    NOTE(review): despite the ``succeeds`` / ``returns_400`` names, no HTTP
    request is sent anywhere in this class. The tests confirm that the upload
    route is registered and that ``validate_pdf_magic_bytes`` accepts or
    rejects the payloads; the translation of ``ValueError`` into an HTTP 400
    response is not exercised here and still needs an endpoint-level test.
    """

    @pytest.fixture
    def storage_config(self, tmp_path):
        """Create a StorageConfig whose directories live under ``tmp_path``."""
        return StorageConfig(
            upload_dir=tmp_path / "uploads",
            result_dir=tmp_path / "results",
            max_file_size_mb=50,
        )

    @pytest.fixture
    def valid_pdf_content(self) -> bytes:
        """Return a minimal payload that starts with ``%PDF``."""
        return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"

    @pytest.fixture
    def invalid_pdf_content(self) -> bytes:
        """Return a payload that starts with an executable-style ``MZ`` header."""
        return b"MZ\x90\x00\x03\x00\x00\x00"

    def test_upload_valid_pdf_succeeds(
        self, storage_config, valid_pdf_content
    ):
        """Check the upload route exists and a valid PDF passes the validator.

        Only route registration and the validator are verified; the endpoint
        itself is not invoked.
        """
        router = create_documents_router(storage_config)

        # Find the POST route at /admin/documents (path includes the prefix).
        # ``getattr(..., None) or ()`` tolerates routes that have no
        # ``methods`` attribute as well as routes where it is ``None``.
        upload_route = next(
            (
                route
                for route in router.routes
                if "POST" in (getattr(route, "methods", None) or ())
                and route.path == "/admin/documents"
            ),
            None,
        )

        assert upload_route is not None, "Upload route should exist"

        # Validate that valid PDF content passes validation
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
        validate_pdf_magic_bytes(valid_pdf_content)  # Should not raise

    def test_upload_invalid_pdf_returns_400(
        self, storage_config, invalid_pdf_content
    ):
        """Check that a non-PDF payload is rejected with a clear message.

        The validator must raise ``ValueError`` mentioning the invalid header;
        the endpoint is expected to convert that into HTTP 400, but that
        conversion is not asserted here.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        # Same idiom as the other rejection tests in this file: pytest.raises
        # fails the test by itself when no ValueError is raised.
        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(invalid_pdf_content)

        message = str(exc_info.value)
        assert "Invalid PDF file" in message
        assert "valid PDF header" in message

    def test_upload_empty_pdf_returns_400(self, storage_config):
        """Check that an empty payload is rejected by the validator.

        As above, only the ``ValueError`` is asserted, not the HTTP status.
        """
        from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes

        empty_content = b""

        with pytest.raises(ValueError) as exc_info:
            validate_pdf_magic_bytes(empty_content)

        assert "Invalid PDF file" in str(exc_info.value)


class TestNonPDFFileValidation:
    """Sanity checks that image uploads fall outside the PDF signature check.

    NOTE(review): these tests do not call the upload endpoint or the
    validator. They only pin two facts about the sample inputs: the
    extension is not ``.pdf`` and the payload does not start with ``%PDF``.
    Together these show why running the PDF check on such files would
    wrongly reject them. An endpoint-level test is still needed to prove
    the check is actually skipped.
    """

    def test_png_files_skip_pdf_validation(self):
        """A PNG sample has neither a .pdf extension nor a PDF signature.

        Only files with .pdf extension should be validated for PDF magic bytes.
        """
        # PNG magic bytes
        png_content = b"\x89PNG\r\n\x1a\n"
        file_ext = ".png"

        # The PDF check is keyed on the extension, so a PNG must not match it.
        assert file_ext != ".pdf"
        # The payload would fail a %PDF prefix check, so it must be skipped.
        assert not png_content.startswith(b"%PDF")

    def test_jpg_files_skip_pdf_validation(self):
        """A JPEG sample has neither a .pdf extension nor a PDF signature."""
        # JPEG magic bytes
        jpg_content = b"\xff\xd8\xff\xe0"
        file_ext = ".jpg"

        assert file_ext != ".pdf"
        assert not jpg_content.startswith(b"%PDF")