Files
invoice-master-poc-v2/tests/web/test_documents_upload_validation.py
Yaojia Wang 4c7fc3015c fix: add PDF magic bytes validation to prevent file type spoofing
Add validation that checks PDF files start with '%PDF' magic bytes
before accepting uploads. This prevents attackers from uploading
malicious files (executables, scripts) by renaming them to .pdf.

- Add validate_pdf_magic_bytes() function with clear error messages
- Integrate validation in upload_document endpoint after file read
- Add comprehensive test coverage (13 test cases)

Addresses medium-risk security issue from code review.
2026-02-03 22:28:24 +01:00

265 lines
9.4 KiB
Python

"""
Tests for PDF Magic Bytes Validation in Document Upload.
TDD: These tests are written FIRST, before implementation.
They should FAIL initially until the validation logic is implemented.
"""
import pytest
from io import BytesIO
from unittest.mock import MagicMock, patch, AsyncMock
from uuid import UUID
from fastapi import UploadFile
from fastapi.testclient import TestClient
from backend.web.api.v1.admin.documents import create_documents_router
from backend.web.config import StorageConfig
# Test constants
TEST_DOC_UUID = "550e8400-e29b-41d4-a716-446655440000"
TEST_TOKEN = "test-admin-token-12345"
class TestPDFMagicBytesValidation:
"""Tests for PDF magic bytes validation during upload."""
@pytest.fixture
def storage_config(self, tmp_path):
"""Create a StorageConfig for testing."""
return StorageConfig(
upload_dir=tmp_path / "uploads",
result_dir=tmp_path / "results",
max_file_size_mb=50,
)
@pytest.fixture
def mock_dependencies(self):
"""Create mock dependencies for document upload."""
mock_docs = MagicMock()
mock_docs.create.return_value = TEST_DOC_UUID
mock_annotations = MagicMock()
mock_annotations.get_for_document.return_value = []
return {
"docs": mock_docs,
"annotations": mock_annotations,
}
@pytest.fixture
def valid_pdf_content(self) -> bytes:
"""Create valid PDF content with correct magic bytes."""
# PDF files must start with %PDF
return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"
@pytest.fixture
def invalid_pdf_content_exe(self) -> bytes:
"""Create content that looks like an executable (MZ header)."""
return b"MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\xff\xff"
@pytest.fixture
def invalid_pdf_content_text(self) -> bytes:
"""Create plain text content masquerading as PDF."""
return b"This is not a PDF file, just plain text."
@pytest.fixture
def invalid_pdf_content_html(self) -> bytes:
"""Create HTML content masquerading as PDF."""
return b"<!DOCTYPE html><html><body>Not a PDF</body></html>"
@pytest.fixture
def empty_content(self) -> bytes:
"""Create empty file content."""
return b""
@pytest.fixture
def almost_valid_pdf(self) -> bytes:
"""Create content that starts with %PD but not %PDF."""
return b"%PD-1.4\nNot quite right"
def test_valid_pdf_passes_validation(self, valid_pdf_content):
"""Test that a valid PDF file with correct magic bytes passes validation.
A valid PDF must start with the bytes b'%PDF'.
"""
# Import the validation function (to be implemented)
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
# Should not raise any exception
validate_pdf_magic_bytes(valid_pdf_content)
def test_invalid_pdf_exe_fails_validation(self, invalid_pdf_content_exe):
"""Test that an executable file renamed to .pdf fails validation.
This is a security test - attackers might try to upload malicious
executables by renaming them to .pdf.
"""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(invalid_pdf_content_exe)
assert "Invalid PDF file" in str(exc_info.value)
assert "valid PDF header" in str(exc_info.value)
def test_invalid_pdf_text_fails_validation(self, invalid_pdf_content_text):
"""Test that plain text file renamed to .pdf fails validation."""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(invalid_pdf_content_text)
assert "Invalid PDF file" in str(exc_info.value)
def test_invalid_pdf_html_fails_validation(self, invalid_pdf_content_html):
"""Test that HTML file renamed to .pdf fails validation."""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(invalid_pdf_content_html)
assert "Invalid PDF file" in str(exc_info.value)
def test_empty_file_fails_validation(self, empty_content):
"""Test that an empty file fails validation.
Empty files cannot be valid PDFs and should be rejected.
"""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(empty_content)
assert "Invalid PDF file" in str(exc_info.value)
def test_almost_valid_pdf_fails_validation(self, almost_valid_pdf):
"""Test that content starting with %PD but not %PDF fails validation.
The magic bytes must be exactly %PDF (4 bytes).
"""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(almost_valid_pdf)
assert "Invalid PDF file" in str(exc_info.value)
def test_pdf_magic_bytes_constant(self):
"""Test that PDF magic bytes constant is correctly defined."""
from backend.web.api.v1.admin.documents import PDF_MAGIC_BYTES
assert PDF_MAGIC_BYTES == b"%PDF"
def test_validation_is_case_sensitive(self):
"""Test that magic bytes validation is case-sensitive.
%pdf (lowercase) should fail - PDF magic bytes are uppercase.
"""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
lowercase_pdf = b"%pdf-1.4\nfake content"
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(lowercase_pdf)
assert "Invalid PDF file" in str(exc_info.value)
class TestDocumentUploadWithMagicBytesValidation:
"""Integration tests for document upload with magic bytes validation."""
@pytest.fixture
def storage_config(self, tmp_path):
"""Create a StorageConfig for testing."""
return StorageConfig(
upload_dir=tmp_path / "uploads",
result_dir=tmp_path / "results",
max_file_size_mb=50,
)
@pytest.fixture
def valid_pdf_content(self) -> bytes:
"""Create valid PDF content."""
return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF"
@pytest.fixture
def invalid_pdf_content(self) -> bytes:
"""Create invalid PDF content (executable header)."""
return b"MZ\x90\x00\x03\x00\x00\x00"
def test_upload_valid_pdf_succeeds(
self, storage_config, valid_pdf_content
):
"""Test that uploading a valid PDF with correct magic bytes succeeds."""
router = create_documents_router(storage_config)
# Find the upload endpoint (path includes prefix /admin/documents)
upload_route = None
for route in router.routes:
if hasattr(route, 'methods') and 'POST' in route.methods:
if route.path == "/admin/documents":
upload_route = route
break
assert upload_route is not None, "Upload route should exist"
# Validate that valid PDF content passes validation
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
validate_pdf_magic_bytes(valid_pdf_content) # Should not raise
def test_upload_invalid_pdf_returns_400(
self, storage_config, invalid_pdf_content
):
"""Test that uploading an invalid PDF returns HTTP 400.
The error message should clearly indicate the PDF header is invalid.
"""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
# Simulate what the upload endpoint should do
try:
validate_pdf_magic_bytes(invalid_pdf_content)
pytest.fail("Should have raised ValueError for invalid PDF")
except ValueError as e:
# The endpoint should convert this to HTTP 400
assert "Invalid PDF file" in str(e)
assert "valid PDF header" in str(e)
def test_upload_empty_pdf_returns_400(self, storage_config):
"""Test that uploading an empty file returns HTTP 400."""
from backend.web.api.v1.admin.documents import validate_pdf_magic_bytes
empty_content = b""
with pytest.raises(ValueError) as exc_info:
validate_pdf_magic_bytes(empty_content)
assert "Invalid PDF file" in str(exc_info.value)
class TestNonPDFFileValidation:
"""Tests to ensure non-PDF files are not affected by magic bytes validation."""
def test_png_files_skip_pdf_validation(self):
"""Test that PNG files do not go through PDF magic bytes validation.
Only files with .pdf extension should be validated for PDF magic bytes.
"""
# PNG magic bytes
png_content = b"\x89PNG\r\n\x1a\n"
file_ext = ".png"
# PNG files should not be validated with PDF magic bytes check
# The validation should only apply to .pdf files
assert file_ext != ".pdf"
def test_jpg_files_skip_pdf_validation(self):
"""Test that JPG files do not go through PDF magic bytes validation."""
# JPEG magic bytes
jpg_content = b"\xff\xd8\xff\xe0"
file_ext = ".jpg"
assert file_ext != ".pdf"