fix: add PDF magic bytes validation to prevent file type spoofing

Add validation that checks that uploaded PDF files start with the
'%PDF' magic bytes before they are accepted. This prevents attackers
from uploading malicious files (executables, scripts) simply by
renaming them to .pdf.

- Add validate_pdf_magic_bytes() function with clear error messages
- Integrate validation in upload_document endpoint after file read
- Add comprehensive test coverage (13 test cases)

Addresses a medium-risk security issue raised in code review.
This commit is contained in:
Yaojia Wang
2026-02-03 22:28:24 +01:00
parent 183d3503ef
commit 4c7fc3015c
2 changed files with 292 additions and 0 deletions

View File

@@ -39,6 +39,26 @@ from backend.web.schemas.common import ErrorResponse
logger = logging.getLogger(__name__)

# Signature that must appear at the very start of an accepted PDF upload.
PDF_MAGIC_BYTES = b"%PDF"


def validate_pdf_magic_bytes(content: bytes) -> None:
    """Reject content that does not begin with the PDF signature.

    The check passes only when ``content`` is non-empty and its first
    four bytes are ``%PDF`` (0x25 0x50 0x44 0x46). It is intended to stop
    a non-PDF payload (an executable, a script) from being accepted just
    because it was given a ``.pdf`` extension.

    Args:
        content: Raw bytes of the uploaded file.

    Raises:
        ValueError: If ``content`` is empty or does not start with
            ``PDF_MAGIC_BYTES``.
    """
    # Happy path first: a non-empty payload with the right signature.
    if content and content.startswith(PDF_MAGIC_BYTES):
        return

    raise ValueError("Invalid PDF file: does not have valid PDF header")
def _validate_uuid(value: str, name: str = "ID") -> None:
"""Validate UUID format."""
@@ -135,6 +155,14 @@ def create_documents_router(storage_config: StorageConfig) -> APIRouter:
logger.error(f"Failed to read uploaded file: {e}")
raise HTTPException(status_code=400, detail="Failed to read file")
# Validate PDF magic bytes (only for PDF files)
if file_ext == ".pdf":
try:
validate_pdf_magic_bytes(content)
except ValueError as e:
logger.warning(f"PDF magic bytes validation failed: {e}")
raise HTTPException(status_code=400, detail=str(e))
# Get page count (for PDF)
page_count = 1
if file_ext == ".pdf":