fix: add PDF magic bytes validation to prevent file type spoofing
Add validation that checks that PDF files start with the '%PDF' magic bytes before uploads are accepted. This prevents attackers from uploading malicious files (executables, scripts) by simply renaming them to .pdf.

- Add validate_pdf_magic_bytes() function with clear error messages
- Integrate validation in the upload_document endpoint after the file is read
- Add comprehensive test coverage (13 test cases)

Addresses a medium-risk security issue from code review.
This commit is contained in:
@@ -39,6 +39,26 @@ from backend.web.schemas.common import ErrorResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# PDF magic bytes - all valid PDF files must start with this sequence
|
||||
PDF_MAGIC_BYTES = b"%PDF"
|
||||
|
||||
|
||||
def validate_pdf_magic_bytes(content: bytes) -> None:
    """Reject content that does not begin with the PDF file signature.

    The content is accepted only when it is non-empty and its leading
    bytes equal ``PDF_MAGIC_BYTES``. Checking the signature keeps a
    non-PDF payload from being accepted merely because it was given a
    ``.pdf`` extension.

    Args:
        content: Raw bytes of the file to check.

    Raises:
        ValueError: If ``content`` is empty or does not start with
            ``PDF_MAGIC_BYTES``.
    """
    # Emptiness is tested first so a falsy value never reaches startswith().
    has_pdf_header = bool(content) and content.startswith(PDF_MAGIC_BYTES)
    if has_pdf_header:
        return
    raise ValueError("Invalid PDF file: does not have valid PDF header")
|
||||
|
||||
|
||||
def _validate_uuid(value: str, name: str = "ID") -> None:
|
||||
"""Validate UUID format."""
|
||||
@@ -135,6 +155,14 @@ def create_documents_router(storage_config: StorageConfig) -> APIRouter:
|
||||
logger.error(f"Failed to read uploaded file: {e}")
|
||||
raise HTTPException(status_code=400, detail="Failed to read file")
|
||||
|
||||
# Validate PDF magic bytes (only for PDF files)
|
||||
if file_ext == ".pdf":
|
||||
try:
|
||||
validate_pdf_magic_bytes(content)
|
||||
except ValueError as e:
|
||||
logger.warning(f"PDF magic bytes validation failed: {e}")
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
# Get page count (for PDF)
|
||||
page_count = 1
|
||||
if file_ext == ".pdf":
|
||||
|
||||
Reference in New Issue
Block a user