# invoice-master-poc-v2/packages/inference/inference/web/api/v1/admin/documents.py

"""
Admin Document Routes

FastAPI endpoints for admin document management.
"""

import logging
from pathlib import Path
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, File, HTTPException, Query, UploadFile

from inference.web.config import DEFAULT_DPI, StorageConfig
from inference.web.core.auth import AdminTokenDep, AdminDBDep
from inference.web.schemas.admin import (
    AnnotationItem,
    AnnotationSource,
    AutoLabelStatus,
    BoundingBox,
    DocumentDetailResponse,
    DocumentItem,
    DocumentListResponse,
    DocumentStatus,
    DocumentStatsResponse,
    DocumentUploadResponse,
    ModelMetrics,
    TrainingHistoryItem,
)
from inference.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)


def _validate_uuid(value: str, name: str = "ID") -> None:
    """Validate UUID format."""
    try:
        UUID(value)
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {name} format. Must be a valid UUID.",
        )


def _convert_pdf_to_images(
    document_id: str, content: bytes, page_count: int, images_dir: Path, dpi: int
) -> None:
    """Render the first ``page_count`` pages of a PDF to PNG files.

    Images are written to ``images_dir / document_id`` as ``page_<n>.png``
    where ``n`` is the 1-based page number. The directory is created if it
    does not exist.

    Args:
        document_id: Name of the per-document sub-directory.
        content: Raw PDF bytes.
        page_count: Number of pages to render, starting from the first.
        images_dir: Root directory holding per-document image folders.
        dpi: Target resolution; pages are scaled by ``dpi / 72``.
    """
    import fitz

    doc_images_dir = images_dir / document_id
    doc_images_dir.mkdir(parents=True, exist_ok=True)

    # Render at configured DPI for consistency with training.
    # The matrix is the same for every page, so build it once.
    mat = fitz.Matrix(dpi / 72, dpi / 72)

    pdf_doc = fitz.open(stream=content, filetype="pdf")
    try:
        for page_num in range(page_count):
            pix = pdf_doc[page_num].get_pixmap(matrix=mat)
            image_path = doc_images_dir / f"page_{page_num + 1}.png"
            pix.save(str(image_path))
    finally:
        # Release the document even when rendering or saving a page fails.
        pdf_doc.close()


def create_documents_router(storage_config: StorageConfig) -> APIRouter:
    """Create admin documents router."""
    router = APIRouter(prefix="/admin/documents", tags=["Admin Documents"])

    # Directories are created by StorageConfig.__post_init__
    allowed_extensions = storage_config.allowed_extensions

    @router.post(
        "",
        response_model=DocumentUploadResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid file"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Upload document",
        description="Upload a PDF or image document for labeling.",
    )
    async def upload_document(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        file: UploadFile = File(..., description="PDF or image file"),
        auto_label: Annotated[
            bool,
            Query(description="Trigger auto-labeling after upload"),
        ] = True,
        group_key: Annotated[
            str | None,
            Query(description="Optional group key for document organization", max_length=255),
        ] = None,
    ) -> DocumentUploadResponse:
        """Upload a document for labeling.

        Validates the request, creates the document record, writes the file
        to ``storage_config.admin_upload_dir`` as ``<document_id><ext>``,
        renders PDF pages to images and optionally marks the document as
        auto-labeling.

        Args:
            admin_token: Admin token dependency; not read in the body.
            db: Admin database dependency.
            file: Uploaded file; its extension must be one of
                ``storage_config.allowed_extensions``.
            auto_label: When true, the document status is set to
                ``auto_labeling`` with auto-label status ``running``.
            group_key: Optional grouping key, at most 255 characters.

        Returns:
            Summary of the stored document.

        Raises:
            HTTPException: 400 when the group key is too long, the filename
                is missing, the extension is not allowed or the upload cannot
                be read; 500 when the file cannot be written to disk.
        """
        # Validate group_key length (also enforced by Query(max_length=255))
        if group_key and len(group_key) > 255:
            raise HTTPException(
                status_code=400,
                detail="Group key must be 255 characters or less",
            )

        # Validate filename
        if not file.filename:
            raise HTTPException(status_code=400, detail="Filename is required")

        # Validate extension
        file_ext = Path(file.filename).suffix.lower()
        if file_ext not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {file_ext}. "
                f"Allowed: {', '.join(allowed_extensions)}",
            )

        # Read file content
        try:
            content = await file.read()
        except Exception as e:
            logger.error("Failed to read uploaded file: %s", e)
            raise HTTPException(status_code=400, detail="Failed to read file") from e

        # Get page count (for PDF); best effort, falls back to a single page
        page_count = 1
        if file_ext == ".pdf":
            try:
                import fitz
                pdf_doc = fitz.open(stream=content, filetype="pdf")
                try:
                    page_count = len(pdf_doc)
                finally:
                    # Close the handle even if reading the page count fails.
                    pdf_doc.close()
            except Exception as e:
                logger.warning("Failed to get PDF page count: %s", e)

        # Create document record (token only used for auth, not stored)
        document_id = db.create_document(
            filename=file.filename,
            file_size=len(content),
            content_type=file.content_type or "application/octet-stream",
            file_path="",  # Will update after saving
            page_count=page_count,
            group_key=group_key,
        )

        # Save file to admin uploads
        file_path = storage_config.admin_upload_dir / f"{document_id}{file_ext}"
        try:
            file_path.write_bytes(content)
        except Exception as e:
            logger.error("Failed to save file: %s", e)
            # Roll back the record created above so a failed upload does not
            # leave a document row with an empty file_path behind.
            try:
                db.delete_document(document_id)
                file_path.unlink(missing_ok=True)
            except Exception as cleanup_error:
                logger.error(
                    "Failed to clean up document %s after save error: %s",
                    document_id,
                    cleanup_error,
                )
            raise HTTPException(status_code=500, detail="Failed to save file") from e

        # Update file path in database
        from inference.data.database import get_session_context
        from inference.data.admin_models import AdminDocument
        with get_session_context() as session:
            doc = session.get(AdminDocument, UUID(document_id))
            if doc:
                doc.file_path = str(file_path)
                session.add(doc)

        # Convert PDF to images for annotation; a failure is logged and the
        # upload still succeeds.
        if file_ext == ".pdf":
            try:
                _convert_pdf_to_images(
                    document_id, content, page_count,
                    storage_config.admin_images_dir, storage_config.dpi
                )
            except Exception as e:
                logger.error("Failed to convert PDF to images: %s", e)

        # Trigger auto-labeling if requested
        auto_label_started = False
        if auto_label:
            # Auto-labeling will be triggered by a background task
            db.update_document_status(
                document_id=document_id,
                status="auto_labeling",
                auto_label_status="running",
            )
            auto_label_started = True

        return DocumentUploadResponse(
            document_id=document_id,
            filename=file.filename,
            file_size=len(content),
            page_count=page_count,
            status=DocumentStatus.AUTO_LABELING if auto_label_started else DocumentStatus.PENDING,
            group_key=group_key,
            auto_label_started=auto_label_started,
            message="Document uploaded successfully",
        )

    @router.get(
        "",
        response_model=DocumentListResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="List documents",
        description="List all documents for the current admin.",
    )
    async def list_documents(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        status: Annotated[
            str | None,
            Query(description="Filter by status"),
        ] = None,
        upload_source: Annotated[
            str | None,
            Query(description="Filter by upload source (ui or api)"),
        ] = None,
        has_annotations: Annotated[
            bool | None,
            Query(description="Filter by annotation presence"),
        ] = None,
        auto_label_status: Annotated[
            str | None,
            Query(description="Filter by auto-label status"),
        ] = None,
        batch_id: Annotated[
            str | None,
            Query(description="Filter by batch ID"),
        ] = None,
        limit: Annotated[
            int,
            Query(ge=1, le=100, description="Page size"),
        ] = 20,
        offset: Annotated[
            int,
            Query(ge=0, description="Offset"),
        ] = 0,
    ) -> DocumentListResponse:
        """Return one page of documents matching the optional filters.

        Args:
            admin_token: Admin token passed to the document query.
            db: Admin database dependency.
            status: Optional status filter.
            upload_source: Optional upload source filter (``ui`` or ``api``).
            has_annotations: Optional annotation-presence filter.
            auto_label_status: Optional auto-label status filter.
            batch_id: Optional batch filter, forwarded unvalidated.
            limit: Page size (1-100).
            offset: Number of documents to skip.

        Returns:
            The page of documents plus the total reported by the query.

        Raises:
            HTTPException: 400 when ``status``, ``upload_source`` or
                ``auto_label_status`` holds a value outside its allowed set.
        """
        from datetime import datetime, timezone

        # Each enumerated filter is checked only when a non-empty value is given.
        enumerated_filters = (
            ("status", status, ("pending", "auto_labeling", "labeled", "exported")),
            ("upload_source", upload_source, ("ui", "api")),
            ("auto_label_status", auto_label_status, ("pending", "running", "completed", "failed")),
        )
        for label, supplied, allowed in enumerated_filters:
            if supplied and supplied not in allowed:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid {label}: {supplied}",
                )

        records, total = db.get_documents_by_token(
            admin_token=admin_token,
            status=status,
            upload_source=upload_source,
            has_annotations=has_annotations,
            auto_label_status=auto_label_status,
            batch_id=batch_id,
            limit=limit,
            offset=offset,
        )

        def _build_item(record) -> DocumentItem:
            """Convert one document record into its list representation."""
            # One annotation lookup per document; only the count is used.
            record_annotations = db.get_annotations_for_document(str(record.document_id))

            # A document is annotatable unless it carries a lock that has
            # not expired yet.
            lock_until = getattr(record, "annotation_lock_until", None)
            if lock_until:
                unlocked = lock_until < datetime.now(timezone.utc)
            else:
                unlocked = True

            raw_auto_status = record.auto_label_status
            raw_batch_id = getattr(record, "batch_id", None)

            return DocumentItem(
                document_id=str(record.document_id),
                filename=record.filename,
                file_size=record.file_size,
                page_count=record.page_count,
                status=DocumentStatus(record.status),
                auto_label_status=AutoLabelStatus(raw_auto_status) if raw_auto_status else None,
                annotation_count=len(record_annotations),
                upload_source=getattr(record, "upload_source", "ui"),
                batch_id=str(raw_batch_id) if raw_batch_id else None,
                group_key=getattr(record, "group_key", None),
                can_annotate=unlocked,
                created_at=record.created_at,
                updated_at=record.updated_at,
            )

        return DocumentListResponse(
            total=total,
            limit=limit,
            offset=offset,
            documents=[_build_item(record) for record in records],
        )

    @router.get(
        "/stats",
        response_model=DocumentStatsResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Get document statistics",
        description="Get document count by status.",
    )
    async def get_document_stats(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> DocumentStatsResponse:
        """Return document counts per status plus their overall total.

        Statuses missing from the counts returned by the database are
        reported as zero; the total sums every returned count.
        """
        status_counts = db.count_documents_by_status(admin_token)

        reported_statuses = ("pending", "auto_labeling", "labeled", "exported")
        per_status = {name: status_counts.get(name, 0) for name in reported_statuses}

        return DocumentStatsResponse(
            total=sum(status_counts.values()),
            **per_status,
        )

    @router.get(
        "/{document_id}",
        response_model=DocumentDetailResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Get document detail",
        description="Get document details with annotations.",
    )
    async def get_document(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> DocumentDetailResponse:
        """Get document details with annotations, image URLs and training history.

        Args:
            document_id: Document identifier; must be a valid UUID string.
            admin_token: Admin token used to look up the document.
            db: Admin database dependency.

        Returns:
            The document's metadata, its annotations, one image URL per page
            and the training tasks it is linked to.

        Raises:
            HTTPException: 400 when ``document_id`` is not a UUID; 404 when
                no document is found for this id and token.
        """
        from datetime import datetime, timezone

        _validate_uuid(document_id, "document_id")

        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Get annotations
        raw_annotations = db.get_annotations_for_document(document_id)
        annotations = [
            AnnotationItem(
                annotation_id=str(ann.annotation_id),
                page_number=ann.page_number,
                class_id=ann.class_id,
                class_name=ann.class_name,
                bbox=BoundingBox(
                    x=ann.bbox_x,
                    y=ann.bbox_y,
                    width=ann.bbox_width,
                    height=ann.bbox_height,
                ),
                normalized_bbox={
                    "x_center": ann.x_center,
                    "y_center": ann.y_center,
                    "width": ann.width,
                    "height": ann.height,
                },
                text_value=ann.text_value,
                confidence=ann.confidence,
                source=AnnotationSource(ann.source),
                created_at=ann.created_at,
            )
            for ann in raw_annotations
        ]

        # Generate image URLs (pages are numbered from 1)
        image_urls = [
            f"/api/v1/admin/documents/{document_id}/images/{page}"
            for page in range(1, document.page_count + 1)
        ]

        # Determine if document can be annotated (not locked)
        can_annotate = True
        annotation_lock_until = getattr(document, "annotation_lock_until", None) or None
        if annotation_lock_until:
            can_annotate = annotation_lock_until < datetime.now(timezone.utc)

        # Get CSV field values if available
        csv_field_values = getattr(document, "csv_field_values", None) or None

        # Get training history (Phase 5)
        training_history = []
        for link in db.get_document_training_tasks(document.document_id):
            # Get task details; links whose task cannot be loaded are skipped
            task = db.get_training_task(str(link.task_id))
            if not task:
                continue

            # Build metrics. Compare against None rather than relying on
            # truthiness: a recorded score of 0.0 is falsy and would
            # otherwise be reported as "no metrics".
            metric_values = (task.metrics_mAP, task.metrics_precision, task.metrics_recall)
            metrics = None
            if any(value is not None for value in metric_values):
                metrics = ModelMetrics(
                    mAP=task.metrics_mAP,
                    precision=task.metrics_precision,
                    recall=task.metrics_recall,
                )

            training_history.append(
                TrainingHistoryItem(
                    task_id=str(link.task_id),
                    name=task.name,
                    trained_at=link.created_at,
                    model_metrics=metrics,
                )
            )

        batch_id = getattr(document, "batch_id", None)

        return DocumentDetailResponse(
            document_id=str(document.document_id),
            filename=document.filename,
            file_size=document.file_size,
            content_type=document.content_type,
            page_count=document.page_count,
            status=DocumentStatus(document.status),
            auto_label_status=AutoLabelStatus(document.auto_label_status) if document.auto_label_status else None,
            auto_label_error=document.auto_label_error,
            upload_source=getattr(document, "upload_source", "ui"),
            batch_id=str(batch_id) if batch_id else None,
            group_key=getattr(document, "group_key", None),
            csv_field_values=csv_field_values,
            can_annotate=can_annotate,
            annotation_lock_until=annotation_lock_until,
            annotations=annotations,
            image_urls=image_urls,
            training_history=training_history,
            created_at=document.created_at,
            updated_at=document.updated_at,
        )

    @router.delete(
        "/{document_id}",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Delete document",
        description="Delete a document and its annotations.",
    )
    async def delete_document(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> dict:
        """Delete a document's stored file, page images and database record.

        Args:
            document_id: Document identifier; must be a valid UUID string.
            admin_token: Admin token used to look up the document.
            db: Admin database dependency.

        Returns:
            A dict with ``status``, ``document_id`` and ``message`` keys.

        Raises:
            HTTPException: 400 when ``document_id`` is not a UUID; 404 when
                no document is found for this id and token.
        """
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Delete file. An empty file_path would resolve to the current
        # directory, which exists but cannot be unlinked, so require a
        # non-empty path that points at a regular file.
        if document.file_path:
            file_path = Path(document.file_path)
            if file_path.is_file():
                file_path.unlink()

        # Delete images from the per-document folder under the configured
        # images directory (the same root the upload handler renders into).
        images_dir = storage_config.admin_images_dir / document_id
        if images_dir.exists():
            import shutil
            shutil.rmtree(images_dir)

        # Delete from database
        db.delete_document(document_id)

        return {
            "status": "deleted",
            "document_id": document_id,
            "message": "Document deleted successfully",
        }

    @router.patch(
        "/{document_id}/status",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Update document status",
        description="Update document status (e.g., mark as labeled). When marking as 'labeled', annotations are saved to PostgreSQL.",
    )
    async def update_document_status(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        status: Annotated[
            str,
            Query(description="New status"),
        ],
    ) -> dict:
        """Set a document's status to ``pending``, ``labeled`` or ``exported``.

        When the new status is ``labeled`` and the document has annotations,
        they are first passed to ``save_manual_annotations_to_document_db``
        (PostgreSQL documents/field_results tables, for consistency with the
        CLI auto-label workflow) and the outcome is added to the response.

        Raises:
            HTTPException: 400 for a malformed ``document_id`` or an
                unsupported status; 404 when no document is found for this
                id and token.
        """
        _validate_uuid(document_id, "document_id")

        # Only these statuses may be set through this endpoint.
        settable_statuses = ("pending", "labeled", "exported")
        if status not in settable_statuses:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid status: {status}",
            )

        # Ownership check: the lookup is scoped by the admin token.
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Marking as labeled persists the current annotations first.
        save_outcome = None
        if status == "labeled":
            from inference.web.services.db_autolabel import save_manual_annotations_to_document_db

            current_annotations = db.get_annotations_for_document(document_id)
            if current_annotations:
                save_outcome = save_manual_annotations_to_document_db(
                    document=document,
                    annotations=current_annotations,
                    db=db,
                )

        db.update_document_status(document_id, status)

        # The save result keys are only present when a save produced a
        # non-empty result.
        save_details = {}
        if save_outcome:
            save_details = {
                "document_db_saved": save_outcome.get("success", False),
                "fields_saved": save_outcome.get("fields_saved", 0),
            }

        return {
            "status": "updated",
            "document_id": document_id,
            "new_status": status,
            "message": "Document status updated",
            **save_details,
        }

    @router.patch(
        "/{document_id}/group-key",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Update document group key",
        description="Update the group key for a document.",
    )
    async def update_document_group_key(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        group_key: Annotated[
            str | None,
            Query(description="New group key (null to clear)"),
        ] = None,
    ) -> dict:
        """Replace the group key of a document.

        Args:
            document_id: Document identifier; must be a valid UUID string.
            admin_token: Admin token used to look up the document.
            db: Admin database dependency.
            group_key: New key of at most 255 characters; ``None`` is passed
                through to the database unchanged.

        Returns:
            A dict echoing the document id and the new group key.

        Raises:
            HTTPException: 400 for a malformed ``document_id`` or an
                over-long key; 404 when no document is found for this id
                and token.
        """
        _validate_uuid(document_id, "document_id")

        # Length limit is enforced here because the Query declaration has none.
        key_too_long = group_key is not None and len(group_key) > 255
        if key_too_long:
            raise HTTPException(
                status_code=400,
                detail="Group key must be 255 characters or less",
            )

        # The lookup is scoped by the admin token, so a miss covers both
        # "unknown document" and "not owned by this token".
        existing = db.get_document_by_token(document_id, admin_token)
        if existing is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        db.update_document_group_key(document_id, group_key)

        result = dict(
            status="updated",
            document_id=document_id,
            group_key=group_key,
            message="Document group key updated",
        )
        return result

    return router