Files
invoice-master-poc-v2/packages/backend/backend/web/api/v1/admin/annotations.py
Yaojia Wang b602d0a340 re-structure
2026-02-01 22:55:31 +01:00

707 lines
24 KiB
Python

"""
Admin Annotation API Routes
FastAPI endpoints for annotation management.
"""
import io
import logging
from typing import Annotated
from uuid import UUID
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse, StreamingResponse
from shared.fields import FIELD_CLASSES, FIELD_CLASS_IDS
from backend.data.repositories import DocumentRepository, AnnotationRepository
from backend.web.core.auth import AdminTokenDep
from backend.web.services.autolabel import get_auto_label_service
from backend.web.services.storage_helpers import get_storage_helper
from backend.web.schemas.admin import (
AnnotationCreate,
AnnotationItem,
AnnotationListResponse,
AnnotationOverrideRequest,
AnnotationOverrideResponse,
AnnotationResponse,
AnnotationSource,
AnnotationUpdate,
AnnotationVerifyRequest,
AnnotationVerifyResponse,
AutoLabelRequest,
AutoLabelResponse,
BoundingBox,
)
from backend.web.schemas.common import ErrorResponse
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Global repository instances
# Both start as None and are filled in on first use by get_doc_repository()
# and get_ann_repository(), which then keep returning the same instance.
_doc_repo: DocumentRepository | None = None
_ann_repo: AnnotationRepository | None = None
def get_doc_repository() -> DocumentRepository:
    """Return the shared DocumentRepository, building it on the first call."""
    global _doc_repo
    # Fast path: an instance was already created by an earlier call.
    if _doc_repo is not None:
        return _doc_repo
    _doc_repo = DocumentRepository()
    return _doc_repo
def get_ann_repository() -> AnnotationRepository:
    """Return the shared AnnotationRepository, building it on the first call."""
    global _ann_repo
    # Fast path: an instance was already created by an earlier call.
    if _ann_repo is not None:
        return _ann_repo
    _ann_repo = AnnotationRepository()
    return _ann_repo
# Type aliases for dependency injection
# Annotating an endpoint parameter with one of these aliases makes FastAPI
# supply it by calling the provider function wrapped in Depends().
DocRepoDep = Annotated[DocumentRepository, Depends(get_doc_repository)]
AnnRepoDep = Annotated[AnnotationRepository, Depends(get_ann_repository)]
def _validate_uuid(value: str, name: str = "ID") -> None:
"""Validate UUID format."""
try:
UUID(value)
except ValueError:
raise HTTPException(
status_code=400,
detail=f"Invalid {name} format. Must be a valid UUID.",
)
def create_annotation_router() -> APIRouter:
"""Create annotation API router."""
router = APIRouter(prefix="/admin/documents", tags=["Admin Annotations"])
# =========================================================================
# Image Endpoints
# =========================================================================
@router.get(
    "/{document_id}/images/{page_number}",
    response_model=None,
    responses={
        200: {"content": {"image/png": {}}, "description": "Page image"},
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Not found"},
    },
    summary="Get page image",
    description="Get the image for a specific page.",
)
async def get_page_image(
    document_id: str,
    page_number: int,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
) -> FileResponse | StreamingResponse:
    """Serve the PNG image of one page of a document.

    Args:
        document_id: Document identifier; must parse as a UUID.
        page_number: 1-based page index within the document.
        admin_token: Injected admin credential (presence enforces auth).
        doc_repo: Injected document repository.

    Returns:
        A FileResponse when the storage helper reports a local path for the
        image, otherwise a StreamingResponse wrapping the image bytes.

    Raises:
        HTTPException: 400 for a malformed ``document_id``; 404 when the
            document, the page number, or the stored image is missing.
    """
    # Local import keeps this endpoint self-contained.
    from urllib.parse import quote

    _validate_uuid(document_id, "document_id")

    # Get document
    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found",
        )

    # Validate page number
    if page_number < 1 or page_number > document.page_count:
        raise HTTPException(
            status_code=404,
            detail=f"Page {page_number} not found. Document has {document.page_count} pages.",
        )

    # Get storage helper
    storage = get_storage_helper()

    # Check if image exists
    if not storage.admin_image_exists(document_id, page_number):
        raise HTTPException(
            status_code=404,
            detail=f"Image for page {page_number} not found",
        )

    download_name = f"{document.filename}_page_{page_number}.png"

    # Try to get local path for efficient file serving
    local_path = storage.get_admin_image_local_path(document_id, page_number)
    if local_path is not None:
        return FileResponse(
            path=str(local_path),
            media_type="image/png",
            filename=download_name,
        )

    # Fall back to streaming for cloud storage
    image_content = storage.get_admin_image(document_id, page_number)

    # Interpolating the raw filename breaks the header when it contains
    # quotes or characters that cannot be carried in a header value. Names
    # that need escaping are sent in the percent-encoded ``filename*`` form;
    # plain names keep the exact header that was sent before.
    quoted_name = quote(download_name)
    if quoted_name == download_name:
        disposition = f'inline; filename="{download_name}"'
    else:
        disposition = f"inline; filename*=utf-8''{quoted_name}"

    return StreamingResponse(
        io.BytesIO(image_content),
        media_type="image/png",
        headers={"Content-Disposition": disposition},
    )
# =========================================================================
# Annotation Endpoints
# =========================================================================
@router.get(
    "/{document_id}/annotations",
    response_model=AnnotationListResponse,
    responses={
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Document not found"},
    },
    summary="List annotations",
    description="Get all annotations for a document.",
)
async def list_annotations(
    document_id: str,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
    page_number: Annotated[
        int | None,
        Query(ge=1, description="Filter by page number"),
    ] = None,
) -> AnnotationListResponse:
    """Return the annotations stored for a document.

    The optional ``page_number`` query parameter is forwarded to the
    repository as a filter. Responds with 404 when the document is unknown.
    """
    _validate_uuid(document_id, "document_id")

    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # Convert each stored record into its API representation.
    items = []
    for record in ann_repo.get_for_document(document_id, page_number):
        items.append(
            AnnotationItem(
                annotation_id=str(record.annotation_id),
                page_number=record.page_number,
                class_id=record.class_id,
                class_name=record.class_name,
                bbox=BoundingBox(
                    x=record.bbox_x,
                    y=record.bbox_y,
                    width=record.bbox_width,
                    height=record.bbox_height,
                ),
                normalized_bbox={
                    "x_center": record.x_center,
                    "y_center": record.y_center,
                    "width": record.width,
                    "height": record.height,
                },
                text_value=record.text_value,
                confidence=record.confidence,
                source=AnnotationSource(record.source),
                created_at=record.created_at,
            )
        )

    return AnnotationListResponse(
        document_id=document_id,
        page_count=document.page_count,
        total_annotations=len(items),
        annotations=items,
    )
@router.post(
    "/{document_id}/annotations",
    response_model=AnnotationResponse,
    responses={
        400: {"model": ErrorResponse, "description": "Invalid request"},
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Document not found"},
    },
    summary="Create annotation",
    description="Create a new annotation for a document.",
)
async def create_annotation(
    document_id: str,
    request: AnnotationCreate,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
) -> AnnotationResponse:
    """Create a manual annotation on one page of a document.

    The pixel bounding box from the request is stored as given, together
    with centre/size values normalized by the page image dimensions.

    Args:
        document_id: Document identifier; must parse as a UUID.
        request: Page number, class id, bounding box and optional text.
        admin_token: Injected admin credential (presence enforces auth).
        doc_repo: Injected document repository.
        ann_repo: Injected annotation repository.

    Returns:
        AnnotationResponse carrying the id returned by the repository.

    Raises:
        HTTPException: 400 for a malformed id, an out-of-range page number,
            or a page whose image is unavailable; 404 for an unknown document.
    """
    _validate_uuid(document_id, "document_id")

    # Get document
    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found",
        )

    # Validate page number (both bounds, matching the page image endpoint)
    if request.page_number < 1:
        raise HTTPException(
            status_code=400,
            detail=f"Page {request.page_number} is invalid. Page numbers start at 1.",
        )
    if request.page_number > document.page_count:
        raise HTTPException(
            status_code=400,
            detail=f"Page {request.page_number} exceeds document page count ({document.page_count})",
        )

    # Get image dimensions for normalization
    storage = get_storage_helper()
    dimensions = storage.get_admin_image_dimensions(document_id, request.page_number)
    if dimensions is None:
        raise HTTPException(
            status_code=400,
            detail=f"Image for page {request.page_number} not available",
        )
    image_width, image_height = dimensions

    # Calculate normalized coordinates (box centre and size relative to image)
    x_center = (request.bbox.x + request.bbox.width / 2) / image_width
    y_center = (request.bbox.y + request.bbox.height / 2) / image_height
    width = request.bbox.width / image_width
    height = request.bbox.height / image_height

    # Get class name, falling back to a generated label for unknown ids
    class_name = FIELD_CLASSES.get(request.class_id, f"class_{request.class_id}")

    # Create annotation
    annotation_id = ann_repo.create(
        document_id=document_id,
        page_number=request.page_number,
        class_id=request.class_id,
        class_name=class_name,
        x_center=x_center,
        y_center=y_center,
        width=width,
        height=height,
        bbox_x=request.bbox.x,
        bbox_y=request.bbox.y,
        bbox_width=request.bbox.width,
        bbox_height=request.bbox.height,
        text_value=request.text_value,
        source="manual",
    )

    # Keep status as pending - user must click "Mark Complete" to finalize
    # This allows user to add multiple annotations before saving to PostgreSQL
    return AnnotationResponse(
        annotation_id=annotation_id,
        message="Annotation created successfully",
    )
@router.patch(
    "/{document_id}/annotations/{annotation_id}",
    response_model=AnnotationResponse,
    responses={
        400: {"model": ErrorResponse, "description": "Invalid request"},
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Not found"},
    },
    summary="Update annotation",
    description="Update an existing annotation.",
)
async def update_annotation(
    document_id: str,
    annotation_id: str,
    request: AnnotationUpdate,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
) -> AnnotationResponse:
    """Apply a partial update to an annotation of the given document.

    Only fields present (non-None) in the request are changed. A new
    bounding box also refreshes the normalized coordinates, using the image
    dimensions of the annotation's page.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    if doc_repo.get(document_id) is None:
        raise HTTPException(status_code=404, detail="Document not found")

    annotation = ann_repo.get(annotation_id)
    if annotation is None:
        raise HTTPException(status_code=404, detail="Annotation not found")

    # The annotation must hang off the document named in the URL.
    if str(annotation.document_id) != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    # Collect only the fields the caller actually supplied.
    update_kwargs = {}

    new_class_id = request.class_id
    if new_class_id is not None:
        update_kwargs["class_id"] = new_class_id
        update_kwargs["class_name"] = FIELD_CLASSES.get(
            new_class_id, f"class_{new_class_id}"
        )

    if request.text_value is not None:
        update_kwargs["text_value"] = request.text_value

    box = request.bbox
    if box is not None:
        page = annotation.page_number
        dimensions = get_storage_helper().get_admin_image_dimensions(document_id, page)
        if dimensions is None:
            raise HTTPException(
                status_code=400,
                detail=f"Image for page {annotation.page_number} not available",
            )
        image_width, image_height = dimensions

        # Normalized centre/size first, then the raw pixel box.
        update_kwargs.update(
            x_center=(box.x + box.width / 2) / image_width,
            y_center=(box.y + box.height / 2) / image_height,
            width=box.width / image_width,
            height=box.height / image_height,
            bbox_x=box.x,
            bbox_y=box.y,
            bbox_width=box.width,
            bbox_height=box.height,
        )

    # Skip the repository call entirely when there is nothing to change.
    if update_kwargs and not ann_repo.update(annotation_id, **update_kwargs):
        raise HTTPException(
            status_code=500,
            detail="Failed to update annotation",
        )

    return AnnotationResponse(
        annotation_id=annotation_id,
        message="Annotation updated successfully",
    )
@router.delete(
    "/{document_id}/annotations/{annotation_id}",
    responses={
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Not found"},
    },
    summary="Delete annotation",
    description="Delete an annotation.",
)
async def delete_annotation(
    document_id: str,
    annotation_id: str,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
) -> dict:
    """Remove a single annotation that belongs to the given document.

    Responds with 404 when the document or annotation is unknown, or when
    the annotation is attached to a different document.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    if doc_repo.get(document_id) is None:
        raise HTTPException(status_code=404, detail="Document not found")

    target = ann_repo.get(annotation_id)
    if target is None:
        raise HTTPException(status_code=404, detail="Annotation not found")

    # The annotation must hang off the document named in the URL.
    owner_id = str(target.document_id)
    if owner_id != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    ann_repo.delete(annotation_id)

    payload = {
        "status": "deleted",
        "annotation_id": annotation_id,
        "message": "Annotation deleted successfully",
    }
    return payload
# =========================================================================
# Auto-Labeling Endpoints
# =========================================================================
@router.post(
    "/{document_id}/auto-label",
    response_model=AutoLabelResponse,
    responses={
        400: {"model": ErrorResponse, "description": "Invalid request"},
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Document not found"},
    },
    summary="Trigger auto-labeling",
    description="Trigger auto-labeling for a document using field values.",
)
async def trigger_auto_label(
    document_id: str,
    request: AutoLabelRequest,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
) -> AutoLabelResponse:
    """Run the auto-label service on a document's PDF.

    Requires at least one field value in the request. Responds with 500
    when the PDF cannot be located or the service reports a failed run.
    """
    _validate_uuid(document_id, "document_id")

    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")

    if not request.field_values:
        raise HTTPException(
            status_code=400,
            detail="At least one field value is required",
        )

    # document.file_path is a relative storage path like "raw_pdfs/uuid.pdf";
    # only its last segment is handed to the storage helper.
    stored_path = document.file_path
    pdf_name = stored_path.rsplit("/", 1)[-1]
    file_path = get_storage_helper().get_raw_pdf_local_path(pdf_name)
    if file_path is None:
        raise HTTPException(
            status_code=500,
            detail=f"Cannot find PDF file: {document.file_path}",
        )

    result = get_auto_label_service().auto_label_document(
        document_id=document_id,
        file_path=str(file_path),
        field_values=request.field_values,
        doc_repo=doc_repo,
        ann_repo=ann_repo,
        replace_existing=request.replace_existing,
    )

    outcome = result["status"]
    if outcome == "failed":
        raise HTTPException(
            status_code=500,
            detail=f"Auto-labeling failed: {result.get('error', 'Unknown error')}",
        )

    return AutoLabelResponse(
        document_id=document_id,
        status=outcome,
        annotations_created=result["annotations_created"],
        message=f"Auto-labeling completed. Created {result['annotations_created']} annotations.",
    )
@router.delete(
    "/{document_id}/annotations",
    responses={
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Document not found"},
    },
    summary="Delete all annotations",
    description="Delete all annotations for a document (optionally filter by source).",
)
async def delete_all_annotations(
    document_id: str,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
    source: Annotated[
        str | None,
        Query(description="Filter by source (manual, auto, imported)"),
    ] = None,
) -> dict:
    """Delete a document's annotations, optionally restricted to one source.

    Args:
        document_id: Document identifier; must parse as a UUID.
        admin_token: Injected admin credential (presence enforces auth).
        doc_repo: Injected document repository.
        ann_repo: Injected annotation repository.
        source: When given, must be "manual", "auto" or "imported".

    Returns:
        A dict with the deletion status, document id, count and message.

    Raises:
        HTTPException: 400 for a malformed id or an unrecognised ``source``
            (including an empty string); 404 for an unknown document.
    """
    _validate_uuid(document_id, "document_id")

    # Validate source. Compare against None rather than truthiness so that an
    # empty ``?source=`` is rejected instead of reaching the repository as a
    # falsy filter value.
    if source is not None and source not in ("manual", "auto", "imported"):
        raise HTTPException(
            status_code=400,
            detail=f"Invalid source: {source}",
        )

    # Get document
    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found",
        )

    # Delete annotations
    deleted_count = ann_repo.delete_for_document(document_id, source)

    # Update document status if all annotations deleted
    remaining = ann_repo.get_for_document(document_id)
    if not remaining:
        doc_repo.update_status(document_id, "pending")

    return {
        "status": "deleted",
        "document_id": document_id,
        "deleted_count": deleted_count,
        "message": f"Deleted {deleted_count} annotations",
    }
# =========================================================================
# Phase 5: Annotation Enhancement
# =========================================================================
@router.post(
    "/{document_id}/annotations/{annotation_id}/verify",
    response_model=AnnotationVerifyResponse,
    responses={
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Annotation not found"},
    },
    summary="Verify annotation",
    description="Mark an annotation as verified by a human reviewer.",
)
async def verify_annotation(
    document_id: str,
    annotation_id: str,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
    request: AnnotationVerifyRequest = AnnotationVerifyRequest(),
) -> AnnotationVerifyResponse:
    """Mark an annotation of the given document as verified.

    Args:
        document_id: Document identifier; must parse as a UUID.
        annotation_id: Annotation identifier; must parse as a UUID.
        admin_token: Injected admin credential; passed to the repository as
            the verifier.
        doc_repo: Injected document repository.
        ann_repo: Injected annotation repository.
        request: Optional request body; not read by this handler.

    Returns:
        AnnotationVerifyResponse built from the verified annotation.

    Raises:
        HTTPException: 400 for malformed ids; 404 when the document or
            annotation is unknown or the annotation belongs to another
            document.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    # Get document
    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found",
        )

    # Get existing annotation
    existing = ann_repo.get(annotation_id)
    if existing is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    # Verify annotation belongs to document, as the update and delete
    # endpoints do; otherwise any document URL could verify any annotation.
    if str(existing.document_id) != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    # Verify the annotation
    annotation = ann_repo.verify(annotation_id, admin_token)
    if annotation is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    return AnnotationVerifyResponse(
        annotation_id=annotation_id,
        is_verified=annotation.is_verified,
        verified_at=annotation.verified_at,
        verified_by=annotation.verified_by,
        message="Annotation verified successfully",
    )
@router.patch(
    "/{document_id}/annotations/{annotation_id}/override",
    response_model=AnnotationOverrideResponse,
    responses={
        401: {"model": ErrorResponse, "description": "Invalid token"},
        404: {"model": ErrorResponse, "description": "Annotation not found"},
    },
    summary="Override annotation",
    description="Override an auto-generated annotation with manual corrections.",
)
async def override_annotation(
    document_id: str,
    annotation_id: str,
    request: AnnotationOverrideRequest,
    admin_token: AdminTokenDep,
    doc_repo: DocRepoDep,
    ann_repo: AnnRepoDep,
) -> AnnotationOverrideResponse:
    """Override fields of an annotation with manual corrections.

    Args:
        document_id: Document identifier; must parse as a UUID.
        annotation_id: Annotation identifier; must parse as a UUID.
        request: Fields to override plus the change reason.
        admin_token: Injected admin credential; forwarded to the repository.
        doc_repo: Injected document repository.
        ann_repo: Injected annotation repository.

    Returns:
        AnnotationOverrideResponse describing the overridden annotation and
        the first history record returned by the repository (empty string
        when there is none).

    Raises:
        HTTPException: 400 for malformed ids or an empty update; 404 when
            the document or annotation is unknown or the annotation belongs
            to another document.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    # Get document
    document = doc_repo.get(document_id)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found",
        )

    # Get existing annotation
    existing = ann_repo.get(annotation_id)
    if existing is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    # Verify annotation belongs to document, as the update and delete
    # endpoints do; otherwise any document URL could override any annotation.
    if str(existing.document_id) != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    # Build updates dict from request
    updates = {}
    if request.text_value is not None:
        updates["text_value"] = request.text_value
    if request.class_id is not None:
        updates["class_id"] = request.class_id
        # Keep class_name in step with class_id, using the same fallback
        # label as the create/update endpoints for ids not in FIELD_CLASSES.
        updates["class_name"] = FIELD_CLASSES.get(
            request.class_id, f"class_{request.class_id}"
        )
    if request.class_name is not None:
        # An explicit class_name wins over the one derived from class_id.
        updates["class_name"] = request.class_name
    if request.bbox:
        # Update bbox fields; only the keys present in the request are copied.
        # NOTE(review): the normalized x_center/y_center/width/height are not
        # recomputed here, unlike update_annotation - confirm whether the
        # repository's override() handles that.
        for key in ("x", "y", "width", "height"):
            if key in request.bbox:
                updates[f"bbox_{key}"] = request.bbox[key]

    if not updates:
        raise HTTPException(
            status_code=400,
            detail="No updates provided. Specify at least one field to update.",
        )

    # Override the annotation
    annotation = ann_repo.override(
        annotation_id=annotation_id,
        admin_token=admin_token,
        change_reason=request.reason,
        **updates,
    )
    if annotation is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    # Get history to return history_id
    history_records = ann_repo.get_history(UUID(annotation_id))
    latest_history = history_records[0] if history_records else None

    return AnnotationOverrideResponse(
        annotation_id=annotation_id,
        source=annotation.source,
        override_source=annotation.override_source,
        original_annotation_id=str(annotation.original_annotation_id) if annotation.original_annotation_id else None,
        message="Annotation overridden successfully",
        history_id=str(latest_history.history_id) if latest_history else "",
    )
return router