Files
invoice-master-poc-v2/packages/shared/shared/storage/prefixes.py
Yaojia Wang a516de4320 WIP
2026-02-01 00:08:40 +01:00

159 lines
4.5 KiB
Python

"""
Storage path prefixes for unified file organization.
Provides standardized path prefixes for organizing files within
the storage backend, ensuring consistent structure across
local, Azure Blob, and S3 storage.
"""
from dataclasses import dataclass
@dataclass(frozen=True)
class StoragePrefixes:
"""Standardized storage path prefixes.
All paths are relative to the storage backend root.
These prefixes ensure consistent file organization across
all storage backends (local, Azure, S3).
Usage:
from shared.storage.prefixes import PREFIXES
path = f"{PREFIXES.DOCUMENTS}/{document_id}.pdf"
storage.upload_bytes(content, path)
"""
# Document storage
DOCUMENTS: str = "documents"
"""Original document files (PDFs, etc.)"""
IMAGES: str = "images"
"""Page images extracted from documents"""
# Processing directories
UPLOADS: str = "uploads"
"""Temporary upload staging area"""
RESULTS: str = "results"
"""Inference results and visualizations"""
EXPORTS: str = "exports"
"""Exported datasets and annotations"""
# Training data
DATASETS: str = "datasets"
"""Training dataset files"""
MODELS: str = "models"
"""Trained model weights and checkpoints"""
# Data pipeline directories (legacy compatibility)
RAW_PDFS: str = "raw_pdfs"
"""Raw PDF files for auto-labeling pipeline"""
STRUCTURED_DATA: str = "structured_data"
"""CSV/structured data for matching"""
ADMIN_IMAGES: str = "admin_images"
"""Admin UI page images"""
@staticmethod
def document_path(document_id: str, extension: str = ".pdf") -> str:
"""Get path for a document file.
Args:
document_id: Unique document identifier.
extension: File extension (include leading dot).
Returns:
Storage path like "documents/abc123.pdf"
"""
ext = extension if extension.startswith(".") else f".{extension}"
return f"{PREFIXES.DOCUMENTS}/{document_id}{ext}"
@staticmethod
def image_path(document_id: str, page_num: int, extension: str = ".png") -> str:
"""Get path for a page image file.
Args:
document_id: Unique document identifier.
page_num: Page number (1-indexed).
extension: File extension (include leading dot).
Returns:
Storage path like "images/abc123/page_1.png"
"""
ext = extension if extension.startswith(".") else f".{extension}"
return f"{PREFIXES.IMAGES}/{document_id}/page_{page_num}{ext}"
@staticmethod
def upload_path(filename: str, subfolder: str | None = None) -> str:
"""Get path for a temporary upload file.
Args:
filename: Original filename.
subfolder: Optional subfolder (e.g., "async").
Returns:
Storage path like "uploads/filename.pdf" or "uploads/async/filename.pdf"
"""
if subfolder:
return f"{PREFIXES.UPLOADS}/{subfolder}/{filename}"
return f"{PREFIXES.UPLOADS}/{filename}"
@staticmethod
def result_path(filename: str) -> str:
"""Get path for a result file.
Args:
filename: Result filename.
Returns:
Storage path like "results/filename.json"
"""
return f"{PREFIXES.RESULTS}/{filename}"
@staticmethod
def export_path(export_id: str, filename: str) -> str:
"""Get path for an export file.
Args:
export_id: Unique export identifier.
filename: Export filename.
Returns:
Storage path like "exports/abc123/filename.zip"
"""
return f"{PREFIXES.EXPORTS}/{export_id}/{filename}"
@staticmethod
def dataset_path(dataset_id: str, filename: str) -> str:
"""Get path for a dataset file.
Args:
dataset_id: Unique dataset identifier.
filename: Dataset filename.
Returns:
Storage path like "datasets/abc123/filename.yaml"
"""
return f"{PREFIXES.DATASETS}/{dataset_id}/{filename}"
@staticmethod
def model_path(version: str, filename: str) -> str:
"""Get path for a model file.
Args:
version: Model version string.
filename: Model filename.
Returns:
Storage path like "models/v1.0.0/best.pt"
"""
return f"{PREFIXES.MODELS}/{version}/{filename}"
# Default module-level instance for convenient access. It is constructed
# with no arguments, so every prefix field keeps its declared default value.
PREFIXES: StoragePrefixes = StoragePrefixes()