"""
|
|
Dataset Builder Service Integration Tests
|
|
|
|
Tests DatasetBuilder with real file operations and repository interactions.
|
|
"""
|
|
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from backend.data.admin_models import AdminAnnotation, AdminDocument
|
|
from backend.data.repositories.annotation_repository import AnnotationRepository
|
|
from backend.data.repositories.dataset_repository import DatasetRepository
|
|
from backend.data.repositories.document_repository import DocumentRepository
|
|
from backend.web.services.dataset_builder import DatasetBuilder


@pytest.fixture
def dataset_builder(patched_session, temp_dataset_dir):
    """Create a DatasetBuilder with real repositories."""
    return DatasetBuilder(
        datasets_repo=DatasetRepository(),
        documents_repo=DocumentRepository(),
        annotations_repo=AnnotationRepository(),
        base_dir=temp_dataset_dir,
    )


@pytest.fixture
def admin_images_dir(temp_upload_dir):
    """Create a directory for admin images."""
    images_dir = temp_upload_dir / "admin_images"
    images_dir.mkdir(parents=True, exist_ok=True)
    return images_dir


@pytest.fixture
def documents_with_annotations(patched_session, db_session, admin_token, admin_images_dir):
    """Create five documents (two groups), each with two page images and three page-1 annotations."""
    documents = []
    doc_repo = DocumentRepository()
    ann_repo = AnnotationRepository()

    for i in range(5):
        # Create document
        doc_id = doc_repo.create(
            filename=f"invoice_{i}.pdf",
            file_size=1024,
            content_type="application/pdf",
            file_path=f"/uploads/invoice_{i}.pdf",
            page_count=2,
            category="invoice",
            group_key=f"group_{i % 2}",  # Two groups
        )

        # Create image files for each page
        doc_dir = admin_images_dir / doc_id
        doc_dir.mkdir(parents=True, exist_ok=True)

        for page in range(1, 3):
            image_path = doc_dir / f"page_{page}.png"
            # Create a minimal fake PNG (magic bytes plus zero padding)
            image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

        # Create annotations
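        # Each annotation stores both normalized YOLO coords and pixel bbox
        # values. The pixel numbers below are consistent with an assumed
        # 800x800 px page and a center-based bbox origin (0.5 * 800 = 400,
        # 0.05 * 800 = 40); the actual convention is whatever the
        # AnnotationRepository schema defines.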
        for j in range(3):
            ann_repo.create(
                document_id=doc_id,
                page_number=1,
                class_id=j,
                class_name=f"field_{j}",
                x_center=0.5,
                y_center=0.1 + j * 0.2,
                width=0.2,
                height=0.05,
                bbox_x=400,
                bbox_y=80 + j * 160,
                bbox_width=160,
                bbox_height=40,
                text_value=f"value_{j}",
                confidence=0.95,
                source="auto",
            )

        doc = doc_repo.get(doc_id)
        documents.append(doc)

    return documents


class TestDatasetBuilderBasic:
    """Tests for basic dataset building operations."""

    def test_build_dataset_creates_directory_structure(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that building creates proper directory structure."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Test Dataset")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        dataset_dir = temp_dataset_dir / str(dataset.dataset_id)

        # Check directory structure
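        # Expected layout, assuming DatasetBuilder follows the standard
        # Ultralytics YOLO convention:
        #   <dataset_id>/
        #       images/{train,val,test}/
        #       labels/{train,val,test}/
        #       data.yaml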
        assert (dataset_dir / "images" / "train").exists()
        assert (dataset_dir / "images" / "val").exists()
        assert (dataset_dir / "images" / "test").exists()
        assert (dataset_dir / "labels" / "train").exists()
        assert (dataset_dir / "labels" / "val").exists()
        assert (dataset_dir / "labels" / "test").exists()

    def test_build_dataset_copies_images(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that images are copied to dataset directory."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Image Copy Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        result = dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        dataset_dir = temp_dataset_dir / str(dataset.dataset_id)

        # Count total images across all splits
        total_images = 0
        for split in ["train", "val", "test"]:
            images = list((dataset_dir / "images" / split).glob("*.png"))
            total_images += len(images)

        # 5 docs * 2 pages = 10 images
        assert total_images == 10
        assert result["total_images"] == 10

    def test_build_dataset_generates_labels(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that YOLO label files are generated."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Label Generation Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        dataset_dir = temp_dataset_dir / str(dataset.dataset_id)

        # Count total label files
        total_labels = 0
        for split in ["train", "val", "test"]:
            labels = list((dataset_dir / "labels" / split).glob("*.txt"))
            total_labels += len(labels)

        # Same count as images
        assert total_labels == 10

    def test_build_dataset_generates_data_yaml(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that data.yaml is generated correctly."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="YAML Generation Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        dataset_dir = temp_dataset_dir / str(dataset.dataset_id)
        yaml_path = dataset_dir / "data.yaml"

        assert yaml_path.exists()

        with open(yaml_path) as f:
            data = yaml.safe_load(f)

        assert data["train"] == "images/train"
        assert data["val"] == "images/val"
        assert data["test"] == "images/test"
        assert "nc" in data
        assert "names" in data


class TestDatasetBuilderSplits:
    """Tests for train/val/test split assignment."""

    def test_split_ratio_respected(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that split ratios are approximately respected."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Split Ratio Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.6,
            val_ratio=0.2,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        # Check document assignments in database
        dataset_docs = dataset_repo.get_documents(str(dataset.dataset_id))

        splits = {"train": 0, "val": 0, "test": 0}
        for doc in dataset_docs:
            splits[doc.split] += 1

        # With 5 docs and ratios 0.6/0.2/0.2, expect ~3/1/1.
        # Due to rounding and group constraints, allow some variation.
        assert splits["train"] >= 2
        assert splits["val"] >= 1 or splits["test"] >= 1

    def test_same_seed_same_split(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that same seed produces same split."""
        dataset_repo = DatasetRepository()
        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        # Build first dataset
        dataset1 = dataset_repo.create(name="Seed Test 1")
        dataset_builder.build_dataset(
            dataset_id=str(dataset1.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=12345,
            admin_images_dir=admin_images_dir,
        )

        # Build second dataset with same seed
        dataset2 = dataset_repo.create(name="Seed Test 2")
        dataset_builder.build_dataset(
            dataset_id=str(dataset2.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=12345,
            admin_images_dir=admin_images_dir,
        )

        # Compare splits
        docs1 = {str(d.document_id): d.split for d in dataset_repo.get_documents(str(dataset1.dataset_id))}
        docs2 = {str(d.document_id): d.split for d in dataset_repo.get_documents(str(dataset2.dataset_id))}

        assert docs1 == docs2
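
        # Only the contract "same inputs + same seed -> same assignments" is
        # checked; the split algorithm itself (presumably a seeded shuffle)
        # is treated as a black box here.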


class TestDatasetBuilderDatabase:
    """Tests for database interactions."""

    def test_updates_dataset_status(
        self, dataset_builder, documents_with_annotations, admin_images_dir, patched_session
    ):
        """Test that dataset status is updated after build."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Status Update Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        updated = dataset_repo.get(str(dataset.dataset_id))

        assert updated.status == "ready"
        assert updated.total_documents == 5
        assert updated.total_images == 10
        # The fixture created 5 docs x 3 annotations = 15; only a lower
        # bound is asserted here.
        assert updated.total_annotations > 0
        assert updated.dataset_path is not None

    def test_records_document_assignments(
        self, dataset_builder, documents_with_annotations, admin_images_dir, patched_session
    ):
        """Test that document assignments are recorded in database."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Assignment Recording Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        dataset_docs = dataset_repo.get_documents(str(dataset.dataset_id))

        assert len(dataset_docs) == 5

        for doc in dataset_docs:
            assert doc.split in ["train", "val", "test"]
            assert doc.page_count > 0


class TestDatasetBuilderErrors:
    """Tests for error handling."""

    def test_fails_with_no_documents(self, dataset_builder, admin_images_dir, patched_session):
        """Test that building fails with empty document list."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Empty Docs Test")

        with pytest.raises(ValueError, match="No valid documents"):
            dataset_builder.build_dataset(
                dataset_id=str(dataset.dataset_id),
                document_ids=[],
                train_ratio=0.8,
                val_ratio=0.1,
                seed=42,
                admin_images_dir=admin_images_dir,
            )

    def test_fails_with_invalid_doc_ids(self, dataset_builder, admin_images_dir, patched_session):
        """Test that building fails with nonexistent document IDs."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Invalid IDs Test")

        fake_ids = [str(uuid4()) for _ in range(3)]

        with pytest.raises(ValueError, match="No valid documents"):
            dataset_builder.build_dataset(
                dataset_id=str(dataset.dataset_id),
                document_ids=fake_ids,
                train_ratio=0.8,
                val_ratio=0.1,
                seed=42,
                admin_images_dir=admin_images_dir,
            )

    def test_updates_status_on_failure(self, dataset_builder, admin_images_dir, patched_session):
        """Test that dataset status is set to failed on error."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Failure Status Test")

        # The empty document list must raise, and the failure must still be
        # recorded on the dataset row.
        with pytest.raises(ValueError):
            dataset_builder.build_dataset(
                dataset_id=str(dataset.dataset_id),
                document_ids=[],
                train_ratio=0.8,
                val_ratio=0.1,
                seed=42,
                admin_images_dir=admin_images_dir,
            )

        updated = dataset_repo.get(str(dataset.dataset_id))
        assert updated.status == "failed"
        assert updated.error_message is not None


class TestLabelFileFormat:
    """Tests for YOLO label file format."""

    def test_label_file_format(
        self, dataset_builder, documents_with_annotations, admin_images_dir, temp_dataset_dir, patched_session
    ):
        """Test that label files are in correct YOLO format."""
        dataset_repo = DatasetRepository()
        dataset = dataset_repo.create(name="Label Format Test")

        doc_ids = [str(d.document_id) for d in documents_with_annotations]

        dataset_builder.build_dataset(
            dataset_id=str(dataset.dataset_id),
            document_ids=doc_ids,
            train_ratio=0.8,
            val_ratio=0.1,
            seed=42,
            admin_images_dir=admin_images_dir,
        )

        dataset_dir = temp_dataset_dir / str(dataset.dataset_id)

        # Find a label file with content
        label_files = []
        for split in ["train", "val", "test"]:
            label_files.extend(list((dataset_dir / "labels" / split).glob("*.txt")))

        # Check at least one label file has correct format
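        # (a YOLO label line is "class_id x_center y_center width height",
        # with everything but class_id normalized to [0, 1];
        # e.g. "0 0.5 0.1 0.2 0.05" for the first fixture annotation)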
        found_valid_label = False
        for label_file in label_files:
            content = label_file.read_text().strip()
            if content:
                lines = content.split("\n")
                for line in lines:
                    parts = line.split()
                    assert len(parts) == 5, f"Expected 5 parts, got {len(parts)}: {line}"

                    class_id = int(parts[0])
                    x_center = float(parts[1])
                    y_center = float(parts[2])
                    width = float(parts[3])
                    height = float(parts[4])

                    assert 0 <= class_id < 10
                    assert 0 <= x_center <= 1
                    assert 0 <= y_center <= 1
                    assert 0 <= width <= 1
                    assert 0 <= height <= 1

                found_valid_label = True
                break

        assert found_valid_label, "No valid label files found"