-- Training Datasets Management -- Tracks dataset-document relationships and train/val/test splits CREATE TABLE IF NOT EXISTS training_datasets ( dataset_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), name VARCHAR(255) NOT NULL, description TEXT, status VARCHAR(20) NOT NULL DEFAULT 'building', train_ratio FLOAT NOT NULL DEFAULT 0.8, val_ratio FLOAT NOT NULL DEFAULT 0.1, seed INTEGER NOT NULL DEFAULT 42, total_documents INTEGER NOT NULL DEFAULT 0, total_images INTEGER NOT NULL DEFAULT 0, total_annotations INTEGER NOT NULL DEFAULT 0, dataset_path VARCHAR(512), error_message TEXT, created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_training_datasets_status ON training_datasets(status); CREATE TABLE IF NOT EXISTS dataset_documents ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), dataset_id UUID NOT NULL REFERENCES training_datasets(dataset_id) ON DELETE CASCADE, document_id UUID NOT NULL REFERENCES admin_documents(document_id), split VARCHAR(10) NOT NULL, page_count INTEGER NOT NULL DEFAULT 0, annotation_count INTEGER NOT NULL DEFAULT 0, created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), UNIQUE(dataset_id, document_id) ); CREATE INDEX IF NOT EXISTS idx_dataset_documents_dataset ON dataset_documents(dataset_id); CREATE INDEX IF NOT EXISTS idx_dataset_documents_document ON dataset_documents(document_id); -- Add dataset_id to training_tasks ALTER TABLE training_tasks ADD COLUMN IF NOT EXISTS dataset_id UUID REFERENCES training_datasets(dataset_id); CREATE INDEX IF NOT EXISTS idx_training_tasks_dataset ON training_tasks(dataset_id);