40 lines
1.7 KiB
SQL
40 lines
1.7 KiB
SQL
-- Training Datasets Management
|
|
-- Tracks dataset-document relationships and train/val/test splits
|
|
|
|
CREATE TABLE IF NOT EXISTS training_datasets (
|
|
dataset_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
name VARCHAR(255) NOT NULL,
|
|
description TEXT,
|
|
status VARCHAR(20) NOT NULL DEFAULT 'building',
|
|
train_ratio FLOAT NOT NULL DEFAULT 0.8,
|
|
val_ratio FLOAT NOT NULL DEFAULT 0.1,
|
|
seed INTEGER NOT NULL DEFAULT 42,
|
|
total_documents INTEGER NOT NULL DEFAULT 0,
|
|
total_images INTEGER NOT NULL DEFAULT 0,
|
|
total_annotations INTEGER NOT NULL DEFAULT 0,
|
|
dataset_path VARCHAR(512),
|
|
error_message TEXT,
|
|
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
|
|
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_training_datasets_status ON training_datasets(status);
|
|
|
|
CREATE TABLE IF NOT EXISTS dataset_documents (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
dataset_id UUID NOT NULL REFERENCES training_datasets(dataset_id) ON DELETE CASCADE,
|
|
document_id UUID NOT NULL REFERENCES admin_documents(document_id),
|
|
split VARCHAR(10) NOT NULL,
|
|
page_count INTEGER NOT NULL DEFAULT 0,
|
|
annotation_count INTEGER NOT NULL DEFAULT 0,
|
|
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
|
|
UNIQUE(dataset_id, document_id)
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_dataset_documents_dataset ON dataset_documents(dataset_id);
|
|
CREATE INDEX IF NOT EXISTS idx_dataset_documents_document ON dataset_documents(document_id);
|
|
|
|
-- Add dataset_id to training_tasks
|
|
ALTER TABLE training_tasks ADD COLUMN IF NOT EXISTS dataset_id UUID REFERENCES training_datasets(dataset_id);
|
|
CREATE INDEX IF NOT EXISTS idx_training_tasks_dataset ON training_tasks(dataset_id);
|