#!/usr/bin/env python3
"""
Import existing JSONL report files into PostgreSQL database.

Usage:
    python -m src.cli.import_report_to_db --report "reports/autolabel_report_v4*.jsonl"
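    # Sketch of the other flags (all defined in main() below); the --db value is
    # only a placeholder connection string, not the project's actual database:
    python -m src.cli.import_report_to_db --no-skip --batch-size 5000 \
        --db "postgresql://user:password@localhost:5432/mydb"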
"""

import argparse
import json
import sys
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
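# The src.config import below relies on the sys.path tweak above, which is why
# it is not grouped with the imports at the top of the file.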
from src.config import get_db_connection_string, PATHS


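# Schema overview (DDL below): one row per document in `documents` and one row
# per field-match attempt in `field_results`, linked via document_id with
# ON DELETE CASCADE.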
def create_tables(conn):
    """Create database tables."""
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_id TEXT PRIMARY KEY,
                pdf_path TEXT,
                pdf_type TEXT,
                success BOOLEAN,
                total_pages INTEGER,
                fields_matched INTEGER,
                fields_total INTEGER,
                annotations_generated INTEGER,
                processing_time_ms REAL,
                timestamp TIMESTAMPTZ,
                errors JSONB DEFAULT '[]',
                -- New fields for extended CSV format
                split TEXT,
                customer_number TEXT,
                supplier_name TEXT,
                supplier_organisation_number TEXT,
                supplier_accounts TEXT
            );

            CREATE TABLE IF NOT EXISTS field_results (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
                field_name TEXT,
                csv_value TEXT,
                matched BOOLEAN,
                score REAL,
                matched_text TEXT,
                candidate_used TEXT,
                bbox JSONB,
                page_no INTEGER,
                context_keywords JSONB DEFAULT '[]',
                error TEXT
            );

            CREATE INDEX IF NOT EXISTS idx_documents_success ON documents(success);
            CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
            CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
            CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);

            -- Add new columns to existing tables if they don't exist (for migration)
            DO $$
            BEGIN
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='split') THEN
                    ALTER TABLE documents ADD COLUMN split TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='customer_number') THEN
                    ALTER TABLE documents ADD COLUMN customer_number TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_name') THEN
                    ALTER TABLE documents ADD COLUMN supplier_name TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_organisation_number') THEN
                    ALTER TABLE documents ADD COLUMN supplier_organisation_number TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_accounts') THEN
                    ALTER TABLE documents ADD COLUMN supplier_accounts TEXT;
                END IF;
            END $$;
        """)
    conn.commit()


def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_size: int = 1000) -> dict:
    """Import a single JSONL file into database."""
    stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    # Get existing document IDs if skipping
    existing_ids = set()
    if skip_existing:
        with conn.cursor() as cursor:
            cursor.execute("SELECT document_id FROM documents")
            existing_ids = {row[0] for row in cursor.fetchall()}

    doc_batch = []
    field_batch = []

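    # Note on re-imports: flush_batches() upserts documents (ON CONFLICT DO UPDATE)
    # but inserts field_results without deduplication, so re-importing an existing
    # document with skip_existing=False (--no-skip) leaves its old field rows in place.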
    def flush_batches():
        nonlocal doc_batch, field_batch
        if doc_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO documents
                        (document_id, pdf_path, pdf_type, success, total_pages,
                         fields_matched, fields_total, annotations_generated,
                         processing_time_ms, timestamp, errors,
                         split, customer_number, supplier_name, supplier_organisation_number, supplier_accounts)
                    VALUES %s
                    ON CONFLICT (document_id) DO UPDATE SET
                        pdf_path = EXCLUDED.pdf_path,
                        pdf_type = EXCLUDED.pdf_type,
                        success = EXCLUDED.success,
                        total_pages = EXCLUDED.total_pages,
                        fields_matched = EXCLUDED.fields_matched,
                        fields_total = EXCLUDED.fields_total,
                        annotations_generated = EXCLUDED.annotations_generated,
                        processing_time_ms = EXCLUDED.processing_time_ms,
                        timestamp = EXCLUDED.timestamp,
                        errors = EXCLUDED.errors,
                        split = EXCLUDED.split,
                        customer_number = EXCLUDED.customer_number,
                        supplier_name = EXCLUDED.supplier_name,
                        supplier_organisation_number = EXCLUDED.supplier_organisation_number,
                        supplier_accounts = EXCLUDED.supplier_accounts
                """, doc_batch)
            doc_batch = []

        if field_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO field_results
                        (document_id, field_name, csv_value, matched, score,
                         matched_text, candidate_used, bbox, page_no, context_keywords, error)
                    VALUES %s
                """, field_batch)
            field_batch = []

        conn.commit()

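    # Stream the report line by line: each JSONL line is one document record,
    # batched and flushed to the database every `batch_size` documents.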
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" Warning: Line {line_no} - JSON parse error: {e}")
                stats['errors'] += 1
                continue

            doc_id = record.get('document_id')
            if not doc_id:
                stats['errors'] += 1
                continue

            # Only import successful documents
            if not record.get('success'):
                stats['skipped'] += 1
                continue

            # Check if already exists
            if skip_existing and doc_id in existing_ids:
                stats['skipped'] += 1
                continue

            # Add to batch
            doc_batch.append((
                doc_id,
                record.get('pdf_path'),
                record.get('pdf_type'),
                record.get('success'),
                record.get('total_pages'),
                record.get('fields_matched'),
                record.get('fields_total'),
                record.get('annotations_generated'),
                record.get('processing_time_ms'),
                record.get('timestamp'),
                json.dumps(record.get('errors', [])),
                # New fields
                record.get('split'),
                record.get('customer_number'),
                record.get('supplier_name'),
                record.get('supplier_organisation_number'),
                record.get('supplier_accounts'),
            ))

            for field in record.get('field_results', []):
                field_batch.append((
                    doc_id,
                    field.get('field_name'),
                    field.get('csv_value'),
                    field.get('matched'),
                    field.get('score'),
                    field.get('matched_text'),
                    field.get('candidate_used'),
                    json.dumps(field.get('bbox')) if field.get('bbox') else None,
                    field.get('page_no'),
                    json.dumps(field.get('context_keywords', [])),
                    field.get('error')
                ))

            stats['imported'] += 1
            existing_ids.add(doc_id)

            # Flush batch if needed
            if len(doc_batch) >= batch_size:
                flush_batches()
                print(f" Processed {stats['imported'] + stats['skipped']} records...")

    # Final flush
    flush_batches()

    return stats


def main():
    parser = argparse.ArgumentParser(description='Import JSONL reports to PostgreSQL database')
    parser.add_argument('--report', type=str, default=f"{PATHS['reports_dir']}/autolabel_report*.jsonl",
                        help='Report file path or glob pattern')
    parser.add_argument('--db', type=str, default=None,
                        help='PostgreSQL connection string (uses config.py if not specified)')
    parser.add_argument('--no-skip', action='store_true',
                        help='Do not skip existing documents (replace them)')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for bulk inserts')
    args = parser.parse_args()

    # Use config if db not specified
    db_connection = args.db or get_db_connection_string()

    # Find report files
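    # (glob patterns are expanded only in the final path component, via parent.glob(name))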
    report_path = Path(args.report)
    if '*' in str(report_path) or '?' in str(report_path):
        parent = report_path.parent
        pattern = report_path.name
        report_files = sorted(parent.glob(pattern))
    else:
        report_files = [report_path] if report_path.exists() else []

    if not report_files:
        print(f"No report files found: {args.report}")
        return

    print(f"Found {len(report_files)} report file(s)")

    # Connect to database
    conn = psycopg2.connect(db_connection)
    create_tables(conn)

    # Import each file
    total_stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    for report_file in report_files:
        print(f"\nImporting: {report_file.name}")
        stats = import_jsonl_file(conn, report_file, skip_existing=not args.no_skip, batch_size=args.batch_size)
        print(f" Imported: {stats['imported']}, Skipped: {stats['skipped']}, Errors: {stats['errors']}")

        for key in total_stats:
            total_stats[key] += stats[key]

    # Print summary
    print("\n" + "=" * 50)
    print("Import Complete")
    print("=" * 50)
    print(f"Total imported: {total_stats['imported']}")
    print(f"Total skipped: {total_stats['skipped']}")
    print(f"Total errors: {total_stats['errors']}")

    # Quick stats from database
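    # (these counts cover all rows in the database, not just this import run)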
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM documents")
        total_docs = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM documents WHERE success = true")
        success_docs = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM field_results")
        total_fields = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM field_results WHERE matched = true")
        matched_fields = cursor.fetchone()[0]

    conn.close()

    print(f"\nDatabase Stats:")
    print(f" Documents: {total_docs} ({success_docs} successful)")
    print(f" Field results: {total_fields} ({matched_fields} matched)")
    if total_fields > 0:
        print(f" Match rate: {matched_fields / total_fields * 100:.2f}%")


if __name__ == '__main__':
    main()