#!/usr/bin/env python3
"""
Import existing JSONL report files into PostgreSQL database.

Usage:
    python -m src.cli.import_report_to_db --report "reports/autolabel_report_v4*.jsonl"
"""

import argparse
import json
import sys
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from src.config import get_db_connection_string, PATHS


def create_tables(conn):
    """Create database tables."""
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_id TEXT PRIMARY KEY,
                pdf_path TEXT,
                pdf_type TEXT,
                success BOOLEAN,
                total_pages INTEGER,
                fields_matched INTEGER,
                fields_total INTEGER,
                annotations_generated INTEGER,
                processing_time_ms REAL,
                timestamp TIMESTAMPTZ,
                errors JSONB DEFAULT '[]',
                -- New fields for extended CSV format
                split TEXT,
                customer_number TEXT,
                supplier_name TEXT,
                supplier_organisation_number TEXT,
                supplier_accounts TEXT
            );

            CREATE TABLE IF NOT EXISTS field_results (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
                field_name TEXT,
                csv_value TEXT,
                matched BOOLEAN,
                score REAL,
                matched_text TEXT,
                candidate_used TEXT,
                bbox JSONB,
                page_no INTEGER,
                context_keywords JSONB DEFAULT '[]',
                error TEXT
            );

            CREATE INDEX IF NOT EXISTS idx_documents_success ON documents(success);
            CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
            CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
            CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);

            -- Add new columns to existing tables if they don't exist (for migration)
            DO $$
            BEGIN
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                               WHERE table_name='documents' AND column_name='split') THEN
                    ALTER TABLE documents ADD COLUMN split TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                               WHERE table_name='documents' AND column_name='customer_number') THEN
                    ALTER TABLE documents ADD COLUMN customer_number TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                               WHERE table_name='documents' AND column_name='supplier_name') THEN
                    ALTER TABLE documents ADD COLUMN supplier_name TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                               WHERE table_name='documents' AND column_name='supplier_organisation_number') THEN
                    ALTER TABLE documents ADD COLUMN supplier_organisation_number TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                               WHERE table_name='documents' AND column_name='supplier_accounts') THEN
                    ALTER TABLE documents ADD COLUMN supplier_accounts TEXT;
                END IF;
            END $$;
        """)
    conn.commit()
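
# For reference, a sketch of the JSONL record shape this importer reads.
# The key names are exactly the ones consumed by import_jsonl_file() below;
# every value here is an illustrative placeholder, not taken from a real report:
#
#   {
#     "document_id": "...", "pdf_path": "...", "pdf_type": "...",
#     "success": true, "total_pages": 1,
#     "fields_matched": 0, "fields_total": 0, "annotations_generated": 0,
#     "processing_time_ms": 0.0, "timestamp": "...", "errors": [],
#     "split": "...", "customer_number": "...", "supplier_name": "...",
#     "supplier_organisation_number": "...", "supplier_accounts": "...",
#     "field_results": [
#       {"field_name": "...", "csv_value": "...", "matched": true, "score": 0.0,
#        "matched_text": "...", "candidate_used": "...", "bbox": null,
#        "page_no": 0, "context_keywords": [], "error": null}
#     ]
#   }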


def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_size: int = 1000) -> dict:
    """Import a single JSONL file into the database."""
    stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    # Get existing document IDs if skipping
    existing_ids = set()
    if skip_existing:
        with conn.cursor() as cursor:
            cursor.execute("SELECT document_id FROM documents")
            existing_ids = {row[0] for row in cursor.fetchall()}

    doc_batch = []
    field_batch = []

    def flush_batches():
        nonlocal doc_batch, field_batch
        if doc_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO documents (document_id, pdf_path, pdf_type, success, total_pages,
                                           fields_matched, fields_total, annotations_generated,
                                           processing_time_ms, timestamp, errors, split,
                                           customer_number, supplier_name,
                                           supplier_organisation_number, supplier_accounts)
                    VALUES %s
                    ON CONFLICT (document_id) DO UPDATE SET
                        pdf_path = EXCLUDED.pdf_path,
                        pdf_type = EXCLUDED.pdf_type,
                        success = EXCLUDED.success,
                        total_pages = EXCLUDED.total_pages,
                        fields_matched = EXCLUDED.fields_matched,
                        fields_total = EXCLUDED.fields_total,
                        annotations_generated = EXCLUDED.annotations_generated,
                        processing_time_ms = EXCLUDED.processing_time_ms,
                        timestamp = EXCLUDED.timestamp,
                        errors = EXCLUDED.errors,
                        split = EXCLUDED.split,
                        customer_number = EXCLUDED.customer_number,
                        supplier_name = EXCLUDED.supplier_name,
                        supplier_organisation_number = EXCLUDED.supplier_organisation_number,
                        supplier_accounts = EXCLUDED.supplier_accounts
                """, doc_batch)
            doc_batch = []
        if field_batch:
            with conn.cursor() as cursor:
                # Remove field rows from any earlier import of these documents so a
                # re-import with --no-skip replaces them instead of duplicating them.
                doc_ids = list({row[0] for row in field_batch})
                cursor.execute(
                    "DELETE FROM field_results WHERE document_id = ANY(%s)",
                    (doc_ids,)
                )
                execute_values(cursor, """
                    INSERT INTO field_results (document_id, field_name, csv_value, matched, score,
                                               matched_text, candidate_used, bbox, page_no,
                                               context_keywords, error)
                    VALUES %s
                """, field_batch)
            field_batch = []
        conn.commit()

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"  Warning: Line {line_no} - JSON parse error: {e}")
                stats['errors'] += 1
                continue

            doc_id = record.get('document_id')
            if not doc_id:
                stats['errors'] += 1
                continue

            # Only import successful documents
            if not record.get('success'):
                stats['skipped'] += 1
                continue

            # Check if already exists
            if skip_existing and doc_id in existing_ids:
                stats['skipped'] += 1
                continue

            # Add to batch
            doc_batch.append((
                doc_id,
                record.get('pdf_path'),
                record.get('pdf_type'),
                record.get('success'),
                record.get('total_pages'),
                record.get('fields_matched'),
                record.get('fields_total'),
                record.get('annotations_generated'),
                record.get('processing_time_ms'),
                record.get('timestamp'),
                json.dumps(record.get('errors', [])),
                # New fields
                record.get('split'),
                record.get('customer_number'),
                record.get('supplier_name'),
                record.get('supplier_organisation_number'),
                record.get('supplier_accounts'),
            ))

            for field in record.get('field_results', []):
                field_batch.append((
                    doc_id,
                    field.get('field_name'),
                    field.get('csv_value'),
                    field.get('matched'),
                    field.get('score'),
                    field.get('matched_text'),
                    field.get('candidate_used'),
                    json.dumps(field.get('bbox')) if field.get('bbox') else None,
                    field.get('page_no'),
                    json.dumps(field.get('context_keywords', [])),
                    field.get('error')
                ))

            stats['imported'] += 1
            existing_ids.add(doc_id)

            # Flush batch if needed
            if len(doc_batch) >= batch_size:
                flush_batches()
                print(f"  Processed {stats['imported'] + stats['skipped']} records...")

    # Final flush
    flush_batches()

    return stats


def main():
    parser = argparse.ArgumentParser(description='Import JSONL reports to PostgreSQL database')
    parser.add_argument('--report', type=str,
                        default=f"{PATHS['reports_dir']}/autolabel_report*.jsonl",
                        help='Report file path or glob pattern')
    parser.add_argument('--db', type=str, default=None,
                        help='PostgreSQL connection string (uses config.py if not specified)')
    parser.add_argument('--no-skip', action='store_true',
                        help='Do not skip existing documents (replace them)')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for bulk inserts')
    args = parser.parse_args()

    # Use config if db not specified
    db_connection = args.db or get_db_connection_string()

    # Find report files
    report_path = Path(args.report)
    if '*' in str(report_path) or '?' in str(report_path):
        parent = report_path.parent
        pattern = report_path.name
        report_files = sorted(parent.glob(pattern))
    else:
        report_files = [report_path] if report_path.exists() else []

    if not report_files:
        print(f"No report files found: {args.report}")
        return

    print(f"Found {len(report_files)} report file(s)")

    # Connect to database
    conn = psycopg2.connect(db_connection)
    create_tables(conn)

    # Import each file
    total_stats = {'imported': 0, 'skipped': 0, 'errors': 0}
    for report_file in report_files:
        print(f"\nImporting: {report_file.name}")
        stats = import_jsonl_file(conn, report_file,
                                  skip_existing=not args.no_skip,
                                  batch_size=args.batch_size)
        print(f"  Imported: {stats['imported']}, Skipped: {stats['skipped']}, Errors: {stats['errors']}")
        for key in total_stats:
            total_stats[key] += stats[key]

    # Print summary
    print("\n" + "=" * 50)
    print("Import Complete")
    print("=" * 50)
    print(f"Total imported: {total_stats['imported']}")
    print(f"Total skipped: {total_stats['skipped']}")
    print(f"Total errors: {total_stats['errors']}")

    # Quick stats from database
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM documents")
        total_docs = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM documents WHERE success = true")
        success_docs = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM field_results")
        total_fields = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM field_results WHERE matched = true")
        matched_fields = cursor.fetchone()[0]

    conn.close()

    print("\nDatabase Stats:")
    print(f"  Documents: {total_docs} ({success_docs} successful)")
    print(f"  Field results: {total_fields} ({matched_fields} matched)")
    if total_fields > 0:
        print(f"  Match rate: {matched_fields / total_fields * 100:.2f}%")


if __name__ == '__main__':
    main()