#!/usr/bin/env python3
"""
Import existing JSONL report files into PostgreSQL database.

Usage:
    python -m src.cli.import_report_to_db --report "reports/autolabel_report_v4*.jsonl"
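    # Sketch of the other flags (all defined in main() below); the --db value is
    # only a placeholder connection string, not the project's actual database:
    python -m src.cli.import_report_to_db --no-skip --batch-size 5000 \
        --db "postgresql://user:password@localhost:5432/mydb"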
"""

import argparse
import json
import sys
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
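# The src.config import below relies on the sys.path tweak above, which is why
# it is not grouped with the imports at the top of the file.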
from src.config import get_db_connection_string, PATHS


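# Schema overview (DDL below): one row per document in `documents` and one row
# per field-match attempt in `field_results`, linked via document_id with
# ON DELETE CASCADE.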
def create_tables(conn):
    """Create database tables."""
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_id TEXT PRIMARY KEY,
                pdf_path TEXT,
                pdf_type TEXT,
                success BOOLEAN,
                total_pages INTEGER,
                fields_matched INTEGER,
                fields_total INTEGER,
                annotations_generated INTEGER,
                processing_time_ms REAL,
                timestamp TIMESTAMPTZ,
                errors JSONB DEFAULT '[]',
                -- New fields for extended CSV format
                split TEXT,
                customer_number TEXT,
                supplier_name TEXT,
                supplier_organisation_number TEXT,
                supplier_accounts TEXT
            );

            CREATE TABLE IF NOT EXISTS field_results (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
                field_name TEXT,
                csv_value TEXT,
                matched BOOLEAN,
                score REAL,
                matched_text TEXT,
                candidate_used TEXT,
                bbox JSONB,
                page_no INTEGER,
                context_keywords JSONB DEFAULT '[]',
                error TEXT
            );

            CREATE INDEX IF NOT EXISTS idx_documents_success ON documents(success);
            CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
            CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
            CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);

            -- Add new columns to existing tables if they don't exist (for migration)
            DO $$
            BEGIN
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='split') THEN
                    ALTER TABLE documents ADD COLUMN split TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='customer_number') THEN
                    ALTER TABLE documents ADD COLUMN customer_number TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_name') THEN
                    ALTER TABLE documents ADD COLUMN supplier_name TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_organisation_number') THEN
                    ALTER TABLE documents ADD COLUMN supplier_organisation_number TEXT;
                END IF;
                IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_accounts') THEN
                    ALTER TABLE documents ADD COLUMN supplier_accounts TEXT;
                END IF;
            END $$;
        """)
    conn.commit()


def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_size: int = 1000) -> dict:
    """Import a single JSONL file into database."""
    stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    # Get existing document IDs if skipping
    existing_ids = set()
    if skip_existing:
        with conn.cursor() as cursor:
            cursor.execute("SELECT document_id FROM documents")
            existing_ids = {row[0] for row in cursor.fetchall()}

    doc_batch = []
    field_batch = []

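    # Note on re-imports: flush_batches() upserts documents (ON CONFLICT DO UPDATE)
    # but inserts field_results without deduplication, so re-importing an existing
    # document with skip_existing=False (--no-skip) leaves its old field rows in place.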
    def flush_batches():
        nonlocal doc_batch, field_batch
        if doc_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO documents
                        (document_id, pdf_path, pdf_type, success, total_pages,
                         fields_matched, fields_total, annotations_generated,
                         processing_time_ms, timestamp, errors,
                         split, customer_number, supplier_name, supplier_organisation_number, supplier_accounts)
                    VALUES %s
                    ON CONFLICT (document_id) DO UPDATE SET
                        pdf_path = EXCLUDED.pdf_path,
                        pdf_type = EXCLUDED.pdf_type,
                        success = EXCLUDED.success,
                        total_pages = EXCLUDED.total_pages,
                        fields_matched = EXCLUDED.fields_matched,
                        fields_total = EXCLUDED.fields_total,
                        annotations_generated = EXCLUDED.annotations_generated,
                        processing_time_ms = EXCLUDED.processing_time_ms,
                        timestamp = EXCLUDED.timestamp,
                        errors = EXCLUDED.errors,
                        split = EXCLUDED.split,
                        customer_number = EXCLUDED.customer_number,
                        supplier_name = EXCLUDED.supplier_name,
                        supplier_organisation_number = EXCLUDED.supplier_organisation_number,
                        supplier_accounts = EXCLUDED.supplier_accounts
                """, doc_batch)
            doc_batch = []

        if field_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO field_results
                        (document_id, field_name, csv_value, matched, score,
                         matched_text, candidate_used, bbox, page_no, context_keywords, error)
                    VALUES %s
                """, field_batch)
            field_batch = []

        conn.commit()

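    # Stream the report line by line: each JSONL line is one document record,
    # batched and flushed to the database every `batch_size` documents.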
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" Warning: Line {line_no} - JSON parse error: {e}")
                stats['errors'] += 1
                continue

            doc_id = record.get('document_id')
            if not doc_id:
                stats['errors'] += 1
                continue

            # Only import successful documents
            if not record.get('success'):
                stats['skipped'] += 1
                continue

            # Check if already exists
            if skip_existing and doc_id in existing_ids:
                stats['skipped'] += 1
                continue

            # Add to batch
            doc_batch.append((
                doc_id,
                record.get('pdf_path'),
                record.get('pdf_type'),
                record.get('success'),
                record.get('total_pages'),
                record.get('fields_matched'),
                record.get('fields_total'),
                record.get('annotations_generated'),
                record.get('processing_time_ms'),
                record.get('timestamp'),
                json.dumps(record.get('errors', [])),
                # New fields
                record.get('split'),
                record.get('customer_number'),
                record.get('supplier_name'),
                record.get('supplier_organisation_number'),
                record.get('supplier_accounts'),
            ))

            for field in record.get('field_results', []):
                field_batch.append((
                    doc_id,
                    field.get('field_name'),
                    field.get('csv_value'),
                    field.get('matched'),
                    field.get('score'),
                    field.get('matched_text'),
                    field.get('candidate_used'),
                    json.dumps(field.get('bbox')) if field.get('bbox') else None,
                    field.get('page_no'),
                    json.dumps(field.get('context_keywords', [])),
                    field.get('error')
                ))

            stats['imported'] += 1
            existing_ids.add(doc_id)

            # Flush batch if needed
            if len(doc_batch) >= batch_size:
                flush_batches()
                print(f" Processed {stats['imported'] + stats['skipped']} records...")

    # Final flush
    flush_batches()

    return stats


def main():
    parser = argparse.ArgumentParser(description='Import JSONL reports to PostgreSQL database')
    parser.add_argument('--report', type=str, default=f"{PATHS['reports_dir']}/autolabel_report*.jsonl",
                        help='Report file path or glob pattern')
    parser.add_argument('--db', type=str, default=None,
                        help='PostgreSQL connection string (uses config.py if not specified)')
    parser.add_argument('--no-skip', action='store_true',
                        help='Do not skip existing documents (replace them)')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for bulk inserts')
    args = parser.parse_args()

    # Use config if db not specified
    db_connection = args.db or get_db_connection_string()

    # Find report files
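    # (glob patterns are expanded only in the final path component, via parent.glob(name))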
    report_path = Path(args.report)
    if '*' in str(report_path) or '?' in str(report_path):
        parent = report_path.parent
        pattern = report_path.name
        report_files = sorted(parent.glob(pattern))
    else:
        report_files = [report_path] if report_path.exists() else []

    if not report_files:
        print(f"No report files found: {args.report}")
        return

    print(f"Found {len(report_files)} report file(s)")

    # Connect to database
    conn = psycopg2.connect(db_connection)
    create_tables(conn)

    # Import each file
    total_stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    for report_file in report_files:
        print(f"\nImporting: {report_file.name}")
        stats = import_jsonl_file(conn, report_file, skip_existing=not args.no_skip, batch_size=args.batch_size)
        print(f" Imported: {stats['imported']}, Skipped: {stats['skipped']}, Errors: {stats['errors']}")

        for key in total_stats:
            total_stats[key] += stats[key]

    # Print summary
    print("\n" + "=" * 50)
    print("Import Complete")
    print("=" * 50)
    print(f"Total imported: {total_stats['imported']}")
    print(f"Total skipped: {total_stats['skipped']}")
    print(f"Total errors: {total_stats['errors']}")

    # Quick stats from database
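    # (these counts cover all rows in the database, not just this import run)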
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM documents")
        total_docs = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM documents WHERE success = true")
        success_docs = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM field_results")
        total_fields = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM field_results WHERE matched = true")
        matched_fields = cursor.fetchone()[0]

    conn.close()

    print(f"\nDatabase Stats:")
    print(f" Documents: {total_docs} ({success_docs} successful)")
    print(f" Field results: {total_fields} ({matched_fields} matched)")
    if total_fields > 0:
        print(f" Match rate: {matched_fields / total_fields * 100:.2f}%")


if __name__ == '__main__':
    main()