Yaojia Wang
2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions


@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Import existing JSONL report files into PostgreSQL database.
Usage:
python -m src.cli.import_report_to_db --report "reports/autolabel_report_v4*.jsonl"
"""
import argparse
import json
import sys
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from config import get_db_connection_string, PATHS
def create_tables(conn):
    """Create database tables."""
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_id TEXT PRIMARY KEY,
                pdf_path TEXT,
                pdf_type TEXT,
                success BOOLEAN,
                total_pages INTEGER,
                fields_matched INTEGER,
                fields_total INTEGER,
                annotations_generated INTEGER,
                processing_time_ms REAL,
                timestamp TIMESTAMPTZ,
                errors JSONB DEFAULT '[]'
            );

            CREATE TABLE IF NOT EXISTS field_results (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
                field_name TEXT,
                csv_value TEXT,
                matched BOOLEAN,
                score REAL,
                matched_text TEXT,
                candidate_used TEXT,
                bbox JSONB,
                page_no INTEGER,
                context_keywords JSONB DEFAULT '[]',
                error TEXT
            );

            CREATE INDEX IF NOT EXISTS idx_documents_success ON documents(success);
            CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
            CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
            CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);
        """)
    conn.commit()

def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_size: int = 1000) -> dict:
    """Import a single JSONL file into the database."""
    stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    # Get existing document IDs if skipping
    existing_ids = set()
    if skip_existing:
        with conn.cursor() as cursor:
            cursor.execute("SELECT document_id FROM documents")
            existing_ids = {row[0] for row in cursor.fetchall()}

    doc_batch = []
    field_batch = []

    def flush_batches():
        """Write the pending document and field batches to the database and commit."""
        nonlocal doc_batch, field_batch
        if doc_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO documents
                        (document_id, pdf_path, pdf_type, success, total_pages,
                         fields_matched, fields_total, annotations_generated,
                         processing_time_ms, timestamp, errors)
                    VALUES %s
                    ON CONFLICT (document_id) DO UPDATE SET
                        pdf_path = EXCLUDED.pdf_path,
                        pdf_type = EXCLUDED.pdf_type,
                        success = EXCLUDED.success,
                        total_pages = EXCLUDED.total_pages,
                        fields_matched = EXCLUDED.fields_matched,
                        fields_total = EXCLUDED.fields_total,
                        annotations_generated = EXCLUDED.annotations_generated,
                        processing_time_ms = EXCLUDED.processing_time_ms,
                        timestamp = EXCLUDED.timestamp,
                        errors = EXCLUDED.errors
                """, doc_batch)
            doc_batch = []
        if field_batch:
            # Field results are plain inserts: re-importing an existing document
            # (e.g. with --no-skip) appends new rows rather than replacing old ones.
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO field_results
                        (document_id, field_name, csv_value, matched, score,
                         matched_text, candidate_used, bbox, page_no, context_keywords, error)
                    VALUES %s
                """, field_batch)
            field_batch = []
        conn.commit()

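    # Stream the report line by line; each JSONL line is one document record.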
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" Warning: Line {line_no} - JSON parse error: {e}")
                stats['errors'] += 1
                continue

            doc_id = record.get('document_id')
            if not doc_id:
                stats['errors'] += 1
                continue

            # Only import successful documents
            if not record.get('success'):
                stats['skipped'] += 1
                continue

            # Check if already exists
            if skip_existing and doc_id in existing_ids:
                stats['skipped'] += 1
                continue

            # Add to batch
            doc_batch.append((
                doc_id,
                record.get('pdf_path'),
                record.get('pdf_type'),
                record.get('success'),
                record.get('total_pages'),
                record.get('fields_matched'),
                record.get('fields_total'),
                record.get('annotations_generated'),
                record.get('processing_time_ms'),
                record.get('timestamp'),
                json.dumps(record.get('errors', []))
            ))

            for field in record.get('field_results', []):
                field_batch.append((
                    doc_id,
                    field.get('field_name'),
                    field.get('csv_value'),
                    field.get('matched'),
                    field.get('score'),
                    field.get('matched_text'),
                    field.get('candidate_used'),
                    json.dumps(field.get('bbox')) if field.get('bbox') else None,
                    field.get('page_no'),
                    json.dumps(field.get('context_keywords', [])),
                    field.get('error')
                ))

            stats['imported'] += 1
            existing_ids.add(doc_id)

            # Flush batch if needed
            if len(doc_batch) >= batch_size:
                flush_batches()
                print(f" Processed {stats['imported'] + stats['skipped']} records...")

    # Final flush
    flush_batches()
    return stats

def main():
    parser = argparse.ArgumentParser(description='Import JSONL reports to PostgreSQL database')
    parser.add_argument('--report', type=str, default=f"{PATHS['reports_dir']}/autolabel_report*.jsonl",
                        help='Report file path or glob pattern')
    parser.add_argument('--db', type=str, default=None,
                        help='PostgreSQL connection string (uses config.py if not specified)')
    parser.add_argument('--no-skip', action='store_true',
                        help='Do not skip existing documents (replace them)')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for bulk inserts')
    args = parser.parse_args()

    # Use config if db not specified
    db_connection = args.db or get_db_connection_string()

    # Find report files
    report_path = Path(args.report)
    if '*' in str(report_path) or '?' in str(report_path):
        parent = report_path.parent
        pattern = report_path.name
        report_files = sorted(parent.glob(pattern))
    else:
        report_files = [report_path] if report_path.exists() else []

    if not report_files:
        print(f"No report files found: {args.report}")
        return

    print(f"Found {len(report_files)} report file(s)")

    # Connect to database
    conn = psycopg2.connect(db_connection)
    create_tables(conn)

    # Import each file
    total_stats = {'imported': 0, 'skipped': 0, 'errors': 0}
    for report_file in report_files:
        print(f"\nImporting: {report_file.name}")
        stats = import_jsonl_file(conn, report_file, skip_existing=not args.no_skip, batch_size=args.batch_size)
        print(f" Imported: {stats['imported']}, Skipped: {stats['skipped']}, Errors: {stats['errors']}")
        for key in total_stats:
            total_stats[key] += stats[key]

    # Print summary
    print("\n" + "=" * 50)
    print("Import Complete")
    print("=" * 50)
    print(f"Total imported: {total_stats['imported']}")
    print(f"Total skipped: {total_stats['skipped']}")
    print(f"Total errors: {total_stats['errors']}")

    # Quick stats from database
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM documents")
        total_docs = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM documents WHERE success = true")
        success_docs = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM field_results")
        total_fields = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM field_results WHERE matched = true")
        matched_fields = cursor.fetchone()[0]

    conn.close()

    print("\nDatabase Stats:")
    print(f" Documents: {total_docs} ({success_docs} successful)")
    print(f" Field results: {total_fields} ({matched_fields} matched)")
    if total_fields > 0:
        print(f" Match rate: {matched_fields / total_fields * 100:.2f}%")


if __name__ == '__main__':
    main()