WOP
src/cli/import_report_to_db.py (new file, 262 additions)
@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Import existing JSONL report files into PostgreSQL database.

Usage:
    python -m src.cli.import_report_to_db --report "reports/autolabel_report_v4*.jsonl"
"""

import argparse
import json
import sys
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from config import get_db_connection_string, PATHS


def create_tables(conn):
    """Create database tables."""
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_id TEXT PRIMARY KEY,
                pdf_path TEXT,
                pdf_type TEXT,
                success BOOLEAN,
                total_pages INTEGER,
                fields_matched INTEGER,
                fields_total INTEGER,
                annotations_generated INTEGER,
                processing_time_ms REAL,
                timestamp TIMESTAMPTZ,
                errors JSONB DEFAULT '[]'
            );

            CREATE TABLE IF NOT EXISTS field_results (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
                field_name TEXT,
                csv_value TEXT,
                matched BOOLEAN,
                score REAL,
                matched_text TEXT,
                candidate_used TEXT,
                bbox JSONB,
                page_no INTEGER,
                context_keywords JSONB DEFAULT '[]',
                error TEXT
            );

            CREATE INDEX IF NOT EXISTS idx_documents_success ON documents(success);
            CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
            CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
            CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);
        """)
    conn.commit()
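
# Example (illustrative, not part of the CLI): with the schema above in place,
# per-field match rates can be pulled straight from SQL:
#   SELECT field_name,
#          AVG(CASE WHEN matched THEN 1.0 ELSE 0.0 END) AS match_rate
#   FROM field_results
#   GROUP BY field_name
#   ORDER BY match_rate;
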
def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_size: int = 1000) -> dict:
    """Import a single JSONL file into database."""
    stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    # Get existing document IDs if skipping
    existing_ids = set()
    if skip_existing:
        with conn.cursor() as cursor:
            cursor.execute("SELECT document_id FROM documents")
            existing_ids = {row[0] for row in cursor.fetchall()}

    doc_batch = []
    field_batch = []

    def flush_batches():
        nonlocal doc_batch, field_batch
        if doc_batch:
            with conn.cursor() as cursor:
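                # execute_values expands doc_batch into a single multi-row INSERT;
                # ON CONFLICT upserts documents whose document_id already exists
                # (e.g. when re-running the import with --no-skip).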
                execute_values(cursor, """
                    INSERT INTO documents
                        (document_id, pdf_path, pdf_type, success, total_pages,
                         fields_matched, fields_total, annotations_generated,
                         processing_time_ms, timestamp, errors)
                    VALUES %s
                    ON CONFLICT (document_id) DO UPDATE SET
                        pdf_path = EXCLUDED.pdf_path,
                        pdf_type = EXCLUDED.pdf_type,
                        success = EXCLUDED.success,
                        total_pages = EXCLUDED.total_pages,
                        fields_matched = EXCLUDED.fields_matched,
                        fields_total = EXCLUDED.fields_total,
                        annotations_generated = EXCLUDED.annotations_generated,
                        processing_time_ms = EXCLUDED.processing_time_ms,
                        timestamp = EXCLUDED.timestamp,
                        errors = EXCLUDED.errors
                """, doc_batch)
            doc_batch = []

        if field_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO field_results
                        (document_id, field_name, csv_value, matched, score,
                         matched_text, candidate_used, bbox, page_no, context_keywords, error)
                    VALUES %s
                """, field_batch)
            field_batch = []

        conn.commit()

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"  Warning: Line {line_no} - JSON parse error: {e}")
                stats['errors'] += 1
                continue

            doc_id = record.get('document_id')
            if not doc_id:
                stats['errors'] += 1
                continue

            # Only import successful documents
            if not record.get('success'):
                stats['skipped'] += 1
                continue

            # Check if already exists
            if skip_existing and doc_id in existing_ids:
                stats['skipped'] += 1
                continue

            # Add to batch
            doc_batch.append((
                doc_id,
                record.get('pdf_path'),
                record.get('pdf_type'),
                record.get('success'),
                record.get('total_pages'),
                record.get('fields_matched'),
                record.get('fields_total'),
                record.get('annotations_generated'),
                record.get('processing_time_ms'),
                record.get('timestamp'),
                json.dumps(record.get('errors', []))
            ))
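
            # One field_results row is queued per field in the record, keyed
            # back to the parent document by doc_id.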
            for field in record.get('field_results', []):
                field_batch.append((
                    doc_id,
                    field.get('field_name'),
                    field.get('csv_value'),
                    field.get('matched'),
                    field.get('score'),
                    field.get('matched_text'),
                    field.get('candidate_used'),
                    json.dumps(field.get('bbox')) if field.get('bbox') else None,
                    field.get('page_no'),
                    json.dumps(field.get('context_keywords', [])),
                    field.get('error')
                ))

            stats['imported'] += 1
            existing_ids.add(doc_id)

            # Flush batch if needed
            if len(doc_batch) >= batch_size:
                flush_batches()
                print(f"  Processed {stats['imported'] + stats['skipped']} records...")

    # Final flush
    flush_batches()

    return stats
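
# The CLI below is a thin wrapper around the functions above; the importer can
# also be driven programmatically, e.g. (file name is illustrative only):
#   conn = psycopg2.connect(get_db_connection_string())
#   create_tables(conn)
#   stats = import_jsonl_file(conn, Path("reports/autolabel_report_v4.jsonl"))
#   conn.close()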
def main():
    parser = argparse.ArgumentParser(description='Import JSONL reports to PostgreSQL database')
    parser.add_argument('--report', type=str, default=f"{PATHS['reports_dir']}/autolabel_report*.jsonl",
                        help='Report file path or glob pattern')
    parser.add_argument('--db', type=str, default=None,
                        help='PostgreSQL connection string (uses config.py if not specified)')
    parser.add_argument('--no-skip', action='store_true',
                        help='Do not skip existing documents (replace them)')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for bulk inserts')
    args = parser.parse_args()

    # Use config if db not specified
    db_connection = args.db or get_db_connection_string()

    # Find report files
    report_path = Path(args.report)
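    # A glob pattern ('*' or '?') pulls in every matching report in sorted order;
    # a plain path imports just that one file if it exists.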
    if '*' in str(report_path) or '?' in str(report_path):
        parent = report_path.parent
        pattern = report_path.name
        report_files = sorted(parent.glob(pattern))
    else:
        report_files = [report_path] if report_path.exists() else []

    if not report_files:
        print(f"No report files found: {args.report}")
        return

    print(f"Found {len(report_files)} report file(s)")

    # Connect to database
    conn = psycopg2.connect(db_connection)
    create_tables(conn)

    # Import each file
    total_stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    for report_file in report_files:
        print(f"\nImporting: {report_file.name}")
        stats = import_jsonl_file(conn, report_file, skip_existing=not args.no_skip, batch_size=args.batch_size)
        print(f"  Imported: {stats['imported']}, Skipped: {stats['skipped']}, Errors: {stats['errors']}")

        for key in total_stats:
            total_stats[key] += stats[key]

    # Print summary
    print("\n" + "=" * 50)
    print("Import Complete")
    print("=" * 50)
    print(f"Total imported: {total_stats['imported']}")
    print(f"Total skipped: {total_stats['skipped']}")
    print(f"Total errors: {total_stats['errors']}")

    # Quick stats from database
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM documents")
        total_docs = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM documents WHERE success = true")
        success_docs = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM field_results")
        total_fields = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM field_results WHERE matched = true")
        matched_fields = cursor.fetchone()[0]

    conn.close()

    print("\nDatabase Stats:")
    print(f"  Documents: {total_docs} ({success_docs} successful)")
    print(f"  Field results: {total_fields} ({matched_fields} matched)")
    if total_fields > 0:
        print(f"  Match rate: {matched_fields / total_fields * 100:.2f}%")


if __name__ == '__main__':
    main()