Yaojia Wang
2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions


@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Import existing JSONL report files into PostgreSQL database.
Usage:
python -m src.cli.import_report_to_db --report "reports/autolabel_report_v4*.jsonl"
"""
import argparse
import json
import sys
from pathlib import Path

import psycopg2
from psycopg2.extras import execute_values

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from config import get_db_connection_string, PATHS
def create_tables(conn):
    """Create database tables."""
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_id TEXT PRIMARY KEY,
                pdf_path TEXT,
                pdf_type TEXT,
                success BOOLEAN,
                total_pages INTEGER,
                fields_matched INTEGER,
                fields_total INTEGER,
                annotations_generated INTEGER,
                processing_time_ms REAL,
                timestamp TIMESTAMPTZ,
                errors JSONB DEFAULT '[]'
            );

            CREATE TABLE IF NOT EXISTS field_results (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
                field_name TEXT,
                csv_value TEXT,
                matched BOOLEAN,
                score REAL,
                matched_text TEXT,
                candidate_used TEXT,
                bbox JSONB,
                page_no INTEGER,
                context_keywords JSONB DEFAULT '[]',
                error TEXT
            );

            CREATE INDEX IF NOT EXISTS idx_documents_success ON documents(success);
            CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
            CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
            CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);
        """)
    conn.commit()

def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_size: int = 1000) -> dict:
    """Import a single JSONL file into the database."""
    stats = {'imported': 0, 'skipped': 0, 'errors': 0}

    # Get existing document IDs if skipping
    existing_ids = set()
    if skip_existing:
        with conn.cursor() as cursor:
            cursor.execute("SELECT document_id FROM documents")
            existing_ids = {row[0] for row in cursor.fetchall()}

    doc_batch = []
    field_batch = []

    def flush_batches():
        """Write the pending document and field batches to the database and commit."""
        nonlocal doc_batch, field_batch
        if doc_batch:
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO documents
                        (document_id, pdf_path, pdf_type, success, total_pages,
                         fields_matched, fields_total, annotations_generated,
                         processing_time_ms, timestamp, errors)
                    VALUES %s
                    ON CONFLICT (document_id) DO UPDATE SET
                        pdf_path = EXCLUDED.pdf_path,
                        pdf_type = EXCLUDED.pdf_type,
                        success = EXCLUDED.success,
                        total_pages = EXCLUDED.total_pages,
                        fields_matched = EXCLUDED.fields_matched,
                        fields_total = EXCLUDED.fields_total,
                        annotations_generated = EXCLUDED.annotations_generated,
                        processing_time_ms = EXCLUDED.processing_time_ms,
                        timestamp = EXCLUDED.timestamp,
                        errors = EXCLUDED.errors
                """, doc_batch)
            doc_batch = []
        if field_batch:
            # Field results are plain inserts: re-importing an existing document
            # (e.g. with --no-skip) appends new rows rather than replacing old ones.
            with conn.cursor() as cursor:
                execute_values(cursor, """
                    INSERT INTO field_results
                        (document_id, field_name, csv_value, matched, score,
                         matched_text, candidate_used, bbox, page_no, context_keywords, error)
                    VALUES %s
                """, field_batch)
            field_batch = []
        conn.commit()

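    # Stream the report line by line; each JSONL line is one document record.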
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" Warning: Line {line_no} - JSON parse error: {e}")
                stats['errors'] += 1
                continue

            doc_id = record.get('document_id')
            if not doc_id:
                stats['errors'] += 1
                continue

            # Only import successful documents
            if not record.get('success'):
                stats['skipped'] += 1
                continue

            # Check if already exists
            if skip_existing and doc_id in existing_ids:
                stats['skipped'] += 1
                continue

            # Add to batch
            doc_batch.append((
                doc_id,
                record.get('pdf_path'),
                record.get('pdf_type'),
                record.get('success'),
                record.get('total_pages'),
                record.get('fields_matched'),
                record.get('fields_total'),
                record.get('annotations_generated'),
                record.get('processing_time_ms'),
                record.get('timestamp'),
                json.dumps(record.get('errors', []))
            ))

            for field in record.get('field_results', []):
                field_batch.append((
                    doc_id,
                    field.get('field_name'),
                    field.get('csv_value'),
                    field.get('matched'),
                    field.get('score'),
                    field.get('matched_text'),
                    field.get('candidate_used'),
                    json.dumps(field.get('bbox')) if field.get('bbox') else None,
                    field.get('page_no'),
                    json.dumps(field.get('context_keywords', [])),
                    field.get('error')
                ))

            stats['imported'] += 1
            existing_ids.add(doc_id)

            # Flush batch if needed
            if len(doc_batch) >= batch_size:
                flush_batches()
                print(f" Processed {stats['imported'] + stats['skipped']} records...")

    # Final flush
    flush_batches()
    return stats

def main():
    parser = argparse.ArgumentParser(description='Import JSONL reports to PostgreSQL database')
    parser.add_argument('--report', type=str, default=f"{PATHS['reports_dir']}/autolabel_report*.jsonl",
                        help='Report file path or glob pattern')
    parser.add_argument('--db', type=str, default=None,
                        help='PostgreSQL connection string (uses config.py if not specified)')
    parser.add_argument('--no-skip', action='store_true',
                        help='Do not skip existing documents (replace them)')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for bulk inserts')
    args = parser.parse_args()

    # Use config if db not specified
    db_connection = args.db or get_db_connection_string()

    # Find report files
    report_path = Path(args.report)
    if '*' in str(report_path) or '?' in str(report_path):
        parent = report_path.parent
        pattern = report_path.name
        report_files = sorted(parent.glob(pattern))
    else:
        report_files = [report_path] if report_path.exists() else []

    if not report_files:
        print(f"No report files found: {args.report}")
        return

    print(f"Found {len(report_files)} report file(s)")

    # Connect to database
    conn = psycopg2.connect(db_connection)
    create_tables(conn)

    # Import each file
    total_stats = {'imported': 0, 'skipped': 0, 'errors': 0}
    for report_file in report_files:
        print(f"\nImporting: {report_file.name}")
        stats = import_jsonl_file(conn, report_file, skip_existing=not args.no_skip, batch_size=args.batch_size)
        print(f" Imported: {stats['imported']}, Skipped: {stats['skipped']}, Errors: {stats['errors']}")
        for key in total_stats:
            total_stats[key] += stats[key]

    # Print summary
    print("\n" + "=" * 50)
    print("Import Complete")
    print("=" * 50)
    print(f"Total imported: {total_stats['imported']}")
    print(f"Total skipped: {total_stats['skipped']}")
    print(f"Total errors: {total_stats['errors']}")

    # Quick stats from database
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM documents")
        total_docs = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM documents WHERE success = true")
        success_docs = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM field_results")
        total_fields = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM field_results WHERE matched = true")
        matched_fields = cursor.fetchone()[0]

    conn.close()

    print("\nDatabase Stats:")
    print(f" Documents: {total_docs} ({success_docs} successful)")
    print(f" Field results: {total_fields} ({matched_fields} matched)")
    if total_fields > 0:
        print(f" Match rate: {matched_fields / total_fields * 100:.2f}%")


if __name__ == '__main__':
    main()