Yaojia Wang
2026-01-16 23:10:01 +01:00
parent 53d1e8db25
commit 425b8fdedf
10 changed files with 653 additions and 87 deletions

@@ -34,7 +34,13 @@ def create_tables(conn):
 annotations_generated INTEGER,
 processing_time_ms REAL,
 timestamp TIMESTAMPTZ,
-errors JSONB DEFAULT '[]'
+errors JSONB DEFAULT '[]',
+-- New fields for extended CSV format
+split TEXT,
+customer_number TEXT,
+supplier_name TEXT,
+supplier_organisation_number TEXT,
+supplier_accounts TEXT
 );
 CREATE TABLE IF NOT EXISTS field_results (
@@ -56,6 +62,26 @@ def create_tables(conn):
 CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
 CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
 CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);
+-- Add new columns to existing tables if they don't exist (for migration)
+DO $$
+BEGIN
+IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='split') THEN
+ALTER TABLE documents ADD COLUMN split TEXT;
+END IF;
+IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='customer_number') THEN
+ALTER TABLE documents ADD COLUMN customer_number TEXT;
+END IF;
+IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_name') THEN
+ALTER TABLE documents ADD COLUMN supplier_name TEXT;
+END IF;
+IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_organisation_number') THEN
+ALTER TABLE documents ADD COLUMN supplier_organisation_number TEXT;
+END IF;
+IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_accounts') THEN
+ALTER TABLE documents ADD COLUMN supplier_accounts TEXT;
+END IF;
+END $$;
 """)
 conn.commit()
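
The DO block above makes create_tables safe to re-run against an already-populated database: each new column is added only when information_schema.columns shows it is missing, so fresh installs and existing deployments converge on the same schema. A minimal usage sketch, assuming psycopg2 and an illustrative DSN (neither is confirmed by this diff):

import psycopg2

# Sketch only: the DSN and the psycopg2 dependency are assumptions for illustration.
conn = psycopg2.connect("dbname=annotations user=postgres host=localhost")
create_tables(conn)  # first run: CREATE TABLE ... IF NOT EXISTS plus the five new columns
create_tables(conn)  # re-running is harmless; the conditional ALTER TABLE checks skip existing columns
conn.close()
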
@@ -82,7 +108,8 @@ def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_
 INSERT INTO documents
 (document_id, pdf_path, pdf_type, success, total_pages,
 fields_matched, fields_total, annotations_generated,
-processing_time_ms, timestamp, errors)
+processing_time_ms, timestamp, errors,
+split, customer_number, supplier_name, supplier_organisation_number, supplier_accounts)
 VALUES %s
 ON CONFLICT (document_id) DO UPDATE SET
 pdf_path = EXCLUDED.pdf_path,
@@ -94,7 +121,12 @@ def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_
 annotations_generated = EXCLUDED.annotations_generated,
 processing_time_ms = EXCLUDED.processing_time_ms,
 timestamp = EXCLUDED.timestamp,
-errors = EXCLUDED.errors
+errors = EXCLUDED.errors,
+split = EXCLUDED.split,
+customer_number = EXCLUDED.customer_number,
+supplier_name = EXCLUDED.supplier_name,
+supplier_organisation_number = EXCLUDED.supplier_organisation_number,
+supplier_accounts = EXCLUDED.supplier_accounts
 """, doc_batch)
 doc_batch = []
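
The VALUES %s placeholder together with a doc_batch list of tuples suggests the batch is flushed with psycopg2.extras.execute_values, which expands the single %s into a multi-row VALUES list. A hedged sketch of that flush step, with flush_documents as a hypothetical helper name; the column order in the INSERT must match the tuple order built below:

from psycopg2.extras import execute_values

# Hypothetical helper; the real importer may flush the batch inline.
def flush_documents(conn, upsert_sql, doc_batch):
    # upsert_sql is the INSERT ... ON CONFLICT statement shown above.
    if not doc_batch:
        return
    with conn.cursor() as cur:
        # execute_values substitutes %s with all rows in doc_batch in one statement.
        execute_values(cur, upsert_sql, doc_batch, page_size=len(doc_batch))
    conn.commit()
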
@@ -150,7 +182,13 @@ def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_
 record.get('annotations_generated'),
 record.get('processing_time_ms'),
 record.get('timestamp'),
-json.dumps(record.get('errors', []))
+json.dumps(record.get('errors', [])),
+# New fields
+record.get('split'),
+record.get('customer_number'),
+record.get('supplier_name'),
+record.get('supplier_organisation_number'),
+record.get('supplier_accounts'),
 ))
 for field in record.get('field_results', []):
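
For the new columns to be filled in, each JSONL record needs the matching top-level keys; record.get() returns None for anything absent, which psycopg2 stores as NULL. An illustrative record follows; the key names mirror the code above, and the values are invented for the example:

import json

# Invented sample line; only the key names are meaningful here.
line = ('{"document_id": "doc-001", "pdf_path": "invoices/doc-001.pdf", '
        '"success": true, "split": "train", "customer_number": "C-1024", '
        '"supplier_name": "Acme AB", "supplier_organisation_number": "556000-0000", '
        '"supplier_accounts": "BG 123-4567", "errors": []}')
record = json.loads(line)
print(record.get('split'))               # train
print(record.get('supplier_accounts'))   # BG 123-4567
print(record.get('total_pages'))         # None -> stored as NULL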