WIP
This commit is contained in:
@@ -34,7 +34,13 @@ def create_tables(conn):
|
||||
annotations_generated INTEGER,
|
||||
processing_time_ms REAL,
|
||||
timestamp TIMESTAMPTZ,
|
||||
errors JSONB DEFAULT '[]'
|
||||
errors JSONB DEFAULT '[]',
|
||||
-- New fields for extended CSV format
|
||||
split TEXT,
|
||||
customer_number TEXT,
|
||||
supplier_name TEXT,
|
||||
supplier_organisation_number TEXT,
|
||||
supplier_accounts TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS field_results (
|
||||
@@ -56,6 +62,26 @@ def create_tables(conn):
|
||||
CREATE INDEX IF NOT EXISTS idx_field_results_document_id ON field_results(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_field_results_field_name ON field_results(field_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_field_results_matched ON field_results(matched);
|
||||
|
||||
-- Add new columns to existing tables if they don't exist (for migration)
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='split') THEN
|
||||
ALTER TABLE documents ADD COLUMN split TEXT;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='customer_number') THEN
|
||||
ALTER TABLE documents ADD COLUMN customer_number TEXT;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_name') THEN
|
||||
ALTER TABLE documents ADD COLUMN supplier_name TEXT;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_organisation_number') THEN
|
||||
ALTER TABLE documents ADD COLUMN supplier_organisation_number TEXT;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='documents' AND column_name='supplier_accounts') THEN
|
||||
ALTER TABLE documents ADD COLUMN supplier_accounts TEXT;
|
||||
END IF;
|
||||
END $$;
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
@@ -82,7 +108,8 @@ def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_
|
||||
INSERT INTO documents
|
||||
(document_id, pdf_path, pdf_type, success, total_pages,
|
||||
fields_matched, fields_total, annotations_generated,
|
||||
processing_time_ms, timestamp, errors)
|
||||
processing_time_ms, timestamp, errors,
|
||||
split, customer_number, supplier_name, supplier_organisation_number, supplier_accounts)
|
||||
VALUES %s
|
||||
ON CONFLICT (document_id) DO UPDATE SET
|
||||
pdf_path = EXCLUDED.pdf_path,
|
||||
@@ -94,7 +121,12 @@ def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_
|
||||
annotations_generated = EXCLUDED.annotations_generated,
|
||||
processing_time_ms = EXCLUDED.processing_time_ms,
|
||||
timestamp = EXCLUDED.timestamp,
|
||||
errors = EXCLUDED.errors
|
||||
errors = EXCLUDED.errors,
|
||||
split = EXCLUDED.split,
|
||||
customer_number = EXCLUDED.customer_number,
|
||||
supplier_name = EXCLUDED.supplier_name,
|
||||
supplier_organisation_number = EXCLUDED.supplier_organisation_number,
|
||||
supplier_accounts = EXCLUDED.supplier_accounts
|
||||
""", doc_batch)
|
||||
doc_batch = []
|
||||
|
||||
@@ -150,7 +182,13 @@ def import_jsonl_file(conn, jsonl_path: Path, skip_existing: bool = True, batch_
|
||||
record.get('annotations_generated'),
|
||||
record.get('processing_time_ms'),
|
||||
record.get('timestamp'),
|
||||
json.dumps(record.get('errors', []))
|
||||
json.dumps(record.get('errors', [])),
|
||||
# New fields
|
||||
record.get('split'),
|
||||
record.get('customer_number'),
|
||||
record.get('supplier_name'),
|
||||
record.get('supplier_organisation_number'),
|
||||
record.get('supplier_accounts'),
|
||||
))
|
||||
|
||||
for field in record.get('field_results', []):
|
||||
|
||||
Reference in New Issue
Block a user