#!/usr/bin/env python3
"""
Re-process failed matches and store detailed information including OCR values,
CSV values, and source CSV filename in a new table.
"""

import argparse
import json
import glob
import os
import sys
import time
from pathlib import Path
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
from tqdm import tqdm

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from src.data.db import DocumentDB
from src.data.csv_loader import CSVLoader
from src.normalize.normalizer import normalize_field
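
# Example invocation (script name and paths are illustrative; adjust to your layout):
#   python reprocess_failed_matches.py --csv "exports/*.csv" --pdf-dir pdfs/ --workers 3 --limit 100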


def create_failed_match_table(db: DocumentDB):
    """Create the failed_match_details table."""
    conn = db.connect()
    with conn.cursor() as cursor:
        cursor.execute("""
            DROP TABLE IF EXISTS failed_match_details;

            CREATE TABLE failed_match_details (
                id SERIAL PRIMARY KEY,
                document_id TEXT NOT NULL,
                field_name TEXT NOT NULL,
                csv_value TEXT,
                csv_value_normalized TEXT,
                ocr_value TEXT,
                ocr_value_normalized TEXT,
                all_ocr_candidates JSONB,
                matched BOOLEAN DEFAULT FALSE,
                match_score REAL,
                pdf_path TEXT,
                pdf_type TEXT,
                csv_filename TEXT,
                page_no INTEGER,
                bbox JSONB,
                error TEXT,
                reprocessed_at TIMESTAMPTZ DEFAULT NOW(),

                UNIQUE(document_id, field_name)
            );

            CREATE INDEX IF NOT EXISTS idx_failed_match_document_id ON failed_match_details(document_id);
            CREATE INDEX IF NOT EXISTS idx_failed_match_field_name ON failed_match_details(field_name);
            CREATE INDEX IF NOT EXISTS idx_failed_match_csv_filename ON failed_match_details(csv_filename);
            CREATE INDEX IF NOT EXISTS idx_failed_match_matched ON failed_match_details(matched);
        """)
    conn.commit()
    print("Created table: failed_match_details")
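
# Example follow-up query once the table is populated (illustrative): surface
# near-misses where the OCR text came close to the CSV value but fell short of
# the 0.8 threshold, which usually points at normalization gaps.
#
#   SELECT document_id, field_name, csv_value, ocr_value, match_score
#   FROM failed_match_details
#   WHERE matched = false AND match_score > 0.5
#   ORDER BY match_score DESC;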


def get_failed_documents(db: DocumentDB) -> list:
    """Get all documents that have at least one failed field match."""
    conn = db.connect()
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT DISTINCT fr.document_id, d.pdf_path, d.pdf_type
            FROM field_results fr
            JOIN documents d ON fr.document_id = d.document_id
            WHERE fr.matched = false
            ORDER BY fr.document_id
        """)
        return [{'document_id': row[0], 'pdf_path': row[1], 'pdf_type': row[2]}
                for row in cursor.fetchall()]


def get_failed_fields_for_document(db: DocumentDB, doc_id: str) -> list:
    """Get all failed field results for a document."""
    conn = db.connect()
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT field_name, csv_value, error
            FROM field_results
            WHERE document_id = %s AND matched = false
        """, (doc_id,))
        return [{'field_name': row[0], 'csv_value': row[1], 'error': row[2]}
                for row in cursor.fetchall()]


# Cache for CSV data (document_id -> source CSV filename)
_csv_cache = {}


def build_csv_cache(csv_files: list):
    """Build a cache of document_id to csv_filename mapping."""
    global _csv_cache
    _csv_cache = {}

    for csv_file in csv_files:
        csv_filename = os.path.basename(csv_file)
        loader = CSVLoader(csv_file)
        for row in loader.iter_rows():
            if row.DocumentId not in _csv_cache:
                _csv_cache[row.DocumentId] = csv_filename


def find_csv_filename(doc_id: str) -> str:
    """Find which CSV file contains the document ID (None if not cached)."""
    return _csv_cache.get(doc_id)
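
# Note: _csv_cache lives only in the parent process. The worker tasks built in
# main() carry the resolved csv_filename with them, so find_csv_filename() is
# never called from inside a worker.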


def init_worker():
    """Initialize worker process."""
    import os
    import warnings
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    os.environ["GLOG_minloglevel"] = "2"
    warnings.filterwarnings("ignore")
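
# Scoring note for process_single_document below (example numbers are
# illustrative): when one normalized string contains the other, the score is
# the length ratio shorter/longer, so a 5-character CSV value found inside a
# 9-character OCR string scores 5/9, about 0.56, and an exact match scores 1.0.
# A field is only counted as matched when the best score exceeds 0.8.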


def process_single_document(args):
    """Process a single document and extract OCR values for failed fields."""
    doc_info, failed_fields, csv_filename = args
    doc_id = doc_info['document_id']
    pdf_path = doc_info['pdf_path']
    pdf_type = doc_info['pdf_type']

    results = []

    # Try to extract OCR from PDF
    try:
        if pdf_path and os.path.exists(pdf_path):
            from src.pdf import PDFDocument
            from src.ocr import OCREngine

            pdf_doc = PDFDocument(pdf_path)
            is_scanned = pdf_doc.detect_type() == "scanned"

            # Collect all OCR text blocks
            all_ocr_texts = []

            if is_scanned:
                # Use OCR for scanned PDFs
                ocr_engine = OCREngine()
                for page_no in range(pdf_doc.page_count):
                    # Render page to image
                    img = pdf_doc.render_page(page_no, dpi=150)
                    if img is None:
                        continue

                    # OCR the image
                    ocr_results = ocr_engine.extract_from_image(img)
                    for block in ocr_results:
                        all_ocr_texts.append({
                            'text': block.get('text', ''),
                            'bbox': block.get('bbox'),
                            'page_no': page_no
                        })
            else:
                # Use text extraction for text PDFs
                for page_no in range(pdf_doc.page_count):
                    tokens = list(pdf_doc.extract_text_tokens(page_no))
                    for token in tokens:
                        all_ocr_texts.append({
                            'text': token.text,
                            'bbox': token.bbox,
                            'page_no': page_no
                        })

            # For each failed field, try to find matching OCR
            for field in failed_fields:
                field_name = field['field_name']
                csv_value = field['csv_value']
                error = field['error']

                # Normalize CSV value
                csv_normalized = normalize_field(field_name, csv_value) if csv_value else None

                # Try to find best match in OCR
                best_score = 0
                best_ocr = None
                best_bbox = None
                best_page = None

                for ocr_block in all_ocr_texts:
                    ocr_text = ocr_block['text']
                    if not ocr_text:
                        continue
                    ocr_normalized = normalize_field(field_name, ocr_text)

                    # Calculate similarity
                    if csv_normalized and ocr_normalized:
                        # Exact match: equality implies containment, so test it
                        # first and stop searching.
                        if csv_normalized == ocr_normalized:
                            best_score = 1.0
                            best_ocr = ocr_text
                            best_bbox = ocr_block['bbox']
                            best_page = ocr_block['page_no']
                            break
                        # Otherwise score substring containment by length ratio
                        elif csv_normalized in ocr_normalized:
                            score = len(csv_normalized) / max(len(ocr_normalized), 1)
                            if score > best_score:
                                best_score = score
                                best_ocr = ocr_text
                                best_bbox = ocr_block['bbox']
                                best_page = ocr_block['page_no']
                        elif ocr_normalized in csv_normalized:
                            score = len(ocr_normalized) / max(len(csv_normalized), 1)
                            if score > best_score:
                                best_score = score
                                best_ocr = ocr_text
                                best_bbox = ocr_block['bbox']
                                best_page = ocr_block['page_no']

                results.append({
                    'document_id': doc_id,
                    'field_name': field_name,
                    'csv_value': csv_value,
                    'csv_value_normalized': csv_normalized,
                    'ocr_value': best_ocr,
                    'ocr_value_normalized': normalize_field(field_name, best_ocr) if best_ocr else None,
                    'all_ocr_candidates': [t['text'] for t in all_ocr_texts[:100]],  # Limit to 100
                    'matched': best_score > 0.8,
                    'match_score': best_score,
                    'pdf_path': pdf_path,
                    'pdf_type': pdf_type,
                    'csv_filename': csv_filename,
                    'page_no': best_page,
                    'bbox': list(best_bbox) if best_bbox else None,
                    'error': error
                })
        else:
            # PDF not found
            for field in failed_fields:
                results.append({
                    'document_id': doc_id,
                    'field_name': field['field_name'],
                    'csv_value': field['csv_value'],
                    'csv_value_normalized': normalize_field(field['field_name'], field['csv_value']) if field['csv_value'] else None,
                    'ocr_value': None,
                    'ocr_value_normalized': None,
                    'all_ocr_candidates': [],
                    'matched': False,
                    'match_score': 0,
                    'pdf_path': pdf_path,
                    'pdf_type': pdf_type,
                    'csv_filename': csv_filename,
                    'page_no': None,
                    'bbox': None,
                    'error': f"PDF not found: {pdf_path}"
                })

    except Exception as e:
        for field in failed_fields:
            results.append({
                'document_id': doc_id,
                'field_name': field['field_name'],
                'csv_value': field['csv_value'],
                'csv_value_normalized': None,
                'ocr_value': None,
                'ocr_value_normalized': None,
                'all_ocr_candidates': [],
                'matched': False,
                'match_score': 0,
                'pdf_path': pdf_path,
                'pdf_type': pdf_type,
                'csv_filename': csv_filename,
                'page_no': None,
                'bbox': None,
                'error': str(e)
            })

    return results
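
# Rows are upserted on (document_id, field_name), the UNIQUE key defined in
# create_failed_match_table(), so saving the same document/field twice
# overwrites the earlier row instead of duplicating it.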


def save_results_batch(db: DocumentDB, results: list):
    """Save results to failed_match_details table."""
    if not results:
        return

    conn = db.connect()
    with conn.cursor() as cursor:
        for r in results:
            cursor.execute("""
                INSERT INTO failed_match_details
                    (document_id, field_name, csv_value, csv_value_normalized,
                     ocr_value, ocr_value_normalized, all_ocr_candidates,
                     matched, match_score, pdf_path, pdf_type, csv_filename,
                     page_no, bbox, error)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (document_id, field_name) DO UPDATE SET
                    csv_value = EXCLUDED.csv_value,
                    csv_value_normalized = EXCLUDED.csv_value_normalized,
                    ocr_value = EXCLUDED.ocr_value,
                    ocr_value_normalized = EXCLUDED.ocr_value_normalized,
                    all_ocr_candidates = EXCLUDED.all_ocr_candidates,
                    matched = EXCLUDED.matched,
                    match_score = EXCLUDED.match_score,
                    pdf_path = EXCLUDED.pdf_path,
                    pdf_type = EXCLUDED.pdf_type,
                    csv_filename = EXCLUDED.csv_filename,
                    page_no = EXCLUDED.page_no,
                    bbox = EXCLUDED.bbox,
                    error = EXCLUDED.error,
                    reprocessed_at = NOW()
            """, (
                r['document_id'],
                r['field_name'],
                r['csv_value'],
                r['csv_value_normalized'],
                r['ocr_value'],
                r['ocr_value_normalized'],
                json.dumps(r['all_ocr_candidates']),
                r['matched'],
                r['match_score'],
                r['pdf_path'],
                r['pdf_type'],
                r['csv_filename'],
                r['page_no'],
                json.dumps(r['bbox']) if r['bbox'] else None,
                r['error']
            ))
    conn.commit()
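
# Overall flow of main(): build the CSV-filename cache, recreate the
# failed_match_details table, fan the failed documents out to worker
# processes, upsert results in batches of 50, then print a per-field summary.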


def main():
    parser = argparse.ArgumentParser(description='Re-process failed matches')
    parser.add_argument('--csv', required=True, help='CSV files glob pattern')
    parser.add_argument('--pdf-dir', required=True, help='PDF directory')
    parser.add_argument('--workers', type=int, default=3, help='Number of workers')
    parser.add_argument('--limit', type=int, help='Limit number of documents to process')
    args = parser.parse_args()

    # Expand CSV glob
    csv_files = sorted(glob.glob(args.csv))
    print(f"Found {len(csv_files)} CSV files")

    # Build CSV cache
    print("Building CSV filename cache...")
    build_csv_cache(csv_files)
    print(f"Cached {len(_csv_cache)} document IDs")

    # Connect to database
    db = DocumentDB()
    db.connect()

    # Create new table
    create_failed_match_table(db)

    # Get all failed documents
    print("Fetching failed documents...")
    failed_docs = get_failed_documents(db)
    print(f"Found {len(failed_docs)} documents with failed matches")

    if args.limit:
        failed_docs = failed_docs[:args.limit]
        print(f"Limited to {len(failed_docs)} documents")

    # Prepare tasks
    tasks = []
    for doc in failed_docs:
        failed_fields = get_failed_fields_for_document(db, doc['document_id'])
        csv_filename = find_csv_filename(doc['document_id'])
        if failed_fields:
            tasks.append((doc, failed_fields, csv_filename))

    print(f"Processing {len(tasks)} documents with {args.workers} workers...")

    # Process with multiprocessing
    total_results = 0
    batch_results = []
    batch_size = 50

    with ProcessPoolExecutor(max_workers=args.workers, initializer=init_worker) as executor:
        futures = {executor.submit(process_single_document, task): task[0]['document_id']
                   for task in tasks}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            doc_id = futures[future]
            try:
                results = future.result(timeout=120)
                batch_results.extend(results)
                total_results += len(results)

                # Save in batches
                if len(batch_results) >= batch_size:
                    save_results_batch(db, batch_results)
                    batch_results = []

            except TimeoutError:
                print(f"\nTimeout processing {doc_id}")
            except Exception as e:
                print(f"\nError processing {doc_id}: {e}")

    # Save remaining results
    if batch_results:
        save_results_batch(db, batch_results)

    print(f"\nDone! Saved {total_results} failed match records to failed_match_details table")

    # Show summary
    conn = db.connect()
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT field_name, COUNT(*) as total,
                   COUNT(*) FILTER (WHERE ocr_value IS NOT NULL) as has_ocr,
                   COALESCE(AVG(match_score), 0) as avg_score
            FROM failed_match_details
            GROUP BY field_name
            ORDER BY total DESC
        """)
        print("\nSummary by field:")
        print("-" * 70)
        print(f"{'Field':<35} {'Total':>8} {'Has OCR':>10} {'Avg Score':>12}")
        print("-" * 70)
        for row in cursor.fetchall():
            print(f"{row[0]:<35} {row[1]:>8} {row[2]:>10} {row[3]:>12.2f}")

    db.close()


if __name__ == '__main__':
    main()