#!/usr/bin/env python3
"""
Label Analysis CLI

Analyzes auto-generated labels to identify failures and diagnose root causes.
Now reads from the PostgreSQL database instead of JSONL files.
"""

import argparse
import csv
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from src.config import get_db_connection_string

from ..normalize import normalize_field
from ..matcher import FieldMatcher
from ..pdf import is_text_pdf, extract_text_tokens
from ..yolo.annotation_generator import FIELD_CLASSES
from ..data.db import DocumentDB


@dataclass
class FieldAnalysis:
    """Analysis result for a single field."""
    field_name: str
    csv_value: str
    expected: bool  # True if the CSV has a value
    labeled: bool   # True if a label file has this field
    matched: bool   # True if the matcher finds it

    # Diagnosis
    failure_reason: Optional[str] = None
    details: dict = field(default_factory=dict)


@dataclass
class DocumentAnalysis:
    """Analysis result for a document."""
    doc_id: str
    pdf_exists: bool
    pdf_type: str  # "text" or "scanned"
    total_pages: int

    # Per-field analysis
    fields: list[FieldAnalysis] = field(default_factory=list)

    # Summary
    csv_fields_count: int = 0      # Fields with values in CSV
    labeled_fields_count: int = 0  # Fields in label files
    matched_fields_count: int = 0  # Fields the matcher can find

    @property
    def has_issues(self) -> bool:
        """Check if document has any labeling issues."""
        return any(f.expected and not f.labeled for f in self.fields)

    @property
    def missing_labels(self) -> list[FieldAnalysis]:
        """Get fields that should be labeled but aren't."""
        return [f for f in self.fields if f.expected and not f.labeled]


class LabelAnalyzer:
    """Analyzes labels and diagnoses failures."""

    def __init__(
        self,
        csv_path: str,
        pdf_dir: str,
        dataset_dir: str,
        use_db: bool = True
    ):
        self.csv_path = Path(csv_path)
        self.pdf_dir = Path(pdf_dir)
        self.dataset_dir = Path(dataset_dir)
        self.use_db = use_db
        self.matcher = FieldMatcher()
        self.csv_data = {}
        self.label_data = {}
        self.report_data = {}

        # Database connection
        self.db = None
        if use_db:
            self.db = DocumentDB()
            self.db.connect()

        # Class ID to name mapping
        self.class_names = list(FIELD_CLASSES.keys())

    def load_csv(self):
        """Load CSV data."""
        with open(self.csv_path, 'r', encoding='utf-8-sig') as f:
            reader = csv.DictReader(f)
            for row in reader:
                doc_id = row['DocumentId']
                self.csv_data[doc_id] = row
        print(f"Loaded {len(self.csv_data)} records from CSV")

    def load_labels(self):
        """Load all label files from the dataset."""
        for split in ['train', 'val', 'test']:
            label_dir = self.dataset_dir / split / 'labels'
            if not label_dir.exists():
                continue

            for label_file in label_dir.glob('*.txt'):
                # Parse document ID from filename (uuid_page_XXX.txt)
                name = label_file.stem
                parts = name.rsplit('_page_', 1)
                if len(parts) == 2:
                    doc_id = parts[0]
                    page_no = int(parts[1])
                else:
                    continue

                if doc_id not in self.label_data:
                    self.label_data[doc_id] = {'pages': {}, 'split': split}

                # Parse label file
                labels = []
                with open(label_file, 'r') as f:
                    for line in f:
                        parts = line.strip().split()
                        if len(parts) >= 5:
                            class_id = int(parts[0])
                            labels.append({
                                'class_id': class_id,
                                'class_name': self.class_names[class_id],
                                'x_center': float(parts[1]),
                                'y_center': float(parts[2]),
                                'width': float(parts[3]),
                                'height': float(parts[4])
                            })

                self.label_data[doc_id]['pages'][page_no] = labels

        total_docs = len(self.label_data)
        total_labels = sum(
            len(labels)
            for doc in self.label_data.values()
            for labels in doc['pages'].values()
        )
        print(f"Loaded labels for {total_docs} documents ({total_labels} total labels)")
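    # For reference, each line of a label file parsed above follows the YOLO
    # text format:
    #   <class_id> <x_center> <y_center> <width> <height>
    # with coordinates normalized to [0, 1] relative to the page image (per
    # the YOLO convention), e.g. "3 0.512 0.084 0.210 0.018" (illustrative
    # values only).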
    def load_report(self):
        """Load autolabel report from database."""
        if not self.db:
            print("Database not configured, skipping report loading")
            return

        # Get document IDs from CSV to query
        doc_ids = list(self.csv_data.keys())
        if not doc_ids:
            return

        # Process IDs in batches to keep memory use bounded; each document
        # is still fetched individually.
        batch_size = 1000
        loaded = 0
        for i in range(0, len(doc_ids), batch_size):
            batch_ids = doc_ids[i:i + batch_size]
            for doc_id in batch_ids:
                doc = self.db.get_document(doc_id)
                if doc:
                    self.report_data[doc_id] = doc
                    loaded += 1

        print(f"Loaded {loaded} autolabel reports from database")

    def analyze_document(self, doc_id: str, skip_missing_pdf: bool = True) -> Optional[DocumentAnalysis]:
        """Analyze a single document."""
        csv_row = self.csv_data.get(doc_id, {})
        label_info = self.label_data.get(doc_id, {'pages': {}})
        report = self.report_data.get(doc_id, {})

        # Check PDF
        pdf_path = self.pdf_dir / f"{doc_id}.pdf"
        pdf_exists = pdf_path.exists()

        # Skip documents without a PDF if requested
        if skip_missing_pdf and not pdf_exists:
            return None

        pdf_type = "unknown"
        total_pages = 0
        if pdf_exists:
            pdf_type = "text" if is_text_pdf(pdf_path) else "scanned"
            total_pages = len(label_info['pages']) or report.get('total_pages', 0)

        analysis = DocumentAnalysis(
            doc_id=doc_id,
            pdf_exists=pdf_exists,
            pdf_type=pdf_type,
            total_pages=total_pages
        )

        # Get labeled classes
        labeled_classes = set()
        for page_labels in label_info['pages'].values():
            for label in page_labels:
                labeled_classes.add(label['class_name'])

        # Analyze each field
        for field_name in FIELD_CLASSES.keys():
            csv_value = csv_row.get(field_name, '')
            if csv_value is None:
                csv_value = ''
            csv_value = str(csv_value).strip()

            # Handle datetime values (remove the time part)
            if ' 00:00:00' in csv_value:
                csv_value = csv_value.replace(' 00:00:00', '')

            expected = bool(csv_value)
            labeled = field_name in labeled_classes

            field_analysis = FieldAnalysis(
                field_name=field_name,
                csv_value=csv_value,
                expected=expected,
                labeled=labeled,
                matched=False
            )

            if expected:
                analysis.csv_fields_count += 1
            if labeled:
                analysis.labeled_fields_count += 1

            # Diagnose failures
            if expected and not labeled:
                field_analysis.failure_reason = self._diagnose_failure(
                    doc_id, field_name, csv_value, pdf_path, pdf_type, report
                )
                field_analysis.details = self._get_failure_details(
                    doc_id, field_name, csv_value, pdf_path, pdf_type
                )
            elif not expected and labeled:
                field_analysis.failure_reason = "EXTRA_LABEL"
                field_analysis.details = {'note': 'Labeled but no CSV value'}

            analysis.fields.append(field_analysis)

        return analysis

    def _diagnose_failure(
        self,
        doc_id: str,
        field_name: str,
        csv_value: str,
        pdf_path: Path,
        pdf_type: str,
        report: dict
    ) -> str:
        """Diagnose why a field wasn't labeled."""
        if not pdf_path.exists():
            return "PDF_NOT_FOUND"

        if pdf_type == "scanned":
            return "SCANNED_PDF"

        # The PDF exists and has a text layer: re-run the matcher with the
        # current normalizer (not the historical report) to see whether the
        # value would match today.
        try:
            for page_no in range(10):  # Check at most the first 10 pages
                try:
                    tokens = list(extract_text_tokens(pdf_path, page_no))
                    if not tokens:
                        break
                    normalized = normalize_field(field_name, csv_value)
                    matches = self.matcher.find_matches(
                        tokens, field_name, normalized, page_no
                    )
                    if matches:
                        # Would match with the current normalizer
                        return "MATCHER_OK_NOW"
                except Exception:
                    break
            return "VALUE_NOT_IN_PDF"
        except Exception as e:
            return f"PDF_ERROR: {str(e)[:50]}"
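    # Failure-reason codes produced above (and by analyze_document):
    #   PDF_NOT_FOUND    - no PDF file on disk for the document
    #   SCANNED_PDF      - PDF has no extractable text layer
    #   MATCHER_OK_NOW   - the current matcher/normalizer finds the value,
    #                      so the stored labels likely predate a matcher fix
    #   VALUE_NOT_IN_PDF - value not found in the first 10 text pages
    #   PDF_ERROR: <msg> - text extraction raised an exception
    #   EXTRA_LABEL      - field labeled although the CSV value is empty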
    def _get_failure_details(
        self,
        doc_id: str,
        field_name: str,
        csv_value: str,
        pdf_path: Path,
        pdf_type: str
    ) -> dict:
        """Get detailed information about a failure."""
        details = {
            'csv_value': csv_value,
            'normalized_candidates': [],
            'pdf_tokens_sample': [],
            'potential_matches': []
        }

        # Get normalized candidates
        try:
            details['normalized_candidates'] = normalize_field(field_name, csv_value)
        except Exception:
            pass

        # Get PDF tokens if available
        if pdf_path.exists() and pdf_type == "text":
            try:
                tokens = list(extract_text_tokens(pdf_path, 0))[:100]

                # Find tokens that might be related
                candidates = details['normalized_candidates']
                for token in tokens:
                    text = token.text.strip()

                    # Check if any candidate is a substring or similar
                    for cand in candidates:
                        if cand in text or text in cand:
                            details['potential_matches'].append({
                                'token': text,
                                'candidate': cand,
                                'bbox': token.bbox
                            })
                            break

                    # Also collect date-like or number-like tokens for reference
                    if field_name in ('InvoiceDate', 'InvoiceDueDate'):
                        if any(c.isdigit() for c in text) and len(text) >= 6:
                            details['pdf_tokens_sample'].append(text)
                    elif field_name == 'Amount':
                        if any(c.isdigit() for c in text) and (',' in text or '.' in text or len(text) >= 4):
                            details['pdf_tokens_sample'].append(text)

                # Limit samples
                details['pdf_tokens_sample'] = details['pdf_tokens_sample'][:10]
                details['potential_matches'] = details['potential_matches'][:5]
            except Exception:
                pass

        return details

    def run_analysis(self, limit: Optional[int] = None, skip_missing_pdf: bool = True) -> list[DocumentAnalysis]:
        """Run analysis on all documents."""
        self.load_csv()
        self.load_labels()
        self.load_report()

        results = []
        doc_ids = list(self.csv_data.keys())
        skipped = 0

        for doc_id in doc_ids:
            analysis = self.analyze_document(doc_id, skip_missing_pdf=skip_missing_pdf)
            if analysis is None:
                skipped += 1
                continue
            results.append(analysis)
            if limit and len(results) >= limit:
                break

        if skipped > 0:
            print(f"Skipped {skipped} documents without PDF files")

        return results
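    # The JSON report written by generate_report below has the shape:
    #   {
    #     "summary": {... counts plus percentage strings ...},
    #     "failure_reasons": {"<reason>": <count>, ...},
    #     "failures_by_field": {"<field>": {"<reason>": <count>, ...}, ...},
    #     "issues": [{"doc_id", "field", "csv_value", "reason", "details"}, ...]
    #   }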
    def generate_report(
        self,
        results: list[DocumentAnalysis],
        output_path: str,
        verbose: bool = False
    ):
        """Generate analysis report."""
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        # Collect statistics
        stats = {
            'total_documents': len(results),
            'documents_with_issues': 0,
            'total_expected_fields': 0,
            'total_labeled_fields': 0,
            'missing_labels': 0,
            'extra_labels': 0,
            'failure_reasons': defaultdict(int),
            'failures_by_field': defaultdict(lambda: defaultdict(int))
        }

        issues = []

        for analysis in results:
            stats['total_expected_fields'] += analysis.csv_fields_count
            stats['total_labeled_fields'] += analysis.labeled_fields_count

            if analysis.has_issues:
                stats['documents_with_issues'] += 1

            for f in analysis.fields:
                if f.expected and not f.labeled:
                    stats['missing_labels'] += 1
                    stats['failure_reasons'][f.failure_reason] += 1
                    stats['failures_by_field'][f.field_name][f.failure_reason] += 1
                    issues.append({
                        'doc_id': analysis.doc_id,
                        'field': f.field_name,
                        'csv_value': f.csv_value,
                        'reason': f.failure_reason,
                        'details': f.details if verbose else {}
                    })
                elif not f.expected and f.labeled:
                    stats['extra_labels'] += 1

        # Write JSON report (guard divisions against empty result sets)
        report = {
            'summary': {
                'total_documents': stats['total_documents'],
                'documents_with_issues': stats['documents_with_issues'],
                'issue_rate': f"{stats['documents_with_issues'] / max(1, stats['total_documents']) * 100:.1f}%",
                'total_expected_fields': stats['total_expected_fields'],
                'total_labeled_fields': stats['total_labeled_fields'],
                'label_coverage': f"{stats['total_labeled_fields'] / max(1, stats['total_expected_fields']) * 100:.1f}%",
                'missing_labels': stats['missing_labels'],
                'extra_labels': stats['extra_labels']
            },
            'failure_reasons': dict(stats['failure_reasons']),
            'failures_by_field': {
                field_name: dict(reasons)
                for field_name, reasons in stats['failures_by_field'].items()
            },
            'issues': issues
        }

        with open(output, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"\nReport saved to: {output}")
        return report


def print_summary(report: dict):
    """Print summary to console."""
    summary = report['summary']

    print("\n" + "=" * 60)
    print("LABEL ANALYSIS SUMMARY")
    print("=" * 60)

    print("\nDocuments:")
    print(f"  Total: {summary['total_documents']}")
    print(f"  With issues: {summary['documents_with_issues']} ({summary['issue_rate']})")

    print("\nFields:")
    print(f"  Expected: {summary['total_expected_fields']}")
    print(f"  Labeled: {summary['total_labeled_fields']} ({summary['label_coverage']})")
    print(f"  Missing: {summary['missing_labels']}")
    print(f"  Extra: {summary['extra_labels']}")

    print("\nFailure Reasons:")
    for reason, count in sorted(report['failure_reasons'].items(), key=lambda x: -x[1]):
        print(f"  {reason}: {count}")

    print("\nFailures by Field:")
    for field_name, reasons in report['failures_by_field'].items():
        total = sum(reasons.values())
        print(f"  {field_name}: {total}")
        for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
            print(f"    - {reason}: {count}")

    # Show sample issues
    if report['issues']:
        print("\n" + "-" * 60)
        print("SAMPLE ISSUES (first 10)")
        print("-" * 60)
        for issue in report['issues'][:10]:
            print(f"\n[{issue['doc_id']}] {issue['field']}")
            print(f"  CSV value: {issue['csv_value']}")
            print(f"  Reason: {issue['reason']}")
            if issue.get('details'):
                details = issue['details']
                if details.get('normalized_candidates'):
                    print(f"  Candidates: {details['normalized_candidates'][:5]}")
                if details.get('pdf_tokens_sample'):
                    print(f"  PDF samples: {details['pdf_tokens_sample'][:5]}")
                if details.get('potential_matches'):
                    print("  Potential matches:")
                    for pm in details['potential_matches'][:3]:
                        print(f"    - token='{pm['token']}' matches candidate='{pm['candidate']}'")
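# Example invocations (sketch; the script name below is hypothetical, the
# flags are defined in main()):
#   python analyze_labels.py --limit 100 --verbose
#   python analyze_labels.py --single <document-uuid> --no-db
#   python analyze_labels.py -c data/structured_data/export.csv -o reports/out.json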
def main():
    parser = argparse.ArgumentParser(
        description='Analyze auto-generated labels and diagnose failures'
    )
    parser.add_argument(
        '--csv', '-c',
        default='data/structured_data/document_export_20260109_220326.csv',
        help='Path to structured data CSV file'
    )
    parser.add_argument(
        '--pdf-dir', '-p',
        default='data/raw_pdfs',
        help='Directory containing PDF files'
    )
    parser.add_argument(
        '--dataset', '-d',
        default='data/dataset',
        help='Dataset directory with labels'
    )
    parser.add_argument(
        '--output', '-o',
        default='reports/label_analysis.json',
        help='Output path for analysis report'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of documents to analyze'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Include detailed failure information'
    )
    parser.add_argument(
        '--single', '-s',
        help='Analyze single document ID'
    )
    parser.add_argument(
        '--no-db',
        action='store_true',
        help='Skip database, only analyze label files'
    )

    args = parser.parse_args()

    analyzer = LabelAnalyzer(
        csv_path=args.csv,
        pdf_dir=args.pdf_dir,
        dataset_dir=args.dataset,
        use_db=not args.no_db
    )

    if args.single:
        # Analyze a single document; don't skip missing PDFs here so an
        # analysis object is always returned
        analyzer.load_csv()
        analyzer.load_labels()
        analyzer.load_report()
        analysis = analyzer.analyze_document(args.single, skip_missing_pdf=False)

        print(f"\n{'=' * 60}")
        print(f"Document: {analysis.doc_id}")
        print(f"{'=' * 60}")
        print(f"PDF exists: {analysis.pdf_exists}")
        print(f"PDF type: {analysis.pdf_type}")
        print(f"Pages: {analysis.total_pages}")
        print(f"\nFields (CSV: {analysis.csv_fields_count}, Labeled: {analysis.labeled_fields_count}):")

        for f in analysis.fields:
            status = "✓" if f.labeled else ("✗" if f.expected else "-")
            value_str = f.csv_value[:30] if f.csv_value else "(empty)"
            print(f"  [{status}] {f.field_name}: {value_str}")
            if f.failure_reason:
                print(f"      Reason: {f.failure_reason}")
            if f.details.get('normalized_candidates'):
                print(f"      Candidates: {f.details['normalized_candidates']}")
            if f.details.get('potential_matches'):
                print("      Potential matches in PDF:")
                for pm in f.details['potential_matches'][:3]:
                    print(f"        - '{pm['token']}'")
    else:
        # Full analysis
        print("Running label analysis...")
        results = analyzer.run_analysis(limit=args.limit)
        report = analyzer.generate_report(results, args.output, verbose=args.verbose)
        print_summary(report)


if __name__ == '__main__':
    main()