Yaojia Wang
2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions

src/cli/analyze_labels.py Normal file

@@ -0,0 +1,600 @@
#!/usr/bin/env python3
"""
Label Analysis CLI
Analyzes auto-generated labels to identify failures and diagnose root causes.
Reads autolabel reports from the PostgreSQL database instead of JSONL files.
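
Example invocations (a sketch; assumes the tool is run as a module from the
repository root and that the default data paths exist):

    python -m src.cli.analyze_labels --limit 100 --verbose
    python -m src.cli.analyze_labels --single <document-id>
    python -m src.cli.analyze_labels --no-db -o reports/label_analysis.json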
"""
import argparse
import csv
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from config import get_db_connection_string
from ..normalize import normalize_field
from ..matcher import FieldMatcher
from ..pdf import is_text_pdf, extract_text_tokens
from ..yolo.annotation_generator import FIELD_CLASSES
from ..data.db import DocumentDB
@dataclass
class FieldAnalysis:
"""Analysis result for a single field."""
field_name: str
csv_value: str
expected: bool # True if CSV has value
labeled: bool # True if label file has this field
matched: bool # True if matcher finds it
# Diagnosis
failure_reason: Optional[str] = None
details: dict = field(default_factory=dict)
@dataclass
class DocumentAnalysis:
"""Analysis result for a document."""
doc_id: str
pdf_exists: bool
pdf_type: str # "text" or "scanned"
total_pages: int
# Per-field analysis
fields: list[FieldAnalysis] = field(default_factory=list)
# Summary
csv_fields_count: int = 0 # Fields with values in CSV
labeled_fields_count: int = 0 # Fields in label files
matched_fields_count: int = 0 # Fields matcher can find
@property
def has_issues(self) -> bool:
"""Check if document has any labeling issues."""
return any(
f.expected and not f.labeled
for f in self.fields
)
@property
def missing_labels(self) -> list[FieldAnalysis]:
"""Get fields that should be labeled but aren't."""
return [f for f in self.fields if f.expected and not f.labeled]
class LabelAnalyzer:
"""Analyzes labels and diagnoses failures."""
def __init__(
self,
csv_path: str,
pdf_dir: str,
dataset_dir: str,
use_db: bool = True
):
self.csv_path = Path(csv_path)
self.pdf_dir = Path(pdf_dir)
self.dataset_dir = Path(dataset_dir)
self.use_db = use_db
self.matcher = FieldMatcher()
self.csv_data = {}
self.label_data = {}
self.report_data = {}
# Database connection
self.db = None
if use_db:
self.db = DocumentDB()
self.db.connect()
# Class ID to name mapping
self.class_names = list(FIELD_CLASSES.keys())
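        # Assumes FIELD_CLASSES is an insertion-ordered dict whose key order matches
        # the class ids written by the YOLO annotation generator, so that
        # self.class_names[class_id] recovers the field name.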
def load_csv(self):
"""Load CSV data."""
with open(self.csv_path, 'r', encoding='utf-8-sig') as f:
reader = csv.DictReader(f)
for row in reader:
doc_id = row['DocumentId']
self.csv_data[doc_id] = row
print(f"Loaded {len(self.csv_data)} records from CSV")
def load_labels(self):
"""Load all label files from dataset."""
for split in ['train', 'val', 'test']:
label_dir = self.dataset_dir / split / 'labels'
if not label_dir.exists():
continue
for label_file in label_dir.glob('*.txt'):
# Parse document ID from filename (uuid_page_XXX.txt)
name = label_file.stem
parts = name.rsplit('_page_', 1)
if len(parts) == 2:
doc_id = parts[0]
page_no = int(parts[1])
else:
continue
if doc_id not in self.label_data:
self.label_data[doc_id] = {'pages': {}, 'split': split}
# Parse label file
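                # Each line is expected to follow the YOLO format:
                #   class_id x_center y_center width height  (coordinates normalized to 0-1)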
labels = []
with open(label_file, 'r') as f:
for line in f:
parts = line.strip().split()
if len(parts) >= 5:
class_id = int(parts[0])
labels.append({
'class_id': class_id,
'class_name': self.class_names[class_id],
'x_center': float(parts[1]),
'y_center': float(parts[2]),
'width': float(parts[3]),
'height': float(parts[4])
})
self.label_data[doc_id]['pages'][page_no] = labels
total_docs = len(self.label_data)
total_labels = sum(
len(labels)
for doc in self.label_data.values()
for labels in doc['pages'].values()
)
print(f"Loaded labels for {total_docs} documents ({total_labels} total labels)")
def load_report(self):
"""Load autolabel report from database."""
if not self.db:
print("Database not configured, skipping report loading")
return
# Get document IDs from CSV to query
doc_ids = list(self.csv_data.keys())
if not doc_ids:
return
        # Fetch each document's autolabel record individually from the database
        loaded = 0
        for doc_id in doc_ids:
            doc = self.db.get_document(doc_id)
            if doc:
                self.report_data[doc_id] = doc
                loaded += 1
print(f"Loaded {loaded} autolabel reports from database")
def analyze_document(self, doc_id: str, skip_missing_pdf: bool = True) -> Optional[DocumentAnalysis]:
"""Analyze a single document."""
csv_row = self.csv_data.get(doc_id, {})
label_info = self.label_data.get(doc_id, {'pages': {}})
report = self.report_data.get(doc_id, {})
# Check PDF
pdf_path = self.pdf_dir / f"{doc_id}.pdf"
pdf_exists = pdf_path.exists()
# Skip documents without PDF if requested
if skip_missing_pdf and not pdf_exists:
return None
pdf_type = "unknown"
total_pages = 0
if pdf_exists:
pdf_type = "scanned" if not is_text_pdf(pdf_path) else "text"
total_pages = len(label_info['pages']) or report.get('total_pages', 0)
analysis = DocumentAnalysis(
doc_id=doc_id,
pdf_exists=pdf_exists,
pdf_type=pdf_type,
total_pages=total_pages
)
# Get labeled classes
labeled_classes = set()
for page_labels in label_info['pages'].values():
for label in page_labels:
labeled_classes.add(label['class_name'])
# Analyze each field
for field_name in FIELD_CLASSES.keys():
csv_value = csv_row.get(field_name, '')
if csv_value is None:
csv_value = ''
csv_value = str(csv_value).strip()
# Handle datetime values (remove time part)
if ' 00:00:00' in csv_value:
csv_value = csv_value.replace(' 00:00:00', '')
expected = bool(csv_value)
labeled = field_name in labeled_classes
field_analysis = FieldAnalysis(
field_name=field_name,
csv_value=csv_value,
expected=expected,
labeled=labeled,
matched=False
)
if expected:
analysis.csv_fields_count += 1
if labeled:
analysis.labeled_fields_count += 1
# Diagnose failures
if expected and not labeled:
field_analysis.failure_reason = self._diagnose_failure(
doc_id, field_name, csv_value, pdf_path, pdf_type, report
)
field_analysis.details = self._get_failure_details(
doc_id, field_name, csv_value, pdf_path, pdf_type
)
elif not expected and labeled:
field_analysis.failure_reason = "EXTRA_LABEL"
field_analysis.details = {'note': 'Labeled but no CSV value'}
analysis.fields.append(field_analysis)
return analysis
def _diagnose_failure(
self,
doc_id: str,
field_name: str,
csv_value: str,
pdf_path: Path,
pdf_type: str,
report: dict
) -> str:
"""Diagnose why a field wasn't labeled."""
if not pdf_path.exists():
return "PDF_NOT_FOUND"
if pdf_type == "scanned":
return "SCANNED_PDF"
# Try to match now with current normalizer (not historical report)
if pdf_path.exists() and pdf_type == "text":
try:
# Check all pages
for page_no in range(10): # Max 10 pages
try:
tokens = list(extract_text_tokens(pdf_path, page_no))
if not tokens:
break
normalized = normalize_field(field_name, csv_value)
matches = self.matcher.find_matches(tokens, field_name, normalized, page_no)
if matches:
return "MATCHER_OK_NOW" # Would match with current normalizer
except Exception:
break
return "VALUE_NOT_IN_PDF"
except Exception as e:
return f"PDF_ERROR: {str(e)[:50]}"
return "UNKNOWN"
def _get_failure_details(
self,
doc_id: str,
field_name: str,
csv_value: str,
pdf_path: Path,
pdf_type: str
) -> dict:
"""Get detailed information about a failure."""
details = {
'csv_value': csv_value,
'normalized_candidates': [],
'pdf_tokens_sample': [],
'potential_matches': []
}
# Get normalized candidates
try:
details['normalized_candidates'] = normalize_field(field_name, csv_value)
except Exception:
pass
# Get PDF tokens if available
if pdf_path.exists() and pdf_type == "text":
try:
tokens = list(extract_text_tokens(pdf_path, 0))[:100]
# Find tokens that might be related
candidates = details['normalized_candidates']
for token in tokens:
text = token.text.strip()
# Check if any candidate is substring or similar
for cand in candidates:
if cand in text or text in cand:
details['potential_matches'].append({
'token': text,
'candidate': cand,
'bbox': token.bbox
})
break
# Also collect date-like or number-like tokens for reference
if field_name in ('InvoiceDate', 'InvoiceDueDate'):
if any(c.isdigit() for c in text) and len(text) >= 6:
details['pdf_tokens_sample'].append(text)
elif field_name == 'Amount':
if any(c.isdigit() for c in text) and (',' in text or '.' in text or len(text) >= 4):
details['pdf_tokens_sample'].append(text)
# Limit samples
details['pdf_tokens_sample'] = details['pdf_tokens_sample'][:10]
details['potential_matches'] = details['potential_matches'][:5]
except Exception:
pass
return details
def run_analysis(self, limit: Optional[int] = None, skip_missing_pdf: bool = True) -> list[DocumentAnalysis]:
"""Run analysis on all documents."""
self.load_csv()
self.load_labels()
self.load_report()
results = []
doc_ids = list(self.csv_data.keys())
skipped = 0
for doc_id in doc_ids:
analysis = self.analyze_document(doc_id, skip_missing_pdf=skip_missing_pdf)
if analysis is None:
skipped += 1
continue
results.append(analysis)
if limit and len(results) >= limit:
break
if skipped > 0:
print(f"Skipped {skipped} documents without PDF files")
return results
def generate_report(
self,
results: list[DocumentAnalysis],
output_path: str,
verbose: bool = False
):
"""Generate analysis report."""
output = Path(output_path)
output.parent.mkdir(parents=True, exist_ok=True)
# Collect statistics
stats = {
'total_documents': len(results),
'documents_with_issues': 0,
'total_expected_fields': 0,
'total_labeled_fields': 0,
'missing_labels': 0,
'extra_labels': 0,
'failure_reasons': defaultdict(int),
'failures_by_field': defaultdict(lambda: defaultdict(int))
}
issues = []
for analysis in results:
stats['total_expected_fields'] += analysis.csv_fields_count
stats['total_labeled_fields'] += analysis.labeled_fields_count
if analysis.has_issues:
stats['documents_with_issues'] += 1
for f in analysis.fields:
if f.expected and not f.labeled:
stats['missing_labels'] += 1
stats['failure_reasons'][f.failure_reason] += 1
stats['failures_by_field'][f.field_name][f.failure_reason] += 1
issues.append({
'doc_id': analysis.doc_id,
'field': f.field_name,
'csv_value': f.csv_value,
'reason': f.failure_reason,
'details': f.details if verbose else {}
})
elif not f.expected and f.labeled:
stats['extra_labels'] += 1
# Write JSON report
report = {
'summary': {
'total_documents': stats['total_documents'],
'documents_with_issues': stats['documents_with_issues'],
                'issue_rate': f"{stats['documents_with_issues'] / max(1, stats['total_documents']) * 100:.1f}%",
'total_expected_fields': stats['total_expected_fields'],
'total_labeled_fields': stats['total_labeled_fields'],
'label_coverage': f"{stats['total_labeled_fields'] / max(1, stats['total_expected_fields']) * 100:.1f}%",
'missing_labels': stats['missing_labels'],
'extra_labels': stats['extra_labels']
},
'failure_reasons': dict(stats['failure_reasons']),
'failures_by_field': {
field: dict(reasons)
for field, reasons in stats['failures_by_field'].items()
},
'issues': issues
}
with open(output, 'w', encoding='utf-8') as f:
json.dump(report, f, indent=2, ensure_ascii=False)
print(f"\nReport saved to: {output}")
return report
def print_summary(report: dict):
"""Print summary to console."""
summary = report['summary']
print("\n" + "=" * 60)
print("LABEL ANALYSIS SUMMARY")
print("=" * 60)
print(f"\nDocuments:")
print(f" Total: {summary['total_documents']}")
print(f" With issues: {summary['documents_with_issues']} ({summary['issue_rate']})")
print(f"\nFields:")
print(f" Expected: {summary['total_expected_fields']}")
print(f" Labeled: {summary['total_labeled_fields']} ({summary['label_coverage']})")
print(f" Missing: {summary['missing_labels']}")
print(f" Extra: {summary['extra_labels']}")
print(f"\nFailure Reasons:")
for reason, count in sorted(report['failure_reasons'].items(), key=lambda x: -x[1]):
print(f" {reason}: {count}")
print(f"\nFailures by Field:")
for field, reasons in report['failures_by_field'].items():
total = sum(reasons.values())
print(f" {field}: {total}")
for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
print(f" - {reason}: {count}")
# Show sample issues
if report['issues']:
print(f"\n" + "-" * 60)
print("SAMPLE ISSUES (first 10)")
print("-" * 60)
for issue in report['issues'][:10]:
print(f"\n[{issue['doc_id']}] {issue['field']}")
print(f" CSV value: {issue['csv_value']}")
print(f" Reason: {issue['reason']}")
if issue.get('details'):
details = issue['details']
if details.get('normalized_candidates'):
print(f" Candidates: {details['normalized_candidates'][:5]}")
if details.get('pdf_tokens_sample'):
print(f" PDF samples: {details['pdf_tokens_sample'][:5]}")
if details.get('potential_matches'):
print(f" Potential matches:")
for pm in details['potential_matches'][:3]:
print(f" - token='{pm['token']}' matches candidate='{pm['candidate']}'")
def main():
parser = argparse.ArgumentParser(
description='Analyze auto-generated labels and diagnose failures'
)
parser.add_argument(
'--csv', '-c',
default='data/structured_data/document_export_20260109_220326.csv',
help='Path to structured data CSV file'
)
parser.add_argument(
'--pdf-dir', '-p',
default='data/raw_pdfs',
help='Directory containing PDF files'
)
parser.add_argument(
'--dataset', '-d',
default='data/dataset',
help='Dataset directory with labels'
)
parser.add_argument(
'--output', '-o',
default='reports/label_analysis.json',
help='Output path for analysis report'
)
parser.add_argument(
'--limit', '-l',
type=int,
default=None,
help='Limit number of documents to analyze'
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Include detailed failure information'
)
parser.add_argument(
'--single', '-s',
help='Analyze single document ID'
)
parser.add_argument(
'--no-db',
action='store_true',
help='Skip database, only analyze label files'
)
args = parser.parse_args()
analyzer = LabelAnalyzer(
csv_path=args.csv,
pdf_dir=args.pdf_dir,
dataset_dir=args.dataset,
use_db=not args.no_db
)
if args.single:
# Analyze single document
analyzer.load_csv()
analyzer.load_labels()
analyzer.load_report()
        # Do not skip on a missing PDF here, otherwise analyze_document returns None
        # and the prints below would crash; report the diagnosis instead.
        analysis = analyzer.analyze_document(args.single, skip_missing_pdf=False)
print(f"\n{'=' * 60}")
print(f"Document: {analysis.doc_id}")
print(f"{'=' * 60}")
print(f"PDF exists: {analysis.pdf_exists}")
print(f"PDF type: {analysis.pdf_type}")
print(f"Pages: {analysis.total_pages}")
print(f"\nFields (CSV: {analysis.csv_fields_count}, Labeled: {analysis.labeled_fields_count}):")
for f in analysis.fields:
status = "" if f.labeled else ("" if f.expected else "-")
value_str = f.csv_value[:30] if f.csv_value else "(empty)"
print(f" [{status}] {f.field_name}: {value_str}")
if f.failure_reason:
print(f" Reason: {f.failure_reason}")
if f.details.get('normalized_candidates'):
print(f" Candidates: {f.details['normalized_candidates']}")
if f.details.get('potential_matches'):
print(f" Potential matches in PDF:")
for pm in f.details['potential_matches'][:3]:
print(f" - '{pm['token']}'")
else:
# Full analysis
print("Running label analysis...")
results = analyzer.run_analysis(limit=args.limit)
report = analyzer.generate_report(results, args.output, verbose=args.verbose)
print_summary(report)
if __name__ == '__main__':
main()