WOP

src/cli/analyze_labels.py (new file, 600 lines)

@@ -0,0 +1,600 @@
#!/usr/bin/env python3
"""
Label Analysis CLI

Analyzes auto-generated labels to identify failures and diagnose root causes.
Now reads from PostgreSQL database instead of JSONL files.
"""

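# Example invocation (a sketch; flags and defaults match the argparse setup in
# main() below, and the module path assumes this file lives under src/cli/):
#
#   python -m src.cli.analyze_labels \
#       --csv data/structured_data/document_export_20260109_220326.csv \
#       --dataset data/dataset --limit 100 --verbose
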
import argparse
import csv
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

# Make the repository root importable so `config` resolves when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from config import get_db_connection_string

from ..normalize import normalize_field
from ..matcher import FieldMatcher
from ..pdf import is_text_pdf, extract_text_tokens
from ..yolo.annotation_generator import FIELD_CLASSES
from ..data.db import DocumentDB


@dataclass
class FieldAnalysis:
    """Analysis result for a single field."""
    field_name: str
    csv_value: str
    expected: bool  # True if CSV has value
    labeled: bool   # True if label file has this field
    matched: bool   # True if matcher finds it

    # Diagnosis
    failure_reason: Optional[str] = None
    details: dict = field(default_factory=dict)


@dataclass
class DocumentAnalysis:
    """Analysis result for a document."""
    doc_id: str
    pdf_exists: bool
    pdf_type: str  # "text" or "scanned"
    total_pages: int

    # Per-field analysis
    fields: list[FieldAnalysis] = field(default_factory=list)

    # Summary
    csv_fields_count: int = 0      # Fields with values in CSV
    labeled_fields_count: int = 0  # Fields in label files
    matched_fields_count: int = 0  # Fields matcher can find

    @property
    def has_issues(self) -> bool:
        """Check if document has any labeling issues."""
        return any(f.expected and not f.labeled for f in self.fields)

    @property
    def missing_labels(self) -> list[FieldAnalysis]:
        """Get fields that should be labeled but aren't."""
        return [f for f in self.fields if f.expected and not f.labeled]

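# Illustration (hypothetical values): a field with a CSV value but no label,
#   FieldAnalysis(field_name='Amount', csv_value='1,250.00',
#                 expected=True, labeled=False, matched=False)
# lands in `missing_labels` and flips `has_issues` to True.
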
class LabelAnalyzer:
    """Analyzes labels and diagnoses failures."""

    def __init__(
        self,
        csv_path: str,
        pdf_dir: str,
        dataset_dir: str,
        use_db: bool = True
    ):
        self.csv_path = Path(csv_path)
        self.pdf_dir = Path(pdf_dir)
        self.dataset_dir = Path(dataset_dir)
        self.use_db = use_db

        self.matcher = FieldMatcher()
        self.csv_data = {}
        self.label_data = {}
        self.report_data = {}

        # Database connection
        self.db = None
        if use_db:
            self.db = DocumentDB()
            self.db.connect()

        # Class ID -> name mapping; relies on FIELD_CLASSES preserving
        # insertion order so that list index == YOLO class ID.
        self.class_names = list(FIELD_CLASSES.keys())

    def load_csv(self):
        """Load CSV data."""
        with open(self.csv_path, 'r', encoding='utf-8-sig') as f:
            reader = csv.DictReader(f)
            for row in reader:
                doc_id = row['DocumentId']
                self.csv_data[doc_id] = row
        print(f"Loaded {len(self.csv_data)} records from CSV")

    def load_labels(self):
        """Load all label files from dataset."""
        for split in ['train', 'val', 'test']:
            label_dir = self.dataset_dir / split / 'labels'
            if not label_dir.exists():
                continue

            for label_file in label_dir.glob('*.txt'):
                # Parse document ID from filename (uuid_page_XXX.txt)
                name = label_file.stem
                parts = name.rsplit('_page_', 1)
                if len(parts) == 2:
                    doc_id = parts[0]
                    page_no = int(parts[1])
                else:
                    continue

                if doc_id not in self.label_data:
                    self.label_data[doc_id] = {'pages': {}, 'split': split}

                # Parse label file (YOLO format: class_id x_center y_center
                # width height, all coordinates normalized to 0-1)
                labels = []
                with open(label_file, 'r') as f:
                    for line in f:
                        parts = line.strip().split()
                        if len(parts) >= 5:
                            class_id = int(parts[0])
                            if not 0 <= class_id < len(self.class_names):
                                continue  # skip labels with unknown class IDs
                            labels.append({
                                'class_id': class_id,
                                'class_name': self.class_names[class_id],
                                'x_center': float(parts[1]),
                                'y_center': float(parts[2]),
                                'width': float(parts[3]),
                                'height': float(parts[4])
                            })

                self.label_data[doc_id]['pages'][page_no] = labels

        total_docs = len(self.label_data)
        total_labels = sum(
            len(labels)
            for doc in self.label_data.values()
            for labels in doc['pages'].values()
        )
        print(f"Loaded labels for {total_docs} documents ({total_labels} total labels)")

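    # A YOLO label line such as "3 0.512 0.134 0.210 0.032" (illustrative
    # values) parses to class_name == self.class_names[3] with a box centered
    # at (0.512, 0.134) covering 21.0% x 3.2% of the page.
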
    def load_report(self):
        """Load autolabel report from database."""
        if not self.db:
            print("Database not configured, skipping report loading")
            return

        # Get document IDs from CSV to query
        doc_ids = list(self.csv_data.keys())
        if not doc_ids:
            return

        # Fetch each document's report individually; DocumentDB.get_document
        # queries one row at a time, so memory use stays bounded.
        loaded = 0
        for doc_id in doc_ids:
            doc = self.db.get_document(doc_id)
            if doc:
                self.report_data[doc_id] = doc
                loaded += 1

        print(f"Loaded {loaded} autolabel reports from database")

    def analyze_document(self, doc_id: str, skip_missing_pdf: bool = True) -> Optional[DocumentAnalysis]:
        """Analyze a single document.

        Returns None if the PDF is missing and skip_missing_pdf is True.
        """
        csv_row = self.csv_data.get(doc_id, {})
        label_info = self.label_data.get(doc_id, {'pages': {}})
        report = self.report_data.get(doc_id, {})

        # Check PDF
        pdf_path = self.pdf_dir / f"{doc_id}.pdf"
        pdf_exists = pdf_path.exists()

        # Skip documents without PDF if requested
        if skip_missing_pdf and not pdf_exists:
            return None

        pdf_type = "unknown"
        total_pages = 0

        if pdf_exists:
            pdf_type = "text" if is_text_pdf(pdf_path) else "scanned"
            # Prefer the number of labeled pages; fall back to the report's count
            total_pages = len(label_info['pages']) or report.get('total_pages', 0)

        analysis = DocumentAnalysis(
            doc_id=doc_id,
            pdf_exists=pdf_exists,
            pdf_type=pdf_type,
            total_pages=total_pages
        )

        # Get labeled classes
        labeled_classes = set()
        for page_labels in label_info['pages'].values():
            for label in page_labels:
                labeled_classes.add(label['class_name'])

        # Analyze each field
        for field_name in FIELD_CLASSES.keys():
            csv_value = csv_row.get(field_name, '')
            if csv_value is None:
                csv_value = ''
            csv_value = str(csv_value).strip()

            # Handle datetime values (remove the midnight time part)
            if ' 00:00:00' in csv_value:
                csv_value = csv_value.replace(' 00:00:00', '')

            expected = bool(csv_value)
            labeled = field_name in labeled_classes

            field_analysis = FieldAnalysis(
                field_name=field_name,
                csv_value=csv_value,
                expected=expected,
                labeled=labeled,
                matched=False
            )

            if expected:
                analysis.csv_fields_count += 1
            if labeled:
                analysis.labeled_fields_count += 1

            # Diagnose failures
            if expected and not labeled:
                field_analysis.failure_reason = self._diagnose_failure(
                    doc_id, field_name, csv_value, pdf_path, pdf_type, report
                )
                field_analysis.details = self._get_failure_details(
                    doc_id, field_name, csv_value, pdf_path, pdf_type
                )
            elif not expected and labeled:
                field_analysis.failure_reason = "EXTRA_LABEL"
                field_analysis.details = {'note': 'Labeled but no CSV value'}

            analysis.fields.append(field_analysis)

        return analysis

    def _diagnose_failure(
        self,
        doc_id: str,
        field_name: str,
        csv_value: str,
        pdf_path: Path,
        pdf_type: str,
        report: dict
    ) -> str:
        """Diagnose why a field wasn't labeled.

        Returns one of: PDF_NOT_FOUND, SCANNED_PDF, MATCHER_OK_NOW,
        VALUE_NOT_IN_PDF, "PDF_ERROR: <msg>", or UNKNOWN.
        """
        if not pdf_path.exists():
            return "PDF_NOT_FOUND"

        if pdf_type == "scanned":
            return "SCANNED_PDF"

        # Try to match now with the current normalizer (not the historical report)
        if pdf_type == "text":
            try:
                # Check up to the first 10 pages
                for page_no in range(10):
                    try:
                        tokens = list(extract_text_tokens(pdf_path, page_no))
                        if not tokens:
                            break

                        normalized = normalize_field(field_name, csv_value)
                        matches = self.matcher.find_matches(tokens, field_name, normalized, page_no)

                        if matches:
                            return "MATCHER_OK_NOW"  # Would match with current normalizer
                    except Exception:
                        break  # stop paging on extraction errors

                return "VALUE_NOT_IN_PDF"

            except Exception as e:
                return f"PDF_ERROR: {str(e)[:50]}"

        return "UNKNOWN"

    def _get_failure_details(
        self,
        doc_id: str,
        field_name: str,
        csv_value: str,
        pdf_path: Path,
        pdf_type: str
    ) -> dict:
        """Get detailed information about a failure."""
        details = {
            'csv_value': csv_value,
            'normalized_candidates': [],
            'pdf_tokens_sample': [],
            'potential_matches': []
        }

        # Get normalized candidates
        try:
            details['normalized_candidates'] = normalize_field(field_name, csv_value)
        except Exception:
            pass

        # Get PDF tokens if available (first page, first 100 tokens)
        if pdf_path.exists() and pdf_type == "text":
            try:
                tokens = list(extract_text_tokens(pdf_path, 0))[:100]

                # Find tokens that might be related
                candidates = details['normalized_candidates']
                for token in tokens:
                    text = token.text.strip()
                    # Check if any candidate is a substring of the token, or vice versa
                    for cand in candidates:
                        if cand in text or text in cand:
                            details['potential_matches'].append({
                                'token': text,
                                'candidate': cand,
                                'bbox': token.bbox
                            })
                            break
                    # Also collect date-like or number-like tokens for reference
                    if field_name in ('InvoiceDate', 'InvoiceDueDate'):
                        if any(c.isdigit() for c in text) and len(text) >= 6:
                            details['pdf_tokens_sample'].append(text)
                    elif field_name == 'Amount':
                        if any(c.isdigit() for c in text) and (',' in text or '.' in text or len(text) >= 4):
                            details['pdf_tokens_sample'].append(text)

                # Limit samples
                details['pdf_tokens_sample'] = details['pdf_tokens_sample'][:10]
                details['potential_matches'] = details['potential_matches'][:5]

            except Exception:
                pass

        return details

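    # Shape of the details dict above (hypothetical values for a date field):
    #   {'csv_value': '15.01.2024',
    #    'normalized_candidates': ['15.01.2024', '2024-01-15'],
    #    'pdf_tokens_sample': ['31.01.2024', '15.01.2024'],
    #    'potential_matches': [{'token': '15.01.2024',
    #                           'candidate': '15.01.2024', 'bbox': ...}]}
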
    def run_analysis(self, limit: Optional[int] = None, skip_missing_pdf: bool = True) -> list[DocumentAnalysis]:
        """Run analysis on all documents."""
        self.load_csv()
        self.load_labels()
        self.load_report()

        results = []
        doc_ids = list(self.csv_data.keys())
        skipped = 0

        for doc_id in doc_ids:
            analysis = self.analyze_document(doc_id, skip_missing_pdf=skip_missing_pdf)
            if analysis is None:
                skipped += 1
                continue
            results.append(analysis)
            if limit and len(results) >= limit:
                break

        if skipped > 0:
            print(f"Skipped {skipped} documents without PDF files")

        return results

    def generate_report(
        self,
        results: list[DocumentAnalysis],
        output_path: str,
        verbose: bool = False
    ):
        """Generate analysis report."""
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        # Collect statistics
        stats = {
            'total_documents': len(results),
            'documents_with_issues': 0,
            'total_expected_fields': 0,
            'total_labeled_fields': 0,
            'missing_labels': 0,
            'extra_labels': 0,
            'failure_reasons': defaultdict(int),
            'failures_by_field': defaultdict(lambda: defaultdict(int))
        }

        issues = []

        for analysis in results:
            stats['total_expected_fields'] += analysis.csv_fields_count
            stats['total_labeled_fields'] += analysis.labeled_fields_count

            if analysis.has_issues:
                stats['documents_with_issues'] += 1

            for f in analysis.fields:
                if f.expected and not f.labeled:
                    stats['missing_labels'] += 1
                    stats['failure_reasons'][f.failure_reason] += 1
                    stats['failures_by_field'][f.field_name][f.failure_reason] += 1

                    issues.append({
                        'doc_id': analysis.doc_id,
                        'field': f.field_name,
                        'csv_value': f.csv_value,
                        'reason': f.failure_reason,
                        'details': f.details if verbose else {}
                    })
                elif not f.expected and f.labeled:
                    stats['extra_labels'] += 1

        # Write JSON report (denominators guarded so an empty run doesn't divide by zero)
        report = {
            'summary': {
                'total_documents': stats['total_documents'],
                'documents_with_issues': stats['documents_with_issues'],
                'issue_rate': f"{stats['documents_with_issues'] / max(1, stats['total_documents']) * 100:.1f}%",
                'total_expected_fields': stats['total_expected_fields'],
                'total_labeled_fields': stats['total_labeled_fields'],
                'label_coverage': f"{stats['total_labeled_fields'] / max(1, stats['total_expected_fields']) * 100:.1f}%",
                'missing_labels': stats['missing_labels'],
                'extra_labels': stats['extra_labels']
            },
            'failure_reasons': dict(stats['failure_reasons']),
            'failures_by_field': {
                field_name: dict(reasons)
                for field_name, reasons in stats['failures_by_field'].items()
            },
            'issues': issues
        }

        with open(output, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"\nReport saved to: {output}")

        return report

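# The JSON report written by generate_report looks roughly like this
# (values are illustrative):
#   {"summary": {"total_documents": 500, "documents_with_issues": 42,
#                "issue_rate": "8.4%", "label_coverage": "91.3%", ...},
#    "failure_reasons": {"SCANNED_PDF": 30, "VALUE_NOT_IN_PDF": 12},
#    "failures_by_field": {"Amount": {"VALUE_NOT_IN_PDF": 7}},
#    "issues": [{"doc_id": "...", "field": "Amount", "reason": "VALUE_NOT_IN_PDF"}]}
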
def print_summary(report: dict):
    """Print summary to console."""
    summary = report['summary']

    print("\n" + "=" * 60)
    print("LABEL ANALYSIS SUMMARY")
    print("=" * 60)

    print("\nDocuments:")
    print(f"  Total: {summary['total_documents']}")
    print(f"  With issues: {summary['documents_with_issues']} ({summary['issue_rate']})")

    print("\nFields:")
    print(f"  Expected: {summary['total_expected_fields']}")
    print(f"  Labeled: {summary['total_labeled_fields']} ({summary['label_coverage']})")
    print(f"  Missing: {summary['missing_labels']}")
    print(f"  Extra: {summary['extra_labels']}")

    print("\nFailure Reasons:")
    for reason, count in sorted(report['failure_reasons'].items(), key=lambda x: -x[1]):
        print(f"  {reason}: {count}")

    print("\nFailures by Field:")
    for field_name, reasons in report['failures_by_field'].items():
        total = sum(reasons.values())
        print(f"  {field_name}: {total}")
        for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
            print(f"    - {reason}: {count}")

    # Show sample issues
    if report['issues']:
        print("\n" + "-" * 60)
        print("SAMPLE ISSUES (first 10)")
        print("-" * 60)

        for issue in report['issues'][:10]:
            print(f"\n[{issue['doc_id']}] {issue['field']}")
            print(f"  CSV value: {issue['csv_value']}")
            print(f"  Reason: {issue['reason']}")

            if issue.get('details'):
                details = issue['details']
                if details.get('normalized_candidates'):
                    print(f"  Candidates: {details['normalized_candidates'][:5]}")
                if details.get('pdf_tokens_sample'):
                    print(f"  PDF samples: {details['pdf_tokens_sample'][:5]}")
                if details.get('potential_matches'):
                    print("  Potential matches:")
                    for pm in details['potential_matches'][:3]:
                        print(f"    - token='{pm['token']}' matches candidate='{pm['candidate']}'")


def main():
    parser = argparse.ArgumentParser(
        description='Analyze auto-generated labels and diagnose failures'
    )
    parser.add_argument(
        '--csv', '-c',
        default='data/structured_data/document_export_20260109_220326.csv',
        help='Path to structured data CSV file'
    )
    parser.add_argument(
        '--pdf-dir', '-p',
        default='data/raw_pdfs',
        help='Directory containing PDF files'
    )
    parser.add_argument(
        '--dataset', '-d',
        default='data/dataset',
        help='Dataset directory with labels'
    )
    parser.add_argument(
        '--output', '-o',
        default='reports/label_analysis.json',
        help='Output path for analysis report'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of documents to analyze'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Include detailed failure information'
    )
    parser.add_argument(
        '--single', '-s',
        help='Analyze single document ID'
    )
    parser.add_argument(
        '--no-db',
        action='store_true',
        help='Skip database, only analyze label files'
    )

    args = parser.parse_args()

    analyzer = LabelAnalyzer(
        csv_path=args.csv,
        pdf_dir=args.pdf_dir,
        dataset_dir=args.dataset,
        use_db=not args.no_db
    )

    if args.single:
        # Analyze single document
        analyzer.load_csv()
        analyzer.load_labels()
        analyzer.load_report()

        # Don't skip on a missing PDF here: for a single document we want a
        # report rather than a silent None.
        analysis = analyzer.analyze_document(args.single, skip_missing_pdf=False)

        print(f"\n{'=' * 60}")
        print(f"Document: {analysis.doc_id}")
        print(f"{'=' * 60}")
        print(f"PDF exists: {analysis.pdf_exists}")
        print(f"PDF type: {analysis.pdf_type}")
        print(f"Pages: {analysis.total_pages}")
        print(f"\nFields (CSV: {analysis.csv_fields_count}, Labeled: {analysis.labeled_fields_count}):")

        for f in analysis.fields:
            status = "✓" if f.labeled else ("✗" if f.expected else "-")
            value_str = f.csv_value[:30] if f.csv_value else "(empty)"
            print(f"  [{status}] {f.field_name}: {value_str}")

            if f.failure_reason:
                print(f"      Reason: {f.failure_reason}")
                if f.details.get('normalized_candidates'):
                    print(f"      Candidates: {f.details['normalized_candidates']}")
                if f.details.get('potential_matches'):
                    print("      Potential matches in PDF:")
                    for pm in f.details['potential_matches'][:3]:
                        print(f"        - '{pm['token']}'")
    else:
        # Full analysis
        print("Running label analysis...")
        results = analyzer.run_analysis(limit=args.limit)
        report = analyzer.generate_report(results, args.output, verbose=args.verbose)
        print_summary(report)


if __name__ == '__main__':
    main()