#!/usr/bin/env python3
"""
Label Analysis CLI

Analyzes auto-generated labels to identify failures and diagnose root causes.
Now reads from the PostgreSQL database instead of JSONL files.
"""

import argparse
import csv
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from src.config import get_db_connection_string

from ..normalize import normalize_field
from ..matcher import FieldMatcher
from ..pdf import is_text_pdf, extract_text_tokens
from ..yolo.annotation_generator import FIELD_CLASSES
from ..data.db import DocumentDB


@dataclass
class FieldAnalysis:
    """Analysis result for a single field."""
    field_name: str
    csv_value: str
    expected: bool  # True if the CSV has a value
    labeled: bool   # True if a label file has this field
    matched: bool   # True if the matcher finds it

    # Diagnosis
    failure_reason: Optional[str] = None
    details: dict = field(default_factory=dict)


@dataclass
class DocumentAnalysis:
    """Analysis result for a document."""
    doc_id: str
    pdf_exists: bool
    pdf_type: str  # "text" or "scanned"
    total_pages: int

    # Per-field analysis
    fields: list[FieldAnalysis] = field(default_factory=list)

    # Summary
    csv_fields_count: int = 0      # Fields with values in CSV
    labeled_fields_count: int = 0  # Fields in label files
    matched_fields_count: int = 0  # Fields the matcher can find

    @property
    def has_issues(self) -> bool:
        """Check if document has any labeling issues."""
        return any(f.expected and not f.labeled for f in self.fields)

    @property
    def missing_labels(self) -> list[FieldAnalysis]:
        """Get fields that should be labeled but aren't."""
        return [f for f in self.fields if f.expected and not f.labeled]


class LabelAnalyzer:
    """Analyzes labels and diagnoses failures."""

    def __init__(
        self,
        csv_path: str,
        pdf_dir: str,
        dataset_dir: str,
        use_db: bool = True
    ):
        self.csv_path = Path(csv_path)
        self.pdf_dir = Path(pdf_dir)
        self.dataset_dir = Path(dataset_dir)
        self.use_db = use_db
        self.matcher = FieldMatcher()
        self.csv_data = {}
        self.label_data = {}
        self.report_data = {}

        # Database connection
        self.db = None
        if use_db:
            self.db = DocumentDB()
            self.db.connect()

        # Class ID to name mapping
        self.class_names = list(FIELD_CLASSES.keys())

    def load_csv(self):
        """Load CSV data."""
        with open(self.csv_path, 'r', encoding='utf-8-sig') as f:
            reader = csv.DictReader(f)
            for row in reader:
                doc_id = row['DocumentId']
                self.csv_data[doc_id] = row
        print(f"Loaded {len(self.csv_data)} records from CSV")

    def load_labels(self):
        """Load all label files from the dataset."""
        for split in ['train', 'val', 'test']:
            label_dir = self.dataset_dir / split / 'labels'
            if not label_dir.exists():
                continue

            for label_file in label_dir.glob('*.txt'):
                # Parse document ID from filename (uuid_page_XXX.txt)
                name = label_file.stem
                parts = name.rsplit('_page_', 1)
                if len(parts) == 2:
                    doc_id = parts[0]
                    page_no = int(parts[1])
                else:
                    continue

                if doc_id not in self.label_data:
                    self.label_data[doc_id] = {'pages': {}, 'split': split}

                # Parse label file
                labels = []
                with open(label_file, 'r') as f:
                    for line in f:
                        parts = line.strip().split()
                        if len(parts) >= 5:
                            class_id = int(parts[0])
                            labels.append({
                                'class_id': class_id,
                                'class_name': self.class_names[class_id],
                                'x_center': float(parts[1]),
                                'y_center': float(parts[2]),
                                'width': float(parts[3]),
                                'height': float(parts[4])
                            })

                self.label_data[doc_id]['pages'][page_no] = labels

        total_docs = len(self.label_data)
        total_labels = sum(
            len(labels)
            for doc in self.label_data.values()
            for labels in doc['pages'].values()
        )
        print(f"Loaded labels for {total_docs} documents ({total_labels} total labels)")
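    # For reference, each line of a label file parsed above follows the YOLO
    # text format:
    #   <class_id> <x_center> <y_center> <width> <height>
    # with coordinates normalized to [0, 1] relative to the page image (per
    # the YOLO convention), e.g. "3 0.512 0.084 0.210 0.018" (illustrative
    # values only).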
    def load_report(self):
        """Load autolabel report from database."""
        if not self.db:
            print("Database not configured, skipping report loading")
            return

        # Get document IDs from CSV to query
        doc_ids = list(self.csv_data.keys())
        if not doc_ids:
            return

        # Process IDs in batches to keep memory use bounded; each document
        # is still fetched individually.
        batch_size = 1000
        loaded = 0
        for i in range(0, len(doc_ids), batch_size):
            batch_ids = doc_ids[i:i + batch_size]
            for doc_id in batch_ids:
                doc = self.db.get_document(doc_id)
                if doc:
                    self.report_data[doc_id] = doc
                    loaded += 1

        print(f"Loaded {loaded} autolabel reports from database")

    def analyze_document(self, doc_id: str, skip_missing_pdf: bool = True) -> Optional[DocumentAnalysis]:
        """Analyze a single document."""
        csv_row = self.csv_data.get(doc_id, {})
        label_info = self.label_data.get(doc_id, {'pages': {}})
        report = self.report_data.get(doc_id, {})

        # Check PDF
        pdf_path = self.pdf_dir / f"{doc_id}.pdf"
        pdf_exists = pdf_path.exists()

        # Skip documents without a PDF if requested
        if skip_missing_pdf and not pdf_exists:
            return None

        pdf_type = "unknown"
        total_pages = 0
        if pdf_exists:
            pdf_type = "text" if is_text_pdf(pdf_path) else "scanned"
            total_pages = len(label_info['pages']) or report.get('total_pages', 0)

        analysis = DocumentAnalysis(
            doc_id=doc_id,
            pdf_exists=pdf_exists,
            pdf_type=pdf_type,
            total_pages=total_pages
        )

        # Get labeled classes
        labeled_classes = set()
        for page_labels in label_info['pages'].values():
            for label in page_labels:
                labeled_classes.add(label['class_name'])

        # Analyze each field
        for field_name in FIELD_CLASSES.keys():
            csv_value = csv_row.get(field_name, '')
            if csv_value is None:
                csv_value = ''
            csv_value = str(csv_value).strip()

            # Handle datetime values (remove the time part)
            if ' 00:00:00' in csv_value:
                csv_value = csv_value.replace(' 00:00:00', '')

            expected = bool(csv_value)
            labeled = field_name in labeled_classes

            field_analysis = FieldAnalysis(
                field_name=field_name,
                csv_value=csv_value,
                expected=expected,
                labeled=labeled,
                matched=False
            )

            if expected:
                analysis.csv_fields_count += 1
            if labeled:
                analysis.labeled_fields_count += 1

            # Diagnose failures
            if expected and not labeled:
                field_analysis.failure_reason = self._diagnose_failure(
                    doc_id, field_name, csv_value, pdf_path, pdf_type, report
                )
                field_analysis.details = self._get_failure_details(
                    doc_id, field_name, csv_value, pdf_path, pdf_type
                )
            elif not expected and labeled:
                field_analysis.failure_reason = "EXTRA_LABEL"
                field_analysis.details = {'note': 'Labeled but no CSV value'}

            analysis.fields.append(field_analysis)

        return analysis

    def _diagnose_failure(
        self,
        doc_id: str,
        field_name: str,
        csv_value: str,
        pdf_path: Path,
        pdf_type: str,
        report: dict
    ) -> str:
        """Diagnose why a field wasn't labeled."""
        if not pdf_path.exists():
            return "PDF_NOT_FOUND"

        if pdf_type == "scanned":
            return "SCANNED_PDF"

        # The PDF exists and has a text layer: re-run the matcher with the
        # current normalizer (not the historical report) to see whether the
        # value would match today.
        try:
            for page_no in range(10):  # Check at most the first 10 pages
                try:
                    tokens = list(extract_text_tokens(pdf_path, page_no))
                    if not tokens:
                        break
                    normalized = normalize_field(field_name, csv_value)
                    matches = self.matcher.find_matches(
                        tokens, field_name, normalized, page_no
                    )
                    if matches:
                        # Would match with the current normalizer
                        return "MATCHER_OK_NOW"
                except Exception:
                    break
            return "VALUE_NOT_IN_PDF"
        except Exception as e:
            return f"PDF_ERROR: {str(e)[:50]}"
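    # Failure-reason codes produced above (and by analyze_document):
    #   PDF_NOT_FOUND    - no PDF file on disk for the document
    #   SCANNED_PDF      - PDF has no extractable text layer
    #   MATCHER_OK_NOW   - the current matcher/normalizer finds the value,
    #                      so the stored labels likely predate a matcher fix
    #   VALUE_NOT_IN_PDF - value not found in the first 10 text pages
    #   PDF_ERROR: <msg> - text extraction raised an exception
    #   EXTRA_LABEL      - field labeled although the CSV value is empty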
    def _get_failure_details(
        self,
        doc_id: str,
        field_name: str,
        csv_value: str,
        pdf_path: Path,
        pdf_type: str
    ) -> dict:
        """Get detailed information about a failure."""
        details = {
            'csv_value': csv_value,
            'normalized_candidates': [],
            'pdf_tokens_sample': [],
            'potential_matches': []
        }

        # Get normalized candidates
        try:
            details['normalized_candidates'] = normalize_field(field_name, csv_value)
        except Exception:
            pass

        # Get PDF tokens if available
        if pdf_path.exists() and pdf_type == "text":
            try:
                tokens = list(extract_text_tokens(pdf_path, 0))[:100]

                # Find tokens that might be related
                candidates = details['normalized_candidates']
                for token in tokens:
                    text = token.text.strip()

                    # Check if any candidate is a substring or similar
                    for cand in candidates:
                        if cand in text or text in cand:
                            details['potential_matches'].append({
                                'token': text,
                                'candidate': cand,
                                'bbox': token.bbox
                            })
                            break

                    # Also collect date-like or number-like tokens for reference
                    if field_name in ('InvoiceDate', 'InvoiceDueDate'):
                        if any(c.isdigit() for c in text) and len(text) >= 6:
                            details['pdf_tokens_sample'].append(text)
                    elif field_name == 'Amount':
                        if any(c.isdigit() for c in text) and (',' in text or '.' in text or len(text) >= 4):
                            details['pdf_tokens_sample'].append(text)

                # Limit samples
                details['pdf_tokens_sample'] = details['pdf_tokens_sample'][:10]
                details['potential_matches'] = details['potential_matches'][:5]
            except Exception:
                pass

        return details

    def run_analysis(self, limit: Optional[int] = None, skip_missing_pdf: bool = True) -> list[DocumentAnalysis]:
        """Run analysis on all documents."""
        self.load_csv()
        self.load_labels()
        self.load_report()

        results = []
        doc_ids = list(self.csv_data.keys())
        skipped = 0

        for doc_id in doc_ids:
            analysis = self.analyze_document(doc_id, skip_missing_pdf=skip_missing_pdf)
            if analysis is None:
                skipped += 1
                continue
            results.append(analysis)
            if limit and len(results) >= limit:
                break

        if skipped > 0:
            print(f"Skipped {skipped} documents without PDF files")

        return results
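    # The JSON report written by generate_report below has the shape:
    #   {
    #     "summary": {... counts plus percentage strings ...},
    #     "failure_reasons": {"<reason>": <count>, ...},
    #     "failures_by_field": {"<field>": {"<reason>": <count>, ...}, ...},
    #     "issues": [{"doc_id", "field", "csv_value", "reason", "details"}, ...]
    #   }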
    def generate_report(
        self,
        results: list[DocumentAnalysis],
        output_path: str,
        verbose: bool = False
    ):
        """Generate analysis report."""
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        # Collect statistics
        stats = {
            'total_documents': len(results),
            'documents_with_issues': 0,
            'total_expected_fields': 0,
            'total_labeled_fields': 0,
            'missing_labels': 0,
            'extra_labels': 0,
            'failure_reasons': defaultdict(int),
            'failures_by_field': defaultdict(lambda: defaultdict(int))
        }

        issues = []

        for analysis in results:
            stats['total_expected_fields'] += analysis.csv_fields_count
            stats['total_labeled_fields'] += analysis.labeled_fields_count

            if analysis.has_issues:
                stats['documents_with_issues'] += 1

            for f in analysis.fields:
                if f.expected and not f.labeled:
                    stats['missing_labels'] += 1
                    stats['failure_reasons'][f.failure_reason] += 1
                    stats['failures_by_field'][f.field_name][f.failure_reason] += 1
                    issues.append({
                        'doc_id': analysis.doc_id,
                        'field': f.field_name,
                        'csv_value': f.csv_value,
                        'reason': f.failure_reason,
                        'details': f.details if verbose else {}
                    })
                elif not f.expected and f.labeled:
                    stats['extra_labels'] += 1

        # Write JSON report (guard divisions against empty result sets)
        report = {
            'summary': {
                'total_documents': stats['total_documents'],
                'documents_with_issues': stats['documents_with_issues'],
                'issue_rate': f"{stats['documents_with_issues'] / max(1, stats['total_documents']) * 100:.1f}%",
                'total_expected_fields': stats['total_expected_fields'],
                'total_labeled_fields': stats['total_labeled_fields'],
                'label_coverage': f"{stats['total_labeled_fields'] / max(1, stats['total_expected_fields']) * 100:.1f}%",
                'missing_labels': stats['missing_labels'],
                'extra_labels': stats['extra_labels']
            },
            'failure_reasons': dict(stats['failure_reasons']),
            'failures_by_field': {
                field_name: dict(reasons)
                for field_name, reasons in stats['failures_by_field'].items()
            },
            'issues': issues
        }

        with open(output, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"\nReport saved to: {output}")
        return report


def print_summary(report: dict):
    """Print summary to console."""
    summary = report['summary']

    print("\n" + "=" * 60)
    print("LABEL ANALYSIS SUMMARY")
    print("=" * 60)

    print("\nDocuments:")
    print(f"  Total: {summary['total_documents']}")
    print(f"  With issues: {summary['documents_with_issues']} ({summary['issue_rate']})")

    print("\nFields:")
    print(f"  Expected: {summary['total_expected_fields']}")
    print(f"  Labeled: {summary['total_labeled_fields']} ({summary['label_coverage']})")
    print(f"  Missing: {summary['missing_labels']}")
    print(f"  Extra: {summary['extra_labels']}")

    print("\nFailure Reasons:")
    for reason, count in sorted(report['failure_reasons'].items(), key=lambda x: -x[1]):
        print(f"  {reason}: {count}")

    print("\nFailures by Field:")
    for field_name, reasons in report['failures_by_field'].items():
        total = sum(reasons.values())
        print(f"  {field_name}: {total}")
        for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
            print(f"    - {reason}: {count}")

    # Show sample issues
    if report['issues']:
        print("\n" + "-" * 60)
        print("SAMPLE ISSUES (first 10)")
        print("-" * 60)
        for issue in report['issues'][:10]:
            print(f"\n[{issue['doc_id']}] {issue['field']}")
            print(f"  CSV value: {issue['csv_value']}")
            print(f"  Reason: {issue['reason']}")
            if issue.get('details'):
                details = issue['details']
                if details.get('normalized_candidates'):
                    print(f"  Candidates: {details['normalized_candidates'][:5]}")
                if details.get('pdf_tokens_sample'):
                    print(f"  PDF samples: {details['pdf_tokens_sample'][:5]}")
                if details.get('potential_matches'):
                    print("  Potential matches:")
                    for pm in details['potential_matches'][:3]:
                        print(f"    - token='{pm['token']}' matches candidate='{pm['candidate']}'")
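# Example invocations (sketch; the script name below is hypothetical, the
# flags are defined in main()):
#   python analyze_labels.py --limit 100 --verbose
#   python analyze_labels.py --single <document-uuid> --no-db
#   python analyze_labels.py -c data/structured_data/export.csv -o reports/out.json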
def main():
    parser = argparse.ArgumentParser(
        description='Analyze auto-generated labels and diagnose failures'
    )
    parser.add_argument(
        '--csv', '-c',
        default='data/structured_data/document_export_20260109_220326.csv',
        help='Path to structured data CSV file'
    )
    parser.add_argument(
        '--pdf-dir', '-p',
        default='data/raw_pdfs',
        help='Directory containing PDF files'
    )
    parser.add_argument(
        '--dataset', '-d',
        default='data/dataset',
        help='Dataset directory with labels'
    )
    parser.add_argument(
        '--output', '-o',
        default='reports/label_analysis.json',
        help='Output path for analysis report'
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of documents to analyze'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Include detailed failure information'
    )
    parser.add_argument(
        '--single', '-s',
        help='Analyze single document ID'
    )
    parser.add_argument(
        '--no-db',
        action='store_true',
        help='Skip database, only analyze label files'
    )

    args = parser.parse_args()

    analyzer = LabelAnalyzer(
        csv_path=args.csv,
        pdf_dir=args.pdf_dir,
        dataset_dir=args.dataset,
        use_db=not args.no_db
    )

    if args.single:
        # Analyze a single document; don't skip missing PDFs here so an
        # analysis object is always returned
        analyzer.load_csv()
        analyzer.load_labels()
        analyzer.load_report()
        analysis = analyzer.analyze_document(args.single, skip_missing_pdf=False)

        print(f"\n{'=' * 60}")
        print(f"Document: {analysis.doc_id}")
        print(f"{'=' * 60}")
        print(f"PDF exists: {analysis.pdf_exists}")
        print(f"PDF type: {analysis.pdf_type}")
        print(f"Pages: {analysis.total_pages}")
        print(f"\nFields (CSV: {analysis.csv_fields_count}, Labeled: {analysis.labeled_fields_count}):")

        for f in analysis.fields:
            status = "✓" if f.labeled else ("✗" if f.expected else "-")
            value_str = f.csv_value[:30] if f.csv_value else "(empty)"
            print(f"  [{status}] {f.field_name}: {value_str}")
            if f.failure_reason:
                print(f"      Reason: {f.failure_reason}")
            if f.details.get('normalized_candidates'):
                print(f"      Candidates: {f.details['normalized_candidates']}")
            if f.details.get('potential_matches'):
                print("      Potential matches in PDF:")
                for pm in f.details['potential_matches'][:3]:
                    print(f"        - '{pm['token']}'")
    else:
        # Full analysis
        print("Running label analysis...")
        results = analyzer.run_analysis(limit=args.limit)
        report = analyzer.generate_report(results, args.output, verbose=args.verbose)
        print_summary(report)


if __name__ == '__main__':
    main()