Fix the skill

2026-02-04 23:30:06 +01:00
parent fa996683c3
commit 15533285c6
4 changed files with 788 additions and 1730 deletions
--- a/.opencode/skills/coding-standards/SKILL.md
+++ b/.opencode/skills/coding-standards/SKILL.md
@@ -1,665 +1,234 @@
 ---
 name: coding-standards
-description: Universal coding standards, best practices, and patterns for Python, FastAPI, and data processing development.
+description: .NET/C# coding standards and best practices.
 ---

-# Coding Standards & Best Practices
+# .NET Coding Standards

-Python coding standards for the Invoice Master project.
+## Core Principles

-## Code Quality Principles
+- **Readability First** - Clear names, self-documenting code
+- **KISS** - Simplest solution that works
+- **DRY** - Extract common logic, avoid copy-paste
+- **YAGNI** - Don't build features before needed

-### 1. Readability First
- Code is read more than written
- Clear variable and function names
- Self-documenting code preferred over comments
- Consistent formatting (follow PEP 8)
+## Naming Conventions

-### 2. KISS (Keep It Simple, Stupid)
- Simplest solution that works
- Avoid over-engineering
- No premature optimization
- Easy to understand > clever code
+```csharp
+// PascalCase: Types, methods, properties, public fields
+public class DocumentService { }
+public async Task<Document> GetByIdAsync(Guid id) { }
+public string InvoiceNumber { get; init; }

-### 3. DRY (Don't Repeat Yourself)
- Extract common logic into functions
- Create reusable utilities
- Share modules across the codebase
- Avoid copy-paste programming
+// camelCase: parameters and local variables; _camelCase (underscore prefix): private fields
+private readonly ILogger<DocumentService> _logger;
+public void Process(string documentId, int pageCount) { }

-### 4. YAGNI (You Aren't Gonna Need It)
- Don't build features before they're needed
- Avoid speculative generality
- Add complexity only when required
- Start simple, refactor when needed
+// Interfaces: I prefix
+public interface IDocumentRepository { }

-## Python Standards
-
-### Variable Naming
-
-```python
-# GOOD: Descriptive names
-invoice_number = "INV-2024-001"
-is_valid_document = True
-total_confidence_score = 0.95
-
-# BAD: Unclear names
-inv = "INV-2024-001"
-flag = True
-x = 0.95
+// Async methods: Async suffix
+public async Task<Document> LoadAsync(CancellationToken ct)
 ```

-### Function Naming
+## Modern C# Features

-```python
-# GOOD: Verb-noun pattern with type hints
-def extract_invoice_fields(pdf_path: Path) -> dict[str, str]:
-    """Extract fields from invoice PDF."""
-    ...
+```csharp
+// Records for DTOs and value objects
+public sealed record CreateDocumentRequest(string Name, string Type);
+public sealed record DocumentDto(Guid Id, string Name, DateTime CreatedAt);

-def calculate_confidence(predictions: list[float]) -> float:
-    """Calculate average confidence score."""
-    ...
+// Primary constructors
+public class DocumentService(IRepository<Document> repo, ILogger<DocumentService> logger)
+{
+    public async Task<Document?> GetAsync(Guid id, CancellationToken ct) =>
+        await repo.GetByIdAsync(id, ct);
+}

-def is_valid_bankgiro(value: str) -> bool:
-    """Check if value is valid Bankgiro number."""
-    ...
+// Pattern matching
+var message = result switch
+{
+    { IsSuccess: true, Value: var doc } => $"Found: {doc.Name}",
+    { Error: var err } => $"Error: {err}",
+    _ => "Unknown"
+};

-# BAD: Unclear or noun-only
-def invoice(path):
-    ...
+// Collection expressions
+int[] numbers = [1, 2, 3];
+List<string> names = ["Alice", "Bob"];

-def confidence(p):
-    ...
-
-def bankgiro(v):
-    ...
+// Null-conditional (?.) and null-coalescing (??, ??=) operators
+var name = user?.Name ?? "Unknown";
+list ??= [];
 ```

-### Type Hints (REQUIRED)
+## Immutability (Critical)

-```python
-# GOOD: Full type annotations
-from typing import Optional
-from pathlib import Path
-from dataclasses import dataclass
+```csharp
+// GOOD: Create new objects
+public record User(string Name, int Age)
+{
+    public User WithName(string newName) => this with { Name = newName };
+}

-@dataclass
-class InferenceResult:
-    document_id: str
-    fields: dict[str, str]
-    confidence: dict[str, float]
-    processing_time_ms: float
+// GOOD: Expose collections through read-only views
+public IReadOnlyList<string> GetNames() => _names.AsReadOnly();

-def process_document(
-    pdf_path: Path,
-    confidence_threshold: float = 0.5
-) -> InferenceResult:
-    """Process PDF and return extracted fields."""
-    ...
-
-# BAD: No type hints
-def process_document(pdf_path, confidence_threshold=0.5):
-    ...
+// BAD: In-place mutation (needs a settable property; positional record properties are init-only)
+public void UpdateUser(User user, string name)
+{
+    user.Name = name;  // MUTATION!
+}
 ```

-### Immutability Pattern (CRITICAL)
+## Error Handling

-```python
-# GOOD: Create new objects, don't mutate
-def update_fields(fields: dict[str, str], updates: dict[str, str]) -> dict[str, str]:
-    return {**fields, **updates}
+```csharp
+// Domain exceptions
+public class NotFoundException(string resource, Guid id)
+    : Exception($"{resource} not found: {id}");

-def add_item(items: list[str], new_item: str) -> list[str]:
-    return [*items, new_item]
+// Log unexpected failures and rethrow; let domain exceptions propagate unlogged
+public async Task<Document> LoadAsync(Guid id, CancellationToken ct)
+{
+    try
+    {
+        var doc = await _repo.GetByIdAsync(id, ct);
+        return doc ?? throw new NotFoundException("Document", id);
+    }
+    catch (Exception ex) when (ex is not NotFoundException)
+    {
+        _logger.LogError(ex, "Failed to load document {Id}", id);
+        throw;
+    }
+}

-# BAD: Direct mutation
-def update_fields(fields: dict[str, str], updates: dict[str, str]) -> dict[str, str]:
-    fields.update(updates)  # MUTATION!
-    return fields
-
-def add_item(items: list[str], new_item: str) -> list[str]:
-    items.append(new_item)  # MUTATION!
-    return items
+// Result pattern for expected failures
+public Result<Document> Validate(CreateRequest request) =>
+    string.IsNullOrEmpty(request.Name)
+        ? Result<Document>.Fail("Name required")
+        : Result<Document>.Ok(new Document(request.Name));
 ```

-### Error Handling
+## Async/Await

-```python
-import logging
+```csharp
+// Always pass CancellationToken
+public async Task<Document> GetAsync(Guid id, CancellationToken ct)

-logger = logging.getLogger(__name__)
+// Use ConfigureAwait(false) in libraries
+await _client.GetAsync(url, ct).ConfigureAwait(false);

-# GOOD: Comprehensive error handling with logging
-def load_model(model_path: Path) -> Model:
-    """Load YOLO model from path."""
-    try:
-        if not model_path.exists():
-            raise FileNotFoundError(f"Model not found: {model_path}")
+// Avoid async void
+public async Task ProcessAsync() { }  // Good
+public async void Process() { }       // Bad

-        model = YOLO(str(model_path))
-        logger.info(f"Model loaded: {model_path}")
-        return model
-    except Exception as e:
-        logger.error(f"Failed to load model: {e}")
-        raise RuntimeError(f"Model loading failed: {model_path}") from e
-
-# BAD: No error handling
-def load_model(model_path):
-    return YOLO(str(model_path))
-
-# BAD: Bare except
-def load_model(model_path):
-    try:
-        return YOLO(str(model_path))
-    except:  # Never use bare except!
-        return None
+// Parallel when independent
+var tasks = ids.Select(id => GetAsync(id, ct));
+var results = await Task.WhenAll(tasks);
 ```

-### Async Best Practices
+## LINQ Best Practices

-```python
-import asyncio
+```csharp
+// Prefer method syntax for complex queries
+var result = documents
+    .Where(d => d.Status == "Active")
+    .OrderByDescending(d => d.CreatedAt)
+    .Select(d => new DocumentDto(d.Id, d.Name, d.CreatedAt))
+    .Take(10);

-# GOOD: Parallel execution when possible
-async def process_batch(pdf_paths: list[Path]) -> list[InferenceResult]:
-    tasks = [process_document(path) for path in pdf_paths]
-    results = await asyncio.gather(*tasks, return_exceptions=True)
+// Use Any() instead of Count() > 0
+if (documents.Any(d => d.IsValid)) { }

-    # Handle exceptions
-    valid_results = []
-    for path, result in zip(pdf_paths, results):
-        if isinstance(result, Exception):
-            logger.error(f"Failed to process {path}: {result}")
-        else:
-            valid_results.append(result)
-    return valid_results
-
-# BAD: Sequential when unnecessary
-async def process_batch(pdf_paths: list[Path]) -> list[InferenceResult]:
-    results = []
-    for path in pdf_paths:
-        result = await process_document(path)
-        results.append(result)
-    return results
-```
-
-### Context Managers
-
-```python
-from contextlib import contextmanager
-from pathlib import Path
-import tempfile
-
-# GOOD: Proper resource management
-@contextmanager
-def temp_pdf_copy(pdf_path: Path):
-    """Create temporary copy of PDF for processing."""
-    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
-        tmp.write(pdf_path.read_bytes())
-        tmp_path = Path(tmp.name)
-    try:
-        yield tmp_path
-    finally:
-        tmp_path.unlink(missing_ok=True)
-
-# Usage
-with temp_pdf_copy(original_pdf) as tmp_pdf:
-    result = process_pdf(tmp_pdf)
-```
-
-## FastAPI Best Practices
-
-### Route Structure
-
-```python
-from fastapi import APIRouter, HTTPException, Depends, Query, File, UploadFile
-from pydantic import BaseModel
-
-router = APIRouter(prefix="/api/v1", tags=["inference"])
-
-class InferenceResponse(BaseModel):
-    success: bool
-    document_id: str
-    fields: dict[str, str]
-    confidence: dict[str, float]
-    processing_time_ms: float
-
-@router.post("/infer", response_model=InferenceResponse)
-async def infer_document(
-    file: UploadFile = File(...),
-    confidence_threshold: float = Query(0.5, ge=0.0, le=1.0)
-) -> InferenceResponse:
-    """Process invoice PDF and extract fields."""
-    if not file.filename.endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files accepted")
-
-    result = await inference_service.process(file, confidence_threshold)
-    return InferenceResponse(
-        success=True,
-        document_id=result.document_id,
-        fields=result.fields,
-        confidence=result.confidence,
-        processing_time_ms=result.processing_time_ms
-    )
-```
-
-### Input Validation with Pydantic
-
-```python
-from pydantic import BaseModel, Field, field_validator
-from datetime import date
-import re
-
-class InvoiceData(BaseModel):
-    invoice_number: str = Field(..., min_length=1, max_length=50)
-    invoice_date: date
-    amount: float = Field(..., gt=0)
-    bankgiro: str | None = None
-    ocr_number: str | None = None
-
-    @field_validator("bankgiro")
-    @classmethod
-    def validate_bankgiro(cls, v: str | None) -> str | None:
-        if v is None:
-            return None
-        # Bankgiro: 7-8 digits
-        cleaned = re.sub(r"[^0-9]", "", v)
-        if not (7 <= len(cleaned) <= 8):
-            raise ValueError("Bankgiro must be 7-8 digits")
-        return cleaned
-
-    @field_validator("ocr_number")
-    @classmethod
-    def validate_ocr(cls, v: str | None) -> str | None:
-        if v is None:
-            return None
-        # OCR: 2-25 digits
-        cleaned = re.sub(r"[^0-9]", "", v)
-        if not (2 <= len(cleaned) <= 25):
-            raise ValueError("OCR must be 2-25 digits")
-        return cleaned
-```
-
-### Response Format
-
-```python
-from pydantic import BaseModel
-from typing import Generic, TypeVar
-
-T = TypeVar("T")
-
-class ApiResponse(BaseModel, Generic[T]):
-    success: bool
-    data: T | None = None
-    error: str | None = None
-    meta: dict | None = None
-
-# Success response
-return ApiResponse(
-    success=True,
-    data=result,
-    meta={"processing_time_ms": elapsed_ms}
-)
-
-# Error response
-return ApiResponse(
-    success=False,
-    error="Invalid PDF format"
-)
+// Avoid multiple enumerations
+var list = documents.ToList();  // Materialize once
+var count = list.Count;
+var first = list.FirstOrDefault();
 ```

 ## File Organization

-### Project Structure
-
 ```
 src/
-├── cli/                  # Command-line interfaces
-│   ├── autolabel.py
-│   ├── train.py
-│   └── infer.py
-├── pdf/                  # PDF processing
-│   ├── extractor.py
-│   └── renderer.py
-├── ocr/                  # OCR processing
-│   ├── paddle_ocr.py
-│   └── machine_code_parser.py
-├── inference/            # Inference pipeline
-│   ├── pipeline.py
-│   ├── yolo_detector.py
-│   └── field_extractor.py
-├── normalize/            # Field normalization
-│   ├── base.py
-│   ├── date_normalizer.py
-│   └── amount_normalizer.py
-├── web/                  # FastAPI application
-│   ├── app.py
-│   ├── routes.py
-│   ├── services.py
-│   └── schemas.py
-└── utils/                # Shared utilities
-    ├── validators.py
-    ├── text_cleaner.py
-    └── logging.py
-tests/                    # Mirror of src structure
-    ├── test_pdf/
-    ├── test_ocr/
-    └── test_inference/
+  Domain/           # Entities, value objects
+  Application/      # Use cases, DTOs, interfaces
+  Infrastructure/   # EF Core, external services
+  Api/              # Controllers, middleware
+tests/
+  Unit/
+  Integration/
 ```

-### File Naming
+**Guidelines:**
+- Max 800 lines per file (typical 200-400)
+- Max 50 lines per method
+- One class per file (except nested)
+- Within each layer, group by feature, not by type

-```
-src/ocr/paddle_ocr.py           # snake_case for modules
-src/inference/yolo_detector.py  # snake_case for modules
-tests/test_paddle_ocr.py        # test_ prefix for tests
-config.py                       # snake_case for config
+## Code Smells
+
+```csharp
+// BAD: Deep nesting
+if (doc != null)
+    if (doc.IsValid)
+        if (doc.HasFields)
+            // ...
+
+// GOOD: Early returns
+if (doc is null) return null;
+if (!doc.IsValid) return null;
+if (!doc.HasFields) return null;
+// ...
+
+// BAD: Magic numbers
+if (confidence > 0.5) { }
+
+// GOOD: Named constants
+private const double ConfidenceThreshold = 0.5;
+if (confidence > ConfidenceThreshold) { }
 ```

-### Module Size Guidelines
+## Logging

- **Maximum**: 800 lines per file
- **Typical**: 200-400 lines per file
- **Functions**: Max 50 lines each
- Extract utilities when modules grow too large
+```csharp
+// Structured logging with templates
+_logger.LogInformation("Processing document {DocumentId}", docId);
+_logger.LogError(ex, "Failed to process {DocumentId}", docId);

-## Comments & Documentation
-
-### When to Comment
-
-```python
-# GOOD: Explain WHY, not WHAT
-# Swedish Bankgiro uses Luhn algorithm with weight [1,2,1,2...]
-def validate_bankgiro_checksum(bankgiro: str) -> bool:
-    ...
-
-# Payment line format: 7 groups separated by #, checksum at end
-def parse_payment_line(line: str) -> PaymentLineData:
-    ...
-
-# BAD: Stating the obvious
-# Increment counter by 1
-count += 1
-
-# Set name to user's name
-name = user.name
+// Appropriate levels
+// LogDebug       - Development details
+// LogInformation - Normal operations
+// LogWarning     - Potential issues
+// LogError       - Errors with exceptions
 ```

-### Docstrings for Public APIs
+## Testing (AAA Pattern)

-```python
-def extract_invoice_fields(
-    pdf_path: Path,
-    confidence_threshold: float = 0.5,
-    use_gpu: bool = True
-) -> InferenceResult:
-    """Extract structured fields from Swedish invoice PDF.
+```csharp
+[Fact]
+public async Task GetById_WithValidId_ReturnsDocument()
+{
+    // Arrange
+    var repo = Substitute.For<IRepository<Document>>();
+    repo.GetByIdAsync(Arg.Any<Guid>(), Arg.Any<CancellationToken>())
+        .Returns(new Document("Test"));
+    var service = new DocumentService(repo, Substitute.For<ILogger<DocumentService>>());

-    Uses YOLOv11 for field detection and PaddleOCR for text extraction.
-    Applies field-specific normalization and validation.
+    // Act
+    var result = await service.GetAsync(Guid.NewGuid(), CancellationToken.None);

-    Args:
-        pdf_path: Path to the invoice PDF file.
-        confidence_threshold: Minimum confidence for field detection (0.0-1.0).
-        use_gpu: Whether to use GPU acceleration.
-
-    Returns:
-        InferenceResult containing extracted fields and confidence scores.
-
-    Raises:
-        FileNotFoundError: If PDF file doesn't exist.
-        ProcessingError: If OCR or detection fails.
-
-    Example:
-        >>> result = extract_invoice_fields(Path("invoice.pdf"))
-        >>> print(result.fields["invoice_number"])
-        "INV-2024-001"
-    """
-    ...
+    // Assert
+    result.Should().NotBeNull();
+    result!.Name.Should().Be("Test");
+}
 ```

-## Performance Best Practices
+## Key Rules

-### Caching
-
-```python
-from functools import lru_cache
-from cachetools import TTLCache
-
-# Static data: LRU cache
-@lru_cache(maxsize=100)
-def get_field_config(field_name: str) -> FieldConfig:
-    """Load field configuration (cached)."""
-    return load_config(field_name)
-
-# Dynamic data: TTL cache
-_document_cache = TTLCache(maxsize=1000, ttl=300)  # 5 minutes
-
-def get_document_cached(doc_id: str) -> Document | None:
-    if doc_id in _document_cache:
-        return _document_cache[doc_id]
-
-    doc = repo.find_by_id(doc_id)
-    if doc:
-        _document_cache[doc_id] = doc
-    return doc
-```
-
-### Database Queries
-
-```python
-# GOOD: Select only needed columns
-cur.execute("""
-    SELECT id, status, fields->>'invoice_number'
-    FROM documents
-    WHERE status = %s
-    LIMIT %s
-""", ('processed', 10))
-
-# BAD: Select everything
-cur.execute("SELECT * FROM documents")
-
-# GOOD: Batch operations
-cur.executemany(
-    "INSERT INTO labels (doc_id, field, value) VALUES (%s, %s, %s)",
-    [(doc_id, f, v) for f, v in fields.items()]
-)
-
-# BAD: Individual inserts in loop
-for field, value in fields.items():
-    cur.execute("INSERT INTO labels ...", (doc_id, field, value))
-```
-
-### Lazy Loading
-
-```python
-class InferencePipeline:
-    def __init__(self, model_path: Path):
-        self.model_path = model_path
-        self._model: YOLO | None = None
-        self._ocr: PaddleOCR | None = None
-
-    @property
-    def model(self) -> YOLO:
-        """Lazy load YOLO model."""
-        if self._model is None:
-            self._model = YOLO(str(self.model_path))
-        return self._model
-
-    @property
-    def ocr(self) -> PaddleOCR:
-        """Lazy load PaddleOCR."""
-        if self._ocr is None:
-            self._ocr = PaddleOCR(use_angle_cls=True, lang="latin")
-        return self._ocr
-```
-
-## Testing Standards
-
-### Test Structure (AAA Pattern)
-
-```python
-def test_extract_bankgiro_valid():
-    # Arrange
-    text = "Bankgiro: 123-4567"
-
-    # Act
-    result = extract_bankgiro(text)
-
-    # Assert
-    assert result == "1234567"
-
-def test_extract_bankgiro_invalid_returns_none():
-    # Arrange
-    text = "No bankgiro here"
-
-    # Act
-    result = extract_bankgiro(text)
-
-    # Assert
-    assert result is None
-```
-
-### Test Naming
-
-```python
-# GOOD: Descriptive test names
-def test_parse_payment_line_extracts_all_fields(): ...
-def test_parse_payment_line_handles_missing_checksum(): ...
-def test_validate_ocr_returns_false_for_invalid_checksum(): ...
-
-# BAD: Vague test names
-def test_parse(): ...
-def test_works(): ...
-def test_payment_line(): ...
-```
-
-### Fixtures
-
-```python
-import pytest
-from pathlib import Path
-
-@pytest.fixture
-def sample_invoice_pdf(tmp_path: Path) -> Path:
-    """Create sample invoice PDF for testing."""
-    pdf_path = tmp_path / "invoice.pdf"
-    # Create test PDF...
-    return pdf_path
-
-@pytest.fixture
-def inference_pipeline(sample_model_path: Path) -> InferencePipeline:
-    """Create inference pipeline with test model."""
-    return InferencePipeline(sample_model_path)
-
-def test_process_invoice(inference_pipeline, sample_invoice_pdf):
-    result = inference_pipeline.process(sample_invoice_pdf)
-    assert result.fields.get("invoice_number") is not None
-```
-
-## Code Smell Detection
-
-### 1. Long Functions
-
-```python
-# BAD: Function > 50 lines
-def process_document():
-    # 100 lines of code...
-
-# GOOD: Split into smaller functions
-def process_document(pdf_path: Path) -> InferenceResult:
-    image = render_pdf(pdf_path)
-    detections = detect_fields(image)
-    ocr_results = extract_text(image, detections)
-    fields = normalize_fields(ocr_results)
-    return build_result(fields)
-```
-
-### 2. Deep Nesting
-
-```python
-# BAD: 5+ levels of nesting
-if document:
-    if document.is_valid:
-        if document.has_fields:
-            if field in document.fields:
-                if document.fields[field]:
-                    # Do something
-
-# GOOD: Early returns
-if not document:
-    return None
-if not document.is_valid:
-    return None
-if not document.has_fields:
-    return None
-if field not in document.fields:
-    return None
-if not document.fields[field]:
-    return None
-
-# Do something
-```
-
-### 3. Magic Numbers
-
-```python
-# BAD: Unexplained numbers
-if confidence > 0.5:
-    ...
-time.sleep(3)
-
-# GOOD: Named constants
-CONFIDENCE_THRESHOLD = 0.5
-RETRY_DELAY_SECONDS = 3
-
-if confidence > CONFIDENCE_THRESHOLD:
-    ...
-time.sleep(RETRY_DELAY_SECONDS)
-```
-
-### 4. Mutable Default Arguments
-
-```python
-# BAD: Mutable default argument
-def process_fields(fields: list = []):  # DANGEROUS!
-    fields.append("new_field")
-    return fields
-
-# GOOD: Use None as default
-def process_fields(fields: list | None = None) -> list:
-    if fields is None:
-        fields = []
-    return [*fields, "new_field"]
-```
-
-## Logging Standards
-
-```python
-import logging
-
-# Module-level logger
-logger = logging.getLogger(__name__)
-
-# GOOD: Appropriate log levels
-logger.debug("Processing document: %s", doc_id)
-logger.info("Document processed successfully: %s", doc_id)
-logger.warning("Low confidence score: %.2f", confidence)
-logger.error("Failed to process document: %s", error)
-
-# GOOD: Structured logging with extra data
-logger.info(
-    "Inference complete",
-    extra={
-        "document_id": doc_id,
-        "field_count": len(fields),
-        "processing_time_ms": elapsed_ms
-    }
-)
-
-# BAD: Using print()
-print(f"Processing {doc_id}")  # Never in production!
-```
-
-**Remember**: Code quality is not negotiable. Clear, maintainable Python code with proper type hints enables confident development and refactoring.
+- Always use `CancellationToken` for async methods
+- Prefer `record` types for DTOs and immutable data
+- Use `IReadOnlyList<T>` for collection return types
+- Never use `async void` (except event handlers)
+- Always handle `null` with pattern matching or null operators
+- Use structured logging, never `Console.WriteLine`