Files

Yaojia Wang d5101e3604 Add claude config

2026-01-25 16:17:23 +01:00

16 KiB

Raw Permalink Blame History

name, description

name	description
coding-standards	Universal coding standards, best practices, and patterns for Python, FastAPI, and data processing development.

Coding Standards & Best Practices

Python coding standards for the Invoice Master project.

Code Quality Principles

1. Readability First

Code is read more than written
Clear variable and function names
Self-documenting code preferred over comments
Consistent formatting (follow PEP 8)

2. KISS (Keep It Simple, Stupid)

Simplest solution that works
Avoid over-engineering
No premature optimization
Easy to understand > clever code

3. DRY (Don't Repeat Yourself)

Extract common logic into functions
Create reusable utilities
Share modules across the codebase
Avoid copy-paste programming

4. YAGNI (You Aren't Gonna Need It)

Don't build features before they're needed
Avoid speculative generality
Add complexity only when required
Start simple, refactor when needed

Python Standards

Variable Naming

# GOOD: Descriptive names
invoice_number = "INV-2024-001"
is_valid_document = True
total_confidence_score = 0.95

# BAD: Unclear names
inv = "INV-2024-001"
flag = True
x = 0.95

Function Naming

# GOOD: Verb-noun pattern with type hints
def extract_invoice_fields(pdf_path: Path) -> dict[str, str]:
    """Extract fields from invoice PDF."""
    ...

def calculate_confidence(predictions: list[float]) -> float:
    """Calculate average confidence score."""
    ...

def is_valid_bankgiro(value: str) -> bool:
    """Check if value is valid Bankgiro number."""
    ...

# BAD: Unclear or noun-only
def invoice(path):
    ...

def confidence(p):
    ...

def bankgiro(v):
    ...

Type Hints (REQUIRED)

# GOOD: Full type annotations
from typing import Optional
from pathlib import Path
from dataclasses import dataclass

@dataclass
class InferenceResult:
    document_id: str
    fields: dict[str, str]
    confidence: dict[str, float]
    processing_time_ms: float

def process_document(
    pdf_path: Path,
    confidence_threshold: float = 0.5
) -> InferenceResult:
    """Process PDF and return extracted fields."""
    ...

# BAD: No type hints
def process_document(pdf_path, confidence_threshold=0.5):
    ...

Immutability Pattern (CRITICAL)

# GOOD: Create new objects, don't mutate
def update_fields(fields: dict[str, str], updates: dict[str, str]) -> dict[str, str]:
    return {**fields, **updates}

def add_item(items: list[str], new_item: str) -> list[str]:
    return [*items, new_item]

# BAD: Direct mutation
def update_fields(fields: dict[str, str], updates: dict[str, str]) -> dict[str, str]:
    fields.update(updates)  # MUTATION!
    return fields

def add_item(items: list[str], new_item: str) -> list[str]:
    items.append(new_item)  # MUTATION!
    return items

Error Handling

import logging

logger = logging.getLogger(__name__)

# GOOD: Comprehensive error handling with logging
def load_model(model_path: Path) -> Model:
    """Load YOLO model from path."""
    try:
        if not model_path.exists():
            raise FileNotFoundError(f"Model not found: {model_path}")

        model = YOLO(str(model_path))
        logger.info(f"Model loaded: {model_path}")
        return model
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise RuntimeError(f"Model loading failed: {model_path}") from e

# BAD: No error handling
def load_model(model_path):
    return YOLO(str(model_path))

# BAD: Bare except
def load_model(model_path):
    try:
        return YOLO(str(model_path))
    except:  # Never use bare except!
        return None

Async Best Practices

import asyncio

# GOOD: Parallel execution when possible
async def process_batch(pdf_paths: list[Path]) -> list[InferenceResult]:
    tasks = [process_document(path) for path in pdf_paths]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Handle exceptions
    valid_results = []
    for path, result in zip(pdf_paths, results):
        if isinstance(result, Exception):
            logger.error(f"Failed to process {path}: {result}")
        else:
            valid_results.append(result)
    return valid_results

# BAD: Sequential when unnecessary
async def process_batch(pdf_paths: list[Path]) -> list[InferenceResult]:
    results = []
    for path in pdf_paths:
        result = await process_document(path)
        results.append(result)
    return results

Context Managers

from contextlib import contextmanager
from pathlib import Path
import tempfile

# GOOD: Proper resource management
@contextmanager
def temp_pdf_copy(pdf_path: Path):
    """Create temporary copy of PDF for processing."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_path.read_bytes())
        tmp_path = Path(tmp.name)
    try:
        yield tmp_path
    finally:
        tmp_path.unlink(missing_ok=True)

# Usage
with temp_pdf_copy(original_pdf) as tmp_pdf:
    result = process_pdf(tmp_pdf)

FastAPI Best Practices

Route Structure

from fastapi import APIRouter, HTTPException, Depends, Query, File, UploadFile
from pydantic import BaseModel

router = APIRouter(prefix="/api/v1", tags=["inference"])

class InferenceResponse(BaseModel):
    success: bool
    document_id: str
    fields: dict[str, str]
    confidence: dict[str, float]
    processing_time_ms: float

@router.post("/infer", response_model=InferenceResponse)
async def infer_document(
    file: UploadFile = File(...),
    confidence_threshold: float = Query(0.5, ge=0.0, le=1.0)
) -> InferenceResponse:
    """Process invoice PDF and extract fields."""
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files accepted")

    result = await inference_service.process(file, confidence_threshold)
    return InferenceResponse(
        success=True,
        document_id=result.document_id,
        fields=result.fields,
        confidence=result.confidence,
        processing_time_ms=result.processing_time_ms
    )

Input Validation with Pydantic

from pydantic import BaseModel, Field, field_validator
from datetime import date
import re

class InvoiceData(BaseModel):
    invoice_number: str = Field(..., min_length=1, max_length=50)
    invoice_date: date
    amount: float = Field(..., gt=0)
    bankgiro: str | None = None
    ocr_number: str | None = None

    @field_validator("bankgiro")
    @classmethod
    def validate_bankgiro(cls, v: str | None) -> str | None:
        if v is None:
            return None
        # Bankgiro: 7-8 digits
        cleaned = re.sub(r"[^0-9]", "", v)
        if not (7 <= len(cleaned) <= 8):
            raise ValueError("Bankgiro must be 7-8 digits")
        return cleaned

    @field_validator("ocr_number")
    @classmethod
    def validate_ocr(cls, v: str | None) -> str | None:
        if v is None:
            return None
        # OCR: 2-25 digits
        cleaned = re.sub(r"[^0-9]", "", v)
        if not (2 <= len(cleaned) <= 25):
            raise ValueError("OCR must be 2-25 digits")
        return cleaned

Response Format

from pydantic import BaseModel
from typing import Generic, TypeVar

T = TypeVar("T")

class ApiResponse(BaseModel, Generic[T]):
    success: bool
    data: T | None = None
    error: str | None = None
    meta: dict | None = None

# Success response
return ApiResponse(
    success=True,
    data=result,
    meta={"processing_time_ms": elapsed_ms}
)

# Error response
return ApiResponse(
    success=False,
    error="Invalid PDF format"
)

File Organization

Project Structure

src/
├── cli/                  # Command-line interfaces
│   ├── autolabel.py
│   ├── train.py
│   └── infer.py
├── pdf/                  # PDF processing
│   ├── extractor.py
│   └── renderer.py
├── ocr/                  # OCR processing
│   ├── paddle_ocr.py
│   └── machine_code_parser.py
├── inference/            # Inference pipeline
│   ├── pipeline.py
│   ├── yolo_detector.py
│   └── field_extractor.py
├── normalize/            # Field normalization
│   ├── base.py
│   ├── date_normalizer.py
│   └── amount_normalizer.py
├── web/                  # FastAPI application
│   ├── app.py
│   ├── routes.py
│   ├── services.py
│   └── schemas.py
└── utils/                # Shared utilities
    ├── validators.py
    ├── text_cleaner.py
    └── logging.py
tests/                    # Mirror of src structure
    ├── test_pdf/
    ├── test_ocr/
    └── test_inference/

File Naming

src/ocr/paddle_ocr.py           # snake_case for modules
src/inference/yolo_detector.py  # snake_case for modules
tests/test_paddle_ocr.py        # test_ prefix for tests
config.py                       # snake_case for config

Module Size Guidelines

Maximum: 800 lines per file
Typical: 200-400 lines per file
Functions: Max 50 lines each
Extract utilities when modules grow too large

Comments & Documentation

When to Comment

# GOOD: Explain WHY, not WHAT
# Swedish Bankgiro uses Luhn algorithm with weight [1,2,1,2...]
def validate_bankgiro_checksum(bankgiro: str) -> bool:
    ...

# Payment line format: 7 groups separated by #, checksum at end
def parse_payment_line(line: str) -> PaymentLineData:
    ...

# BAD: Stating the obvious
# Increment counter by 1
count += 1

# Set name to user's name
name = user.name

Docstrings for Public APIs

def extract_invoice_fields(
    pdf_path: Path,
    confidence_threshold: float = 0.5,
    use_gpu: bool = True
) -> InferenceResult:
    """Extract structured fields from Swedish invoice PDF.

    Uses YOLOv11 for field detection and PaddleOCR for text extraction.
    Applies field-specific normalization and validation.

    Args:
        pdf_path: Path to the invoice PDF file.
        confidence_threshold: Minimum confidence for field detection (0.0-1.0).
        use_gpu: Whether to use GPU acceleration.

    Returns:
        InferenceResult containing extracted fields and confidence scores.

    Raises:
        FileNotFoundError: If PDF file doesn't exist.
        ProcessingError: If OCR or detection fails.

    Example:
        >>> result = extract_invoice_fields(Path("invoice.pdf"))
        >>> print(result.fields["invoice_number"])
        "INV-2024-001"
    """
    ...

Performance Best Practices

Caching

from functools import lru_cache
from cachetools import TTLCache

# Static data: LRU cache
@lru_cache(maxsize=100)
def get_field_config(field_name: str) -> FieldConfig:
    """Load field configuration (cached)."""
    return load_config(field_name)

# Dynamic data: TTL cache
_document_cache = TTLCache(maxsize=1000, ttl=300)  # 5 minutes

def get_document_cached(doc_id: str) -> Document | None:
    if doc_id in _document_cache:
        return _document_cache[doc_id]

    doc = repo.find_by_id(doc_id)
    if doc:
        _document_cache[doc_id] = doc
    return doc

Database Queries

# GOOD: Select only needed columns
cur.execute("""
    SELECT id, status, fields->>'invoice_number'
    FROM documents
    WHERE status = %s
    LIMIT %s
""", ('processed', 10))

# BAD: Select everything
cur.execute("SELECT * FROM documents")

# GOOD: Batch operations
cur.executemany(
    "INSERT INTO labels (doc_id, field, value) VALUES (%s, %s, %s)",
    [(doc_id, f, v) for f, v in fields.items()]
)

# BAD: Individual inserts in loop
for field, value in fields.items():
    cur.execute("INSERT INTO labels ...", (doc_id, field, value))

Lazy Loading

class InferencePipeline:
    def __init__(self, model_path: Path):
        self.model_path = model_path
        self._model: YOLO | None = None
        self._ocr: PaddleOCR | None = None

    @property
    def model(self) -> YOLO:
        """Lazy load YOLO model."""
        if self._model is None:
            self._model = YOLO(str(self.model_path))
        return self._model

    @property
    def ocr(self) -> PaddleOCR:
        """Lazy load PaddleOCR."""
        if self._ocr is None:
            self._ocr = PaddleOCR(use_angle_cls=True, lang="latin")
        return self._ocr

Testing Standards

Test Structure (AAA Pattern)

def test_extract_bankgiro_valid():
    # Arrange
    text = "Bankgiro: 123-4567"

    # Act
    result = extract_bankgiro(text)

    # Assert
    assert result == "1234567"

def test_extract_bankgiro_invalid_returns_none():
    # Arrange
    text = "No bankgiro here"

    # Act
    result = extract_bankgiro(text)

    # Assert
    assert result is None

Test Naming

# GOOD: Descriptive test names
def test_parse_payment_line_extracts_all_fields(): ...
def test_parse_payment_line_handles_missing_checksum(): ...
def test_validate_ocr_returns_false_for_invalid_checksum(): ...

# BAD: Vague test names
def test_parse(): ...
def test_works(): ...
def test_payment_line(): ...

Fixtures

import pytest
from pathlib import Path

@pytest.fixture
def sample_invoice_pdf(tmp_path: Path) -> Path:
    """Create sample invoice PDF for testing."""
    pdf_path = tmp_path / "invoice.pdf"
    # Create test PDF...
    return pdf_path

@pytest.fixture
def inference_pipeline(sample_model_path: Path) -> InferencePipeline:
    """Create inference pipeline with test model."""
    return InferencePipeline(sample_model_path)

def test_process_invoice(inference_pipeline, sample_invoice_pdf):
    result = inference_pipeline.process(sample_invoice_pdf)
    assert result.fields.get("invoice_number") is not None

Code Smell Detection

1. Long Functions

# BAD: Function > 50 lines
def process_document():
    # 100 lines of code...

# GOOD: Split into smaller functions
def process_document(pdf_path: Path) -> InferenceResult:
    image = render_pdf(pdf_path)
    detections = detect_fields(image)
    ocr_results = extract_text(image, detections)
    fields = normalize_fields(ocr_results)
    return build_result(fields)

2. Deep Nesting

# BAD: 5+ levels of nesting
if document:
    if document.is_valid:
        if document.has_fields:
            if field in document.fields:
                if document.fields[field]:
                    # Do something

# GOOD: Early returns
if not document:
    return None
if not document.is_valid:
    return None
if not document.has_fields:
    return None
if field not in document.fields:
    return None
if not document.fields[field]:
    return None

# Do something

3. Magic Numbers

# BAD: Unexplained numbers
if confidence > 0.5:
    ...
time.sleep(3)

# GOOD: Named constants
CONFIDENCE_THRESHOLD = 0.5
RETRY_DELAY_SECONDS = 3

if confidence > CONFIDENCE_THRESHOLD:
    ...
time.sleep(RETRY_DELAY_SECONDS)

4. Mutable Default Arguments

# BAD: Mutable default argument
def process_fields(fields: list = []):  # DANGEROUS!
    fields.append("new_field")
    return fields

# GOOD: Use None as default
def process_fields(fields: list | None = None) -> list:
    if fields is None:
        fields = []
    return [*fields, "new_field"]

Logging Standards

import logging

# Module-level logger
logger = logging.getLogger(__name__)

# GOOD: Appropriate log levels
logger.debug("Processing document: %s", doc_id)
logger.info("Document processed successfully: %s", doc_id)
logger.warning("Low confidence score: %.2f", confidence)
logger.error("Failed to process document: %s", error)

# GOOD: Structured logging with extra data
logger.info(
    "Inference complete",
    extra={
        "document_id": doc_id,
        "field_count": len(fields),
        "processing_time_ms": elapsed_ms
    }
)

# BAD: Using print()
print(f"Processing {doc_id}")  # Never in production!

Remember: Code quality is not negotiable. Clear, maintainable Python code with proper type hints enables confident development and refactoring.

16 KiB Raw Permalink Blame History