---
name: coding-standards
description: Universal coding standards, best practices, and patterns for Python, FastAPI, and data processing development.
---

# Coding Standards & Best Practices

Python coding standards for the Invoice Master project.

## Code Quality Principles

### 1. Readability First
- Code is read more than written
- Clear variable and function names
- Self-documenting code preferred over comments
- Consistent formatting (follow PEP 8)

### 2. KISS (Keep It Simple, Stupid)
- Simplest solution that works
- Avoid over-engineering
- No premature optimization
- Easy to understand > clever code

### 3. DRY (Don't Repeat Yourself)
- Extract common logic into functions (see the sketch after this list)
- Create reusable utilities
- Share modules across the codebase
- Avoid copy-paste programming
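
As a small illustration of DRY, the digit-cleaning step that recurs in the validation examples later in this document can live in one shared helper. The function names below are illustrative, not existing project code:

```python
import re

# Shared helper instead of repeating re.sub(...) in every validator.
def keep_digits(value: str) -> str:
    """Strip every non-digit character from a string."""
    return re.sub(r"[^0-9]", "", value)

def is_valid_bankgiro(value: str) -> bool:
    return 7 <= len(keep_digits(value)) <= 8

def is_valid_ocr_number(value: str) -> bool:
    return 2 <= len(keep_digits(value)) <= 25
```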

### 4. YAGNI (You Aren't Gonna Need It)
- Don't build features before they're needed
- Avoid speculative generality
- Add complexity only when required
- Start simple, refactor when needed

## Python Standards

### Variable Naming

```python
# GOOD: Descriptive names
invoice_number = "INV-2024-001"
is_valid_document = True
total_confidence_score = 0.95

# BAD: Unclear names
inv = "INV-2024-001"
flag = True
x = 0.95
```

### Function Naming

```python
# GOOD: Verb-noun pattern with type hints
def extract_invoice_fields(pdf_path: Path) -> dict[str, str]:
    """Extract fields from invoice PDF."""
    ...

def calculate_confidence(predictions: list[float]) -> float:
    """Calculate average confidence score."""
    ...

def is_valid_bankgiro(value: str) -> bool:
    """Check if value is valid Bankgiro number."""
    ...

# BAD: Unclear or noun-only
def invoice(path):
    ...

def confidence(p):
    ...

def bankgiro(v):
    ...
```

### Type Hints (REQUIRED)

```python
# GOOD: Full type annotations
from pathlib import Path
from dataclasses import dataclass

@dataclass
class InferenceResult:
    document_id: str
    fields: dict[str, str]
    confidence: dict[str, float]
    processing_time_ms: float

def process_document(
    pdf_path: Path,
    confidence_threshold: float = 0.5
) -> InferenceResult:
    """Process PDF and return extracted fields."""
    ...

# BAD: No type hints
def process_document(pdf_path, confidence_threshold=0.5):
    ...
```

### Immutability Pattern (CRITICAL)

```python
# GOOD: Create new objects, don't mutate
def update_fields(fields: dict[str, str], updates: dict[str, str]) -> dict[str, str]:
    return {**fields, **updates}

def add_item(items: list[str], new_item: str) -> list[str]:
    return [*items, new_item]

# BAD: Direct mutation
def update_fields(fields: dict[str, str], updates: dict[str, str]) -> dict[str, str]:
    fields.update(updates)  # MUTATION!
    return fields

def add_item(items: list[str], new_item: str) -> list[str]:
    items.append(new_item)  # MUTATION!
    return items
```
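
For data that should never change after creation, the same rule can be pushed into the type itself. A minimal sketch using a frozen dataclass and `dataclasses.replace`; whether `InferenceResult` should actually be frozen is a project decision, not something this document mandates:

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class InferenceResult:
    document_id: str
    fields: dict[str, str]
    confidence: dict[str, float]
    processing_time_ms: float

def with_corrected_field(result: InferenceResult, name: str, value: str) -> InferenceResult:
    """Return a new result with one field overridden, leaving the original untouched."""
    return replace(result, fields={**result.fields, name: value})
```

Assigning to an attribute of a frozen instance raises `FrozenInstanceError`, which catches accidental mutation early.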

### Error Handling

```python
import logging
from pathlib import Path

from ultralytics import YOLO

logger = logging.getLogger(__name__)

# GOOD: Comprehensive error handling with logging
def load_model(model_path: Path) -> YOLO:
    """Load YOLO model from path."""
    try:
        if not model_path.exists():
            raise FileNotFoundError(f"Model not found: {model_path}")

        model = YOLO(str(model_path))
        logger.info("Model loaded: %s", model_path)
        return model
    except Exception as e:
        logger.error("Failed to load model: %s", e)
        raise RuntimeError(f"Model loading failed: {model_path}") from e

# BAD: No error handling
def load_model(model_path):
    return YOLO(str(model_path))

# BAD: Bare except
def load_model(model_path):
    try:
        return YOLO(str(model_path))
    except:  # Never use bare except!
        return None
```

### Async Best Practices

```python
import asyncio

# GOOD: Parallel execution when possible
async def process_batch(pdf_paths: list[Path]) -> list[InferenceResult]:
    tasks = [process_document(path) for path in pdf_paths]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Handle exceptions
    valid_results = []
    for path, result in zip(pdf_paths, results):
        if isinstance(result, Exception):
            logger.error("Failed to process %s: %s", path, result)
        else:
            valid_results.append(result)
    return valid_results

# BAD: Sequential when unnecessary
async def process_batch(pdf_paths: list[Path]) -> list[InferenceResult]:
    results = []
    for path in pdf_paths:
        result = await process_document(path)
        results.append(result)
    return results
```
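
When batches are large, firing every task at once can exhaust memory or GPU capacity. A sketch of one way to bound concurrency with a semaphore, reusing the names from the example above; the limit of 4 is an illustrative value, not a project setting:

```python
import asyncio

MAX_CONCURRENT_DOCUMENTS = 4  # illustrative limit, tune per deployment

async def process_batch_bounded(pdf_paths: list[Path]) -> list[InferenceResult]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOCUMENTS)

    async def process_with_limit(path: Path) -> InferenceResult:
        async with semaphore:
            return await process_document(path)

    tasks = [process_with_limit(path) for path in pdf_paths]
    return await asyncio.gather(*tasks)
```

This keeps the parallel structure of the GOOD example while capping how many documents are in flight at once.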

### Context Managers

```python
from contextlib import contextmanager
from pathlib import Path
import tempfile

# GOOD: Proper resource management
@contextmanager
def temp_pdf_copy(pdf_path: Path):
    """Create temporary copy of PDF for processing."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_path.read_bytes())
        tmp_path = Path(tmp.name)
    try:
        yield tmp_path
    finally:
        tmp_path.unlink(missing_ok=True)

# Usage
with temp_pdf_copy(original_pdf) as tmp_pdf:
    result = process_pdf(tmp_pdf)
```

## FastAPI Best Practices

### Route Structure

```python
from fastapi import APIRouter, HTTPException, Depends, Query, File, UploadFile
from pydantic import BaseModel

router = APIRouter(prefix="/api/v1", tags=["inference"])

class InferenceResponse(BaseModel):
    success: bool
    document_id: str
    fields: dict[str, str]
    confidence: dict[str, float]
    processing_time_ms: float

@router.post("/infer", response_model=InferenceResponse)
async def infer_document(
    file: UploadFile = File(...),
    confidence_threshold: float = Query(0.5, ge=0.0, le=1.0)
) -> InferenceResponse:
    """Process invoice PDF and extract fields."""
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files accepted")

    result = await inference_service.process(file, confidence_threshold)
    return InferenceResponse(
        success=True,
        document_id=result.document_id,
        fields=result.fields,
        confidence=result.confidence,
        processing_time_ms=result.processing_time_ms
    )
```
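
The `Depends` import above is most useful for injecting services instead of reaching for a module-level `inference_service`. A minimal sketch of that pattern, reusing `router` from the block above; `InferenceService`, `get_inference_service`, and the `/infer/v2` path are illustrative names, not existing project code:

```python
from fastapi import Depends, File, Query, UploadFile

class InferenceService:
    """Illustrative stand-in for the real inference service."""

    async def process(self, file: UploadFile, confidence_threshold: float) -> dict:
        ...

def get_inference_service() -> InferenceService:
    # A real provider might build the service once at startup and reuse it.
    return InferenceService()

@router.post("/infer/v2")
async def infer_document_v2(
    file: UploadFile = File(...),
    confidence_threshold: float = Query(0.5, ge=0.0, le=1.0),
    service: InferenceService = Depends(get_inference_service),
) -> dict:
    """Variant of the route above with the service injected, which simplifies testing."""
    return await service.process(file, confidence_threshold)
```

Injected dependencies can be overridden in tests via `app.dependency_overrides`, which is the main payoff of this pattern.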

### Input Validation with Pydantic

```python
from pydantic import BaseModel, Field, field_validator
from datetime import date
import re

class InvoiceData(BaseModel):
    invoice_number: str = Field(..., min_length=1, max_length=50)
    invoice_date: date
    amount: float = Field(..., gt=0)
    bankgiro: str | None = None
    ocr_number: str | None = None

    @field_validator("bankgiro")
    @classmethod
    def validate_bankgiro(cls, v: str | None) -> str | None:
        if v is None:
            return None
        # Bankgiro: 7-8 digits
        cleaned = re.sub(r"[^0-9]", "", v)
        if not (7 <= len(cleaned) <= 8):
            raise ValueError("Bankgiro must be 7-8 digits")
        return cleaned

    @field_validator("ocr_number")
    @classmethod
    def validate_ocr(cls, v: str | None) -> str | None:
        if v is None:
            return None
        # OCR: 2-25 digits
        cleaned = re.sub(r"[^0-9]", "", v)
        if not (2 <= len(cleaned) <= 25):
            raise ValueError("OCR must be 2-25 digits")
        return cleaned
```
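
A short usage sketch showing how these validators surface errors outside FastAPI; the field values are invented for illustration:

```python
from pydantic import ValidationError

try:
    InvoiceData(
        invoice_number="INV-2024-001",
        invoice_date="2024-03-15",
        amount=1250.0,
        bankgiro="12345",  # too short: the validator rejects it
    )
except ValidationError as exc:
    # Each error names the offending field and the validator's message.
    for error in exc.errors():
        print(error["loc"], error["msg"])
```

Inside FastAPI the same `ValidationError` is converted automatically into a 422 response, so request models rarely need explicit try/except blocks.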

### Response Format

```python
from pydantic import BaseModel
from typing import Generic, TypeVar

T = TypeVar("T")

class ApiResponse(BaseModel, Generic[T]):
    success: bool
    data: T | None = None
    error: str | None = None
    meta: dict | None = None

# Success response
return ApiResponse(
    success=True,
    data=result,
    meta={"processing_time_ms": elapsed_ms}
)

# Error response
return ApiResponse(
    success=False,
    error="Invalid PDF format"
)
```
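
Because `ApiResponse` is generic, the payload type can be pinned per use so Pydantic knows what `data` must look like; a small sketch reusing `result` and `elapsed_ms` from the snippet above:

```python
# Pinning T produces a concrete model that validates `data` as an InferenceResult.
InferenceApiResponse = ApiResponse[InferenceResult]

response = InferenceApiResponse(
    success=True,
    data=result,
    meta={"processing_time_ms": elapsed_ms},
)
```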

## File Organization

### Project Structure

```
src/
├── cli/                  # Command-line interfaces
│   ├── autolabel.py
│   ├── train.py
│   └── infer.py
├── pdf/                  # PDF processing
│   ├── extractor.py
│   └── renderer.py
├── ocr/                  # OCR processing
│   ├── paddle_ocr.py
│   └── machine_code_parser.py
├── inference/            # Inference pipeline
│   ├── pipeline.py
│   ├── yolo_detector.py
│   └── field_extractor.py
├── normalize/            # Field normalization
│   ├── base.py
│   ├── date_normalizer.py
│   └── amount_normalizer.py
├── web/                  # FastAPI application
│   ├── app.py
│   ├── routes.py
│   ├── services.py
│   └── schemas.py
└── utils/                # Shared utilities
    ├── validators.py
    ├── text_cleaner.py
    └── logging.py

tests/                    # Mirror of src structure
├── test_pdf/
├── test_ocr/
└── test_inference/
```

### File Naming

```
src/ocr/paddle_ocr.py            # snake_case for modules
src/inference/yolo_detector.py   # snake_case for modules
tests/test_paddle_ocr.py         # test_ prefix for tests
config.py                        # snake_case for config
```

### Module Size Guidelines

- **Maximum**: 800 lines per file
- **Typical**: 200-400 lines per file
- **Functions**: Max 50 lines each
- Extract utilities when modules grow too large

## Comments & Documentation

### When to Comment

```python
# GOOD: Explain WHY, not WHAT

# Swedish Bankgiro uses Luhn algorithm with weight [1,2,1,2...]
def validate_bankgiro_checksum(bankgiro: str) -> bool:
    ...

# Payment line format: 7 groups separated by #, checksum at end
def parse_payment_line(line: str) -> PaymentLineData:
    ...

# BAD: Stating the obvious

# Increment counter by 1
count += 1

# Set name to user's name
name = user.name
```

### Docstrings for Public APIs

```python
def extract_invoice_fields(
    pdf_path: Path,
    confidence_threshold: float = 0.5,
    use_gpu: bool = True
) -> InferenceResult:
    """Extract structured fields from Swedish invoice PDF.

    Uses YOLOv11 for field detection and PaddleOCR for text extraction.
    Applies field-specific normalization and validation.

    Args:
        pdf_path: Path to the invoice PDF file.
        confidence_threshold: Minimum confidence for field detection (0.0-1.0).
        use_gpu: Whether to use GPU acceleration.

    Returns:
        InferenceResult containing extracted fields and confidence scores.

    Raises:
        FileNotFoundError: If PDF file doesn't exist.
        ProcessingError: If OCR or detection fails.

    Example:
        >>> result = extract_invoice_fields(Path("invoice.pdf"))
        >>> print(result.fields["invoice_number"])
        INV-2024-001
    """
    ...
```

## Performance Best Practices

### Caching

```python
from functools import lru_cache
from cachetools import TTLCache

# Static data: LRU cache
@lru_cache(maxsize=100)
def get_field_config(field_name: str) -> FieldConfig:
    """Load field configuration (cached)."""
    return load_config(field_name)

# Dynamic data: TTL cache
_document_cache = TTLCache(maxsize=1000, ttl=300)  # 5 minutes

def get_document_cached(doc_id: str) -> Document | None:
    if doc_id in _document_cache:
        return _document_cache[doc_id]

    doc = repo.find_by_id(doc_id)
    if doc:
        _document_cache[doc_id] = doc
    return doc
```

### Database Queries

```python
# GOOD: Select only needed columns
cur.execute("""
    SELECT id, status, fields->>'invoice_number'
    FROM documents
    WHERE status = %s
    LIMIT %s
""", ('processed', 10))

# BAD: Select everything
cur.execute("SELECT * FROM documents")

# GOOD: Batch operations
cur.executemany(
    "INSERT INTO labels (doc_id, field, value) VALUES (%s, %s, %s)",
    [(doc_id, f, v) for f, v in fields.items()]
)

# BAD: Individual inserts in loop
for field, value in fields.items():
    cur.execute("INSERT INTO labels ...", (doc_id, field, value))
```
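
The `cur` object in these examples is assumed to come from a managed connection. A sketch of that setup with psycopg 3, where the connection string and query are placeholders:

```python
import psycopg

# Connection and transaction scope handled by context managers.
with psycopg.connect("dbname=invoice_master") as conn:
    with conn.cursor() as cur:
        cur.execute(
            "SELECT id, status FROM documents WHERE status = %s LIMIT %s",
            ("processed", 10),
        )
        rows = cur.fetchall()
# Leaving the connect() block commits on success, rolls back on error, and closes the connection.
```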

### Lazy Loading

```python
class InferencePipeline:
    def __init__(self, model_path: Path):
        self.model_path = model_path
        self._model: YOLO | None = None
        self._ocr: PaddleOCR | None = None

    @property
    def model(self) -> YOLO:
        """Lazy load YOLO model."""
        if self._model is None:
            self._model = YOLO(str(self.model_path))
        return self._model

    @property
    def ocr(self) -> PaddleOCR:
        """Lazy load PaddleOCR."""
        if self._ocr is None:
            self._ocr = PaddleOCR(use_angle_cls=True, lang="latin")
        return self._ocr
```

## Testing Standards

### Test Structure (AAA Pattern)

```python
def test_extract_bankgiro_valid():
    # Arrange
    text = "Bankgiro: 123-4567"

    # Act
    result = extract_bankgiro(text)

    # Assert
    assert result == "1234567"

def test_extract_bankgiro_invalid_returns_none():
    # Arrange
    text = "No bankgiro here"

    # Act
    result = extract_bankgiro(text)

    # Assert
    assert result is None
```

### Test Naming

```python
# GOOD: Descriptive test names
def test_parse_payment_line_extracts_all_fields(): ...
def test_parse_payment_line_handles_missing_checksum(): ...
def test_validate_ocr_returns_false_for_invalid_checksum(): ...

# BAD: Vague test names
def test_parse(): ...
def test_works(): ...
def test_payment_line(): ...
```
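
When several inputs exercise the same behaviour, `pytest.mark.parametrize` keeps names descriptive without duplicating test bodies; a sketch reusing `extract_bankgiro` from the AAA example, with illustrative case IDs:

```python
import pytest

@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Bankgiro: 123-4567", "1234567"),
        ("No bankgiro here", None),
    ],
    ids=["valid_bankgiro", "missing_bankgiro"],
)
def test_extract_bankgiro(text, expected):
    assert extract_bankgiro(text) == expected
```

Each case appears as its own test in the report, so failures stay easy to pinpoint.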

### Fixtures

```python
import pytest
from pathlib import Path

@pytest.fixture
def sample_invoice_pdf(tmp_path: Path) -> Path:
    """Create sample invoice PDF for testing."""
    pdf_path = tmp_path / "invoice.pdf"
    # Create test PDF...
    return pdf_path

@pytest.fixture
def inference_pipeline(sample_model_path: Path) -> InferencePipeline:
    """Create inference pipeline with test model."""
    return InferencePipeline(sample_model_path)

def test_process_invoice(inference_pipeline, sample_invoice_pdf):
    result = inference_pipeline.process(sample_invoice_pdf)
    assert result.fields.get("invoice_number") is not None
```

## Code Smell Detection

### 1. Long Functions

```python
# BAD: Function > 50 lines
def process_document():
    # 100 lines of code...
    ...

# GOOD: Split into smaller functions
def process_document(pdf_path: Path) -> InferenceResult:
    image = render_pdf(pdf_path)
    detections = detect_fields(image)
    ocr_results = extract_text(image, detections)
    fields = normalize_fields(ocr_results)
    return build_result(fields)
```

### 2. Deep Nesting

```python
# BAD: 5+ levels of nesting
if document:
    if document.is_valid:
        if document.has_fields:
            if field in document.fields:
                if document.fields[field]:
                    # Do something
                    ...

# GOOD: Early returns
if not document:
    return None
if not document.is_valid:
    return None
if not document.has_fields:
    return None
if field not in document.fields:
    return None
if not document.fields[field]:
    return None

# Do something
```

### 3. Magic Numbers

```python
# BAD: Unexplained numbers
if confidence > 0.5:
    ...
time.sleep(3)

# GOOD: Named constants
CONFIDENCE_THRESHOLD = 0.5
RETRY_DELAY_SECONDS = 3

if confidence > CONFIDENCE_THRESHOLD:
    ...
time.sleep(RETRY_DELAY_SECONDS)
```

### 4. Mutable Default Arguments

```python
# BAD: Mutable default argument
def process_fields(fields: list = []):  # DANGEROUS!
    fields.append("new_field")
    return fields

# GOOD: Use None as default
def process_fields(fields: list | None = None) -> list:
    if fields is None:
        fields = []
    return [*fields, "new_field"]
```

## Logging Standards

```python
import logging

# Module-level logger
logger = logging.getLogger(__name__)

# GOOD: Appropriate log levels
logger.debug("Processing document: %s", doc_id)
logger.info("Document processed successfully: %s", doc_id)
logger.warning("Low confidence score: %.2f", confidence)
logger.error("Failed to process document: %s", error)

# GOOD: Structured logging with extra data
logger.info(
    "Inference complete",
    extra={
        "document_id": doc_id,
        "field_count": len(fields),
        "processing_time_ms": elapsed_ms
    }
)

# BAD: Using print()
print(f"Processing {doc_id}")  # Never in production!
```
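
Module-level loggers only produce output once logging is configured at the application entry point. A minimal sketch using `logging.basicConfig`; the format string and level are assumptions, not project settings:

```python
import logging

def configure_logging(level: int = logging.INFO) -> None:
    """Configure root logging once, at startup (e.g., in the CLI or the FastAPI app factory)."""
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

configure_logging()
logging.getLogger(__name__).info("Logging configured")
```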

**Remember**: Code quality is not negotiable. Clear, maintainable Python code with proper type hints enables confident development and refactoring.