Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
306
src/data/csv_loader.py
Normal file
306
src/data/csv_loader.py
Normal file
@@ -0,0 +1,306 @@
|
||||
"""
|
||||
CSV Data Loader
|
||||
|
||||
Loads and parses structured invoice data from CSV files.
|
||||
Follows the CSV specification for invoice data.
|
||||
"""
|
||||
|
||||
import csv
import io
from dataclasses import dataclass, field
from datetime import datetime, date
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Any, Iterator
|
||||
|
||||
|
||||
@dataclass
|
||||
class InvoiceRow:
|
||||
"""Parsed invoice data row."""
|
||||
DocumentId: str
|
||||
InvoiceDate: date | None = None
|
||||
InvoiceNumber: str | None = None
|
||||
InvoiceDueDate: date | None = None
|
||||
OCR: str | None = None
|
||||
Message: str | None = None
|
||||
Bankgiro: str | None = None
|
||||
Plusgiro: str | None = None
|
||||
Amount: Decimal | None = None
|
||||
|
||||
# Raw values for reference
|
||||
raw_data: dict = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert to dictionary for matching."""
|
||||
return {
|
||||
'DocumentId': self.DocumentId,
|
||||
'InvoiceDate': self.InvoiceDate.isoformat() if self.InvoiceDate else None,
|
||||
'InvoiceNumber': self.InvoiceNumber,
|
||||
'InvoiceDueDate': self.InvoiceDueDate.isoformat() if self.InvoiceDueDate else None,
|
||||
'OCR': self.OCR,
|
||||
'Bankgiro': self.Bankgiro,
|
||||
'Plusgiro': self.Plusgiro,
|
||||
'Amount': str(self.Amount) if self.Amount else None,
|
||||
}
|
||||
|
||||
def get_field_value(self, field_name: str) -> str | None:
|
||||
"""Get field value as string for matching."""
|
||||
value = getattr(self, field_name, None)
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, date):
|
||||
return value.isoformat()
|
||||
if isinstance(value, Decimal):
|
||||
return str(value)
|
||||
return str(value) if value else None
|
||||
|
||||
|
||||
class CSVLoader:
    """Loads invoice data from CSV files.

    Reads one row per document from an invoice CSV, parses each into an
    :class:`InvoiceRow`, and resolves the matching PDF file on disk
    (via an optional DocumentId -> FileName mapping CSV, or by naming
    convention).
    """

    # Expected field mappings (CSV header -> InvoiceRow attribute)
    FIELD_MAPPINGS = {
        'DocumentId': 'DocumentId',
        'InvoiceDate': 'InvoiceDate',
        'InvoiceNumber': 'InvoiceNumber',
        'InvoiceDueDate': 'InvoiceDueDate',
        'OCR': 'OCR',
        'Message': 'Message',
        'Bankgiro': 'Bankgiro',
        'Plusgiro': 'Plusgiro',
        'Amount': 'Amount',
    }

    def __init__(
        self,
        csv_path: str | Path,
        pdf_dir: str | Path | None = None,
        doc_map_path: str | Path | None = None,
        encoding: str = 'utf-8'
    ):
        """
        Initialize CSV loader.

        Args:
            csv_path: Path to the CSV file
            pdf_dir: Directory containing PDF files (default: sibling
                'raw_pdfs' directory two levels up from the CSV)
            doc_map_path: Optional path to document mapping CSV with
                DocumentId and FileName columns
            encoding: CSV file encoding (default: utf-8)
        """
        self.csv_path = Path(csv_path)
        self.pdf_dir = Path(pdf_dir) if pdf_dir else self.csv_path.parent.parent / 'raw_pdfs'
        self.doc_map_path = Path(doc_map_path) if doc_map_path else None
        self.encoding = encoding

        # Load document mapping if provided
        self.doc_map = self._load_doc_map() if self.doc_map_path else {}

    def _load_doc_map(self) -> dict[str, str]:
        """Load document ID to filename mapping.

        Returns an empty dict when the mapping file is absent.
        """
        mapping: dict[str, str] = {}
        if self.doc_map_path and self.doc_map_path.exists():
            # newline='' is the documented way to open files for the csv module.
            with open(self.doc_map_path, 'r', encoding=self.encoding, newline='') as f:
                for row in csv.DictReader(f):
                    # `or ''` guards against None values from short rows,
                    # which would crash .strip().
                    doc_id = (row.get('DocumentId') or '').strip()
                    filename = (row.get('FileName') or '').strip()
                    if doc_id and filename:
                        mapping[doc_id] = filename
        return mapping

    def _read_text(self) -> str:
        """Decode the CSV file's bytes.

        Tries utf-8-sig first (handles a BOM), then the configured
        encoding, then latin-1 as a last resort.

        Raises:
            ValueError: if no candidate encoding can decode the file.
        """
        raw = self.csv_path.read_bytes()
        for enc in ('utf-8-sig', self.encoding, 'latin-1'):
            try:
                return raw.decode(enc)
            except UnicodeDecodeError:
                continue
        raise ValueError(
            f"Could not read CSV file {self.csv_path} with any supported encoding"
        )

    def _parse_date(self, value: str | None) -> date | None:
        """Parse a date from various formats; None when blank/unparseable."""
        if not value or not value.strip():
            return None

        value = value.strip()

        # Try different date formats, most common first.
        formats = [
            '%Y-%m-%d',
            '%Y-%m-%d %H:%M:%S',
            '%Y-%m-%d %H:%M:%S.%f',
            '%d/%m/%Y',
            '%d.%m.%Y',
            '%d-%m-%Y',
            '%Y%m%d',
        ]

        for fmt in formats:
            try:
                return datetime.strptime(value, fmt).date()
            except ValueError:
                continue

        return None

    def _parse_amount(self, value: str | None) -> Decimal | None:
        """Parse a monetary amount from various (Swedish/European) formats.

        Strips currency markers (SEK, kr, :-), removes space/NBSP thousand
        separators, and normalizes a lone comma to a decimal point.
        Returns None when blank or unparseable.
        """
        if not value or not value.strip():
            return None

        value = value.strip()

        # Remove currency symbols and common suffixes
        value = value.replace('SEK', '').replace('kr', '').replace(':-', '')
        value = value.strip()

        # Remove spaces (thousand separators), including non-breaking spaces
        value = value.replace(' ', '').replace('\xa0', '')

        # Handle comma as decimal separator (European format)
        if ',' in value and '.' not in value:
            value = value.replace(',', '.')
        elif ',' in value and '.' in value:
            # Assume comma is thousands separator, dot is decimal
            value = value.replace(',', '')

        try:
            return Decimal(value)
        except InvalidOperation:
            return None

    def _parse_string(self, value: str | None) -> str | None:
        """Strip a string field; empty results become None."""
        if value is None:
            return None
        value = value.strip()
        return value if value else None

    def _parse_row(self, row: dict) -> InvoiceRow | None:
        """Parse a single CSV row into an InvoiceRow.

        Returns None when the row has no DocumentId (such rows are skipped).
        """
        doc_id = self._parse_string(row.get('DocumentId'))
        if not doc_id:
            return None

        return InvoiceRow(
            DocumentId=doc_id,
            InvoiceDate=self._parse_date(row.get('InvoiceDate')),
            InvoiceNumber=self._parse_string(row.get('InvoiceNumber')),
            InvoiceDueDate=self._parse_date(row.get('InvoiceDueDate')),
            OCR=self._parse_string(row.get('OCR')),
            Message=self._parse_string(row.get('Message')),
            Bankgiro=self._parse_string(row.get('Bankgiro')),
            Plusgiro=self._parse_string(row.get('Plusgiro')),
            Amount=self._parse_amount(row.get('Amount')),
            raw_data=dict(row)
        )

    def load_all(self) -> list[InvoiceRow]:
        """Load all rows from the CSV into a list."""
        return list(self.iter_rows())

    def iter_rows(self) -> Iterator[InvoiceRow]:
        """Iterate over parsed CSV rows, skipping rows without a DocumentId.

        BUG FIX: the file is now fully decoded up front (_read_text)
        before any row is yielded. The previous implementation retried
        encodings around an already-yielding generator, so a decode error
        surfacing mid-file would restart the file and yield duplicates.
        """
        reader = csv.DictReader(io.StringIO(self._read_text()))
        for row in reader:
            parsed = self._parse_row(row)
            if parsed:
                yield parsed

    def get_pdf_path(self, invoice_row: InvoiceRow) -> Path | None:
        """
        Get PDF path for an invoice row.

        Uses the document mapping if available, otherwise tries the
        DocumentId.pdf naming convention (exact, lower, upper case),
        finally a glob for any file containing the DocumentId.
        Returns None when no existing file is found.
        """
        doc_id = invoice_row.DocumentId

        # Check document mapping first
        if doc_id in self.doc_map:
            pdf_path = self.pdf_dir / self.doc_map[doc_id]
            if pdf_path.exists():
                return pdf_path

        # Try default naming patterns
        for name in (f"{doc_id}.pdf", f"{doc_id.lower()}.pdf", f"{doc_id.upper()}.pdf"):
            pdf_path = self.pdf_dir / name
            if pdf_path.exists():
                return pdf_path

        # Fall back to a glob for partial matches; first hit wins.
        for pdf_file in self.pdf_dir.glob(f"*{doc_id}*.pdf"):
            return pdf_file

        return None

    def get_row_by_id(self, doc_id: str) -> InvoiceRow | None:
        """Get a specific row by DocumentId (linear scan of the file)."""
        for row in self.iter_rows():
            if row.DocumentId == doc_id:
                return row
        return None

    def validate(self) -> list[dict]:
        """
        Validate CSV data and return issues.

        Checks each row for a DocumentId, an existing PDF, and at least
        one matchable field.

        Returns:
            List of issue dicts with 'row', 'field', 'issue' (and
            'doc_id' where known). Note: 'row' numbers are positions in
            the parsed stream, not exact file line numbers, since rows
            without a DocumentId are skipped by iter_rows().
        """
        issues = []

        for i, row in enumerate(self.iter_rows(), start=2):  # Start at 2 (header is row 1)
            # Defensive check; iter_rows() already drops rows without a
            # DocumentId, so this branch is not expected to fire.
            if not row.DocumentId:
                issues.append({
                    'row': i,
                    'field': 'DocumentId',
                    'issue': 'Missing required DocumentId'
                })
                continue

            # Check if PDF exists
            if not self.get_pdf_path(row):
                issues.append({
                    'row': i,
                    'doc_id': row.DocumentId,
                    'field': 'PDF',
                    'issue': 'PDF file not found'
                })

            # Check for at least one matchable field
            matchable_fields = [
                row.InvoiceNumber,
                row.OCR,
                row.Bankgiro,
                row.Plusgiro,
                row.Amount
            ]
            if not any(matchable_fields):
                issues.append({
                    'row': i,
                    'doc_id': row.DocumentId,
                    'field': 'All',
                    'issue': 'No matchable fields (InvoiceNumber/OCR/Bankgiro/Plusgiro/Amount)'
                })

        return issues
|
||||
|
||||
|
||||
def load_invoice_csv(csv_path: str | Path, pdf_dir: str | Path | None = None) -> list[InvoiceRow]:
    """Convenience wrapper: build a CSVLoader and return all parsed rows."""
    return CSVLoader(csv_path, pdf_dir).load_all()
|
||||
Reference in New Issue
Block a user