This commit is contained in:
Yaojia Wang
2026-02-01 00:08:40 +01:00
parent 33ada0350d
commit a516de4320
90 changed files with 11642 additions and 398 deletions

View File

@@ -20,7 +20,7 @@ from shared.config import get_db_connection_string
from shared.normalize import normalize_field
from shared.matcher import FieldMatcher
from shared.pdf import is_text_pdf, extract_text_tokens
from training.yolo.annotation_generator import FIELD_CLASSES
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from shared.data.db import DocumentDB

View File

@@ -113,7 +113,7 @@ def process_single_document(args_tuple):
# Import inside worker to avoid pickling issues
from training.data.autolabel_report import AutoLabelReport
from shared.pdf import PDFDocument
from training.yolo.annotation_generator import FIELD_CLASSES
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from training.processing.document_processor import process_page, record_unmatched_fields
start_time = time.time()
@@ -342,7 +342,8 @@ def main():
from shared.ocr import OCREngine
from shared.matcher import FieldMatcher
from shared.normalize import normalize_field
from training.yolo.annotation_generator import AnnotationGenerator, FIELD_CLASSES
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from training.yolo.annotation_generator import AnnotationGenerator
# Handle comma-separated CSV paths
csv_input = args.csv

View File

@@ -90,7 +90,7 @@ def process_text_pdf(task_data: Dict[str, Any]) -> Dict[str, Any]:
import shutil
from training.data.autolabel_report import AutoLabelReport
from shared.pdf import PDFDocument
from training.yolo.annotation_generator import FIELD_CLASSES
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from training.processing.document_processor import process_page, record_unmatched_fields
row_dict = task_data["row_dict"]
@@ -208,7 +208,7 @@ def process_scanned_pdf(task_data: Dict[str, Any]) -> Dict[str, Any]:
import shutil
from training.data.autolabel_report import AutoLabelReport
from shared.pdf import PDFDocument
from training.yolo.annotation_generator import FIELD_CLASSES
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from training.processing.document_processor import process_page, record_unmatched_fields
row_dict = task_data["row_dict"]

View File

@@ -15,7 +15,8 @@ from training.data.autolabel_report import FieldMatchResult
from shared.matcher import FieldMatcher
from shared.normalize import normalize_field
from shared.ocr.machine_code_parser import MachineCodeParser
from training.yolo.annotation_generator import AnnotationGenerator, FIELD_CLASSES
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from training.yolo.annotation_generator import AnnotationGenerator
def match_supplier_accounts(

View File

@@ -9,43 +9,12 @@ from pathlib import Path
from typing import Any
import csv
# YOLO class id for every labelled field, keyed by field name.
# supplier_accounts has no class id of its own: its matches are assigned to
# the Bankgiro or Plusgiro class (see ACCOUNT_FIELD_MAPPING below).
FIELD_CLASSES = dict(
    InvoiceNumber=0,
    InvoiceDate=1,
    InvoiceDueDate=2,
    OCR=3,
    Bankgiro=4,
    Plusgiro=5,
    Amount=6,
    supplier_organisation_number=7,
    customer_number=8,
    payment_line=9,  # Machine code payment line at bottom of invoice
)

# Fields that are matched but reported under another YOLO class.
# A supplier_accounts match is filed as Bankgiro or Plusgiro depending on
# the account-type prefix of the matched value.
ACCOUNT_FIELD_MAPPING = {
    'supplier_accounts': dict(
        BG='Bankgiro',  # BG:xxx -> Bankgiro class
        PG='Plusgiro',  # PG:xxx -> Plusgiro class
    ),
}

# Class names, listed in the same order as the ids in FIELD_CLASSES.
CLASS_NAMES = [
    'invoice_number',       # 0
    'invoice_date',         # 1
    'invoice_due_date',     # 2
    'ocr_number',           # 3
    'bankgiro',             # 4
    'plusgiro',             # 5
    'amount',               # 6
    'supplier_org_number',  # 7
    'customer_number',      # 8
    'payment_line',         # 9 - Machine code payment line at bottom of invoice
]
# Import the field mappings from the single source of truth (shared.fields)
from shared.fields import (
TRAINING_FIELD_CLASSES as FIELD_CLASSES,
CLASS_NAMES,
ACCOUNT_FIELD_MAPPING,
)
@dataclass

View File

@@ -101,7 +101,8 @@ class DatasetBuilder:
Returns:
DatasetStats with build results
"""
from .annotation_generator import AnnotationGenerator, CLASS_NAMES
from shared.fields import CLASS_NAMES
from .annotation_generator import AnnotationGenerator
random.seed(seed)

View File

@@ -18,7 +18,8 @@ import numpy as np
from PIL import Image
from shared.config import DEFAULT_DPI
from .annotation_generator import FIELD_CLASSES, YOLOAnnotation
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
from .annotation_generator import YOLOAnnotation
logger = logging.getLogger(__name__)