WIP
This commit is contained in:
@@ -20,7 +20,7 @@ from shared.config import get_db_connection_string
|
||||
from shared.normalize import normalize_field
|
||||
from shared.matcher import FieldMatcher
|
||||
from shared.pdf import is_text_pdf, extract_text_tokens
|
||||
from training.yolo.annotation_generator import FIELD_CLASSES
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from shared.data.db import DocumentDB
|
||||
|
||||
|
||||
|
||||
@@ -113,7 +113,7 @@ def process_single_document(args_tuple):
|
||||
# Import inside worker to avoid pickling issues
|
||||
from training.data.autolabel_report import AutoLabelReport
|
||||
from shared.pdf import PDFDocument
|
||||
from training.yolo.annotation_generator import FIELD_CLASSES
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from training.processing.document_processor import process_page, record_unmatched_fields
|
||||
|
||||
start_time = time.time()
|
||||
@@ -342,7 +342,8 @@ def main():
|
||||
from shared.ocr import OCREngine
|
||||
from shared.matcher import FieldMatcher
|
||||
from shared.normalize import normalize_field
|
||||
from training.yolo.annotation_generator import AnnotationGenerator, FIELD_CLASSES
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from training.yolo.annotation_generator import AnnotationGenerator
|
||||
|
||||
# Handle comma-separated CSV paths
|
||||
csv_input = args.csv
|
||||
|
||||
@@ -90,7 +90,7 @@ def process_text_pdf(task_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
import shutil
|
||||
from training.data.autolabel_report import AutoLabelReport
|
||||
from shared.pdf import PDFDocument
|
||||
from training.yolo.annotation_generator import FIELD_CLASSES
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from training.processing.document_processor import process_page, record_unmatched_fields
|
||||
|
||||
row_dict = task_data["row_dict"]
|
||||
@@ -208,7 +208,7 @@ def process_scanned_pdf(task_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
import shutil
|
||||
from training.data.autolabel_report import AutoLabelReport
|
||||
from shared.pdf import PDFDocument
|
||||
from training.yolo.annotation_generator import FIELD_CLASSES
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from training.processing.document_processor import process_page, record_unmatched_fields
|
||||
|
||||
row_dict = task_data["row_dict"]
|
||||
|
||||
@@ -15,7 +15,8 @@ from training.data.autolabel_report import FieldMatchResult
|
||||
from shared.matcher import FieldMatcher
|
||||
from shared.normalize import normalize_field
|
||||
from shared.ocr.machine_code_parser import MachineCodeParser
|
||||
from training.yolo.annotation_generator import AnnotationGenerator, FIELD_CLASSES
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from training.yolo.annotation_generator import AnnotationGenerator
|
||||
|
||||
|
||||
def match_supplier_accounts(
|
||||
|
||||
@@ -9,43 +9,12 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
import csv
|
||||
|
||||
|
||||
# Field class mapping for YOLO
|
||||
# Note: supplier_accounts is not a separate class - its matches are mapped to Bankgiro/Plusgiro
|
||||
FIELD_CLASSES = {
|
||||
'InvoiceNumber': 0,
|
||||
'InvoiceDate': 1,
|
||||
'InvoiceDueDate': 2,
|
||||
'OCR': 3,
|
||||
'Bankgiro': 4,
|
||||
'Plusgiro': 5,
|
||||
'Amount': 6,
|
||||
'supplier_organisation_number': 7,
|
||||
'customer_number': 8,
|
||||
'payment_line': 9, # Machine code payment line at bottom of invoice
|
||||
}
|
||||
|
||||
# Fields that need matching but map to other YOLO classes
|
||||
# supplier_accounts matches are classified as Bankgiro or Plusgiro based on account type
|
||||
ACCOUNT_FIELD_MAPPING = {
|
||||
'supplier_accounts': {
|
||||
'BG': 'Bankgiro', # BG:xxx -> Bankgiro class
|
||||
'PG': 'Plusgiro', # PG:xxx -> Plusgiro class
|
||||
}
|
||||
}
|
||||
|
||||
CLASS_NAMES = [
|
||||
'invoice_number',
|
||||
'invoice_date',
|
||||
'invoice_due_date',
|
||||
'ocr_number',
|
||||
'bankgiro',
|
||||
'plusgiro',
|
||||
'amount',
|
||||
'supplier_org_number',
|
||||
'customer_number',
|
||||
'payment_line', # Machine code payment line at bottom of invoice
|
||||
]
|
||||
# Import field mappings from single source of truth
|
||||
from shared.fields import (
|
||||
TRAINING_FIELD_CLASSES as FIELD_CLASSES,
|
||||
CLASS_NAMES,
|
||||
ACCOUNT_FIELD_MAPPING,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -101,7 +101,8 @@ class DatasetBuilder:
|
||||
Returns:
|
||||
DatasetStats with build results
|
||||
"""
|
||||
from .annotation_generator import AnnotationGenerator, CLASS_NAMES
|
||||
from shared.fields import CLASS_NAMES
|
||||
from .annotation_generator import AnnotationGenerator
|
||||
|
||||
random.seed(seed)
|
||||
|
||||
|
||||
@@ -18,7 +18,8 @@ import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from shared.config import DEFAULT_DPI
|
||||
from .annotation_generator import FIELD_CLASSES, YOLOAnnotation
|
||||
from shared.fields import TRAINING_FIELD_CLASSES as FIELD_CLASSES
|
||||
from .annotation_generator import YOLOAnnotation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user