"""
Configuration file for system-dependent paths and settings.

This file contains paths that may vary between different systems.
Copy this file and modify the paths according to your local installation.
"""
|
|
|
|
import os
import shutil
from pathlib import Path
|
|
|
|
# ============================================================================
# System Paths - Modify these according to your installation
# ============================================================================

# Poppler "bin" directory (required for PDF to image conversion).
# Taken from the POPPLER_PATH environment variable when it is set; the
# fallback below is a Windows install location.
# Download from: https://github.com/oschwartz10612/poppler-windows/releases
# Example: r"C:\poppler-23.11.0\bin"
POPPLER_PATH = os.getenv("POPPLER_PATH", r"C:\Program Files\poppler-25.07.0\Library\bin")

# Tesseract executable (optional - only needed if not in system PATH).
# Taken from the TESSERACT_CMD environment variable; None when the variable is
# unset or exported as an empty string, so "not configured" has one spelling.
# Download from: https://github.com/UB-Mannheim/tesseract/wiki
# Example: r"C:\Program Files\Tesseract-OCR\tesseract.exe"
TESSERACT_CMD = os.getenv("TESSERACT_CMD") or None
|
|
|
|
# ============================================================================
# Project Paths - Generally don't need to modify these
# ============================================================================

# Project root directory (the directory that contains this file)
PROJECT_ROOT = Path(__file__).parent.absolute()

# Data directories
DATA_DIR = PROJECT_ROOT / "data"
RAW_INVOICES_DIR = DATA_DIR / "raw_invoices"
PROCESSED_IMAGES_DIR = DATA_DIR / "processed_images"
OCR_RESULTS_DIR = DATA_DIR / "ocr_results"

# YOLO dataset directories
YOLO_DATASET_DIR = DATA_DIR / "yolo_dataset"
YOLO_TEMP_IMAGES_DIR = YOLO_DATASET_DIR / "temp_all_images"
YOLO_TEMP_LABELS_DIR = YOLO_DATASET_DIR / "temp_all_labels"
YOLO_TRAIN_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "train"
YOLO_TRAIN_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "train"
YOLO_VAL_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "val"
YOLO_VAL_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "val"

# Model directories
MODELS_DIR = PROJECT_ROOT / "models"
DEFAULT_MODEL_PATH = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "best.pt"
|
|
|
|
# ============================================================================
# OCR Settings
# ============================================================================

# Tesseract language code. Currently Swedish only; the value is passed through
# as-is, so a combined code such as "swe+eng" can be used to add English.
TESSERACT_LANG = "swe"  # Ensure Swedish language pack is installed

# OCR confidence threshold (0-100)
OCR_CONFIDENCE_THRESHOLD = 0
|
|
|
|
# ============================================================================
# Training Settings
# ============================================================================

# YOLO model size: n (nano), s (small), m (medium), l (large), x (xlarge)
YOLO_MODEL_SIZE = "n"

# Training epochs
TRAINING_EPOCHS = 100

# Batch size
BATCH_SIZE = 16

# Image size for training
IMAGE_SIZE = 640

# Validation split ratio (0.0 to 1.0)
VALIDATION_SPLIT = 0.2

# Random seed for reproducibility
RANDOM_SEED = 42
|
|
|
|
# ============================================================================
# API Settings (for main.py FastAPI server)
# ============================================================================

# API host
API_HOST = "127.0.0.1"

# API port
API_PORT = 8000
|
|
|
|
# ============================================================================
# Helper Functions
# ============================================================================
|
|
|
|
def apply_tesseract_path() -> None:
    """Apply the configured Tesseract path to pytesseract.

    When ``TESSERACT_CMD`` is set, it is assigned to
    ``pytesseract.pytesseract.tesseract_cmd``.  When it is unset or empty the
    function does nothing and pytesseract is not imported.
    """
    if TESSERACT_CMD:
        # Imported here rather than at module level, so the import only runs
        # when an explicit Tesseract path has been configured.
        import pytesseract

        pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
|
|
|
|
def validate_paths() -> bool:
    """Validate that required system paths exist.

    Checks that ``POPPLER_PATH`` is an existing directory and, when
    ``TESSERACT_CMD`` is configured, that it resolves to an executable.  Any
    problems are printed to stdout together with a download hint.

    Returns:
        True when no problems were found, False otherwise.
    """
    issues = []

    # Check Poppler: the setting names a directory, so a plain file at that
    # location must not count as a valid installation.
    if not os.path.isdir(POPPLER_PATH):
        issues.append(f"Poppler not found at: {POPPLER_PATH}")
        issues.append(" Download from: https://github.com/oschwartz10612/poppler-windows/releases")

    # Check Tesseract (if specified).  shutil.which accepts an explicit file
    # path as well as a bare command name that is looked up on PATH, so a
    # setting such as "tesseract" is not reported as missing.
    if TESSERACT_CMD and shutil.which(TESSERACT_CMD) is None:
        issues.append(f"Tesseract not found at: {TESSERACT_CMD}")
        issues.append(" Download from: https://github.com/UB-Mannheim/tesseract/wiki")

    if issues:
        print("Configuration Issues Found:")
        for issue in issues:
            print(f" {issue}")
        return False

    return True
|
|
|
|
# ============================================================================
# Example: Environment Variable Override
# ============================================================================
# You can set these in your environment instead of modifying this file:
#
# Windows:
#   set POPPLER_PATH=C:\poppler\bin
#   set TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
#
# Linux/Mac:
#   export POPPLER_PATH=/usr/bin
#   export TESSERACT_CMD=/usr/bin/tesseract
# ============================================================================
|