Files
invoice-master/config.py
Yaojia Wang dafa86c588 Init
2025-10-26 20:41:11 +01:00

138 lines
4.7 KiB
Python

"""
Configuration file for system-dependent paths and settings.
This file contains paths that may vary between different systems.
Copy this file and modify the paths according to your local installation.
"""
import os
from pathlib import Path
# ============================================================================
# System Paths - Modify these according to your installation
# ============================================================================
# Directory holding the Poppler binaries (required for PDF to image conversion).
# Windows builds: https://github.com/oschwartz10612/poppler-windows/releases
# Set the POPPLER_PATH environment variable to override the default below,
# e.g. r"C:\poppler-23.11.0\bin".
POPPLER_PATH = os.environ.get(
    "POPPLER_PATH",
    r"C:\Program Files\poppler-25.07.0\Library\bin",
)

# Location of the Tesseract executable, taken from the TESSERACT_CMD
# environment variable. Optional: stays None when the variable is unset,
# which is fine if Tesseract is already reachable through the system PATH.
# Installer: https://github.com/UB-Mannheim/tesseract/wiki
# e.g. r"C:\Program Files\Tesseract-OCR\tesseract.exe"
TESSERACT_CMD = os.environ.get("TESSERACT_CMD")
# ============================================================================
# Project Paths - Generally don't need to modify these
# ============================================================================
# Absolute path of the directory that contains this config file.
PROJECT_ROOT = Path(__file__).parent.absolute()

# --- Data directories (all live under <PROJECT_ROOT>/data) ---
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_INVOICES_DIR = DATA_DIR.joinpath("raw_invoices")
PROCESSED_IMAGES_DIR = DATA_DIR.joinpath("processed_images")
OCR_RESULTS_DIR = DATA_DIR.joinpath("ocr_results")

# --- YOLO dataset layout (under <DATA_DIR>/yolo_dataset) ---
YOLO_DATASET_DIR = DATA_DIR.joinpath("yolo_dataset")
# Staging folders: "temp_all_images" / "temp_all_labels".
YOLO_TEMP_IMAGES_DIR = YOLO_DATASET_DIR.joinpath("temp_all_images")
YOLO_TEMP_LABELS_DIR = YOLO_DATASET_DIR.joinpath("temp_all_labels")
# Train / validation folders, split as images/<subset> and labels/<subset>.
YOLO_TRAIN_IMAGES_DIR = YOLO_DATASET_DIR.joinpath("images", "train")
YOLO_TRAIN_LABELS_DIR = YOLO_DATASET_DIR.joinpath("labels", "train")
YOLO_VAL_IMAGES_DIR = YOLO_DATASET_DIR.joinpath("images", "val")
YOLO_VAL_LABELS_DIR = YOLO_DATASET_DIR.joinpath("labels", "val")

# --- Model directories ---
MODELS_DIR = PROJECT_ROOT.joinpath("models")
# Weights file used when no other model path is supplied.
DEFAULT_MODEL_PATH = MODELS_DIR.joinpath("payment_slip_detector_v1", "weights", "best.pt")
# ============================================================================
# OCR Settings
# ============================================================================
# Language code handed to Tesseract. The value selects Swedish ("swe") only,
# so the Swedish language pack must be installed.
# NOTE(review): the previous comment said "Swedish + English"; if English is
# also wanted, the value would presumably need to be "swe+eng" - confirm.
TESSERACT_LANG: str = "swe"

# OCR confidence threshold on a 0-100 scale.
OCR_CONFIDENCE_THRESHOLD: int = 0
# ============================================================================
# Training Settings
# ============================================================================
# YOLO model variant, one of:
#   "n" (nano), "s" (small), "m" (medium), "l" (large), "x" (xlarge)
YOLO_MODEL_SIZE: str = "n"

# Number of training epochs.
TRAINING_EPOCHS: int = 100

# Training batch size.
BATCH_SIZE: int = 16

# Image size used for training.
IMAGE_SIZE: int = 640

# Share of the data held out for validation, as a ratio in [0.0, 1.0].
VALIDATION_SPLIT: float = 0.2

# Fixed random seed so runs are reproducible.
RANDOM_SEED: int = 42
# ============================================================================
# API Settings (for main.py FastAPI server)
# ============================================================================
# Host address for the API server (loopback address).
API_HOST: str = "127.0.0.1"

# TCP port for the API server.
API_PORT: int = 8000
# ============================================================================
# Helper Functions
# ============================================================================
def apply_tesseract_path():
    """Point pytesseract at the configured Tesseract executable, if any.

    When ``TESSERACT_CMD`` is unset or empty this is a no-op, and
    pytesseract is not imported at all.
    """
    if not TESSERACT_CMD:
        return

    # Imported here rather than at module level, so the import only runs
    # when a Tesseract path has actually been configured.
    import pytesseract as ocr

    ocr.pytesseract.tesseract_cmd = TESSERACT_CMD
def validate_paths(poppler_path=None, tesseract_cmd=None) -> bool:
    """Validate that the required external tools can be located.

    Args:
        poppler_path: Path to check for Poppler. When ``None`` (the default)
            the module-level ``POPPLER_PATH`` is used.
        tesseract_cmd: Tesseract executable to check, either a file path or
            a bare command name. When ``None`` (the default) the module-level
            ``TESSERACT_CMD`` is used. If the resulting value is empty the
            Tesseract check is skipped.

    Returns:
        ``True`` when no problems were found. Otherwise each problem is
        printed to stdout and ``False`` is returned.
    """
    # Local import: only this function needs the PATH lookup.
    import shutil

    poppler = POPPLER_PATH if poppler_path is None else poppler_path
    tesseract = TESSERACT_CMD if tesseract_cmd is None else tesseract_cmd

    issues = []

    # Check Poppler
    if not os.path.exists(poppler):
        issues.append(f"Poppler not found at: {poppler}")
        issues.append(" Download from: https://github.com/oschwartz10612/poppler-windows/releases")

    # Check Tesseract (if specified). A bare command name such as
    # "tesseract" is usable as long as it resolves through PATH, so accept
    # either an existing file or a name that shutil.which can find.
    if tesseract:
        located = os.path.exists(tesseract) or shutil.which(tesseract) is not None
        if not located:
            issues.append(f"Tesseract not found at: {tesseract}")
            issues.append(" Download from: https://github.com/UB-Mannheim/tesseract/wiki")

    if not issues:
        return True

    print("Configuration Issues Found:")
    for issue in issues:
        print(f" {issue}")
    return False
# ============================================================================
# Example: Environment Variable Override
# ============================================================================
# You can set these in your environment instead of modifying this file:
#
# Windows:
# set POPPLER_PATH=C:\poppler\bin
# set TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
#
# Linux/Mac:
# export POPPLER_PATH=/usr/bin
# export TESSERACT_CMD=/usr/bin/tesseract
# ============================================================================