Init
This commit is contained in:
137
config.py
Normal file
137
config.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Configuration file for system-dependent paths and settings.
|
||||
|
||||
This file contains paths that may vary between different systems.
|
||||
Copy this file and modify the paths according to your local installation.
|
||||
"""
|
||||
|
||||
import os
import shutil
from pathlib import Path
|
||||
|
||||
# ============================================================================
# System Paths - Modify these according to your installation
# ============================================================================

# Poppler path (required for PDF to image conversion).
# Taken from the POPPLER_PATH environment variable when it is set; otherwise
# the Windows install location below is used.
# Download from: https://github.com/oschwartz10612/poppler-windows/releases
# Example: r"C:\poppler-23.11.0\bin"
POPPLER_PATH = os.getenv("POPPLER_PATH", r"C:\Program Files\poppler-25.07.0\Library\bin")

# Tesseract path (optional - only needed if not in system PATH).
# Taken from the TESSERACT_CMD environment variable; None when it is unset.
# Download from: https://github.com/UB-Mannheim/tesseract/wiki
# Example: r"C:\Program Files\Tesseract-OCR\tesseract.exe"
TESSERACT_CMD = os.getenv("TESSERACT_CMD")
|
||||
|
||||
# ============================================================================
# Project Paths - Generally don't need to modify these
# ============================================================================

# Project root directory (the directory that contains this file)
PROJECT_ROOT = Path(__file__).parent.absolute()

# Data directories
DATA_DIR = PROJECT_ROOT / "data"
RAW_INVOICES_DIR = DATA_DIR / "raw_invoices"
PROCESSED_IMAGES_DIR = DATA_DIR / "processed_images"
OCR_RESULTS_DIR = DATA_DIR / "ocr_results"

# YOLO dataset directories
YOLO_DATASET_DIR = DATA_DIR / "yolo_dataset"
YOLO_TEMP_IMAGES_DIR = YOLO_DATASET_DIR / "temp_all_images"
YOLO_TEMP_LABELS_DIR = YOLO_DATASET_DIR / "temp_all_labels"
YOLO_TRAIN_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "train"
YOLO_TRAIN_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "train"
YOLO_VAL_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "val"
YOLO_VAL_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "val"

# Model directories
MODELS_DIR = PROJECT_ROOT / "models"
DEFAULT_MODEL_PATH = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "best.pt"
|
||||
|
||||
# ============================================================================
# OCR Settings
# ============================================================================

# Tesseract language code. Only Swedish is enabled here, so make sure the
# Swedish language pack is installed. Tesseract accepts several languages
# joined with "+", e.g. "swe+eng" to recognise English as well.
TESSERACT_LANG = "swe"

# OCR confidence threshold (0-100)
OCR_CONFIDENCE_THRESHOLD = 0
|
||||
|
||||
# ============================================================================
# Training Settings
# ============================================================================

# YOLO model size: n (nano), s (small), m (medium), l (large), x (xlarge)
YOLO_MODEL_SIZE = "n"

# Training epochs
TRAINING_EPOCHS = 100

# Batch size
BATCH_SIZE = 16

# Image size for training
IMAGE_SIZE = 640

# Validation split ratio (0.0 to 1.0)
VALIDATION_SPLIT = 0.2

# Random seed for reproducibility
RANDOM_SEED = 42
|
||||
|
||||
# ============================================================================
# API Settings (for main.py FastAPI server)
# ============================================================================

# API host
API_HOST = "127.0.0.1"

# API port
API_PORT = 8000
|
||||
|
||||
# ============================================================================
# Helper Functions
# ============================================================================


def apply_tesseract_path() -> None:
    """Apply the configured Tesseract path to pytesseract.

    When ``TESSERACT_CMD`` is set (non-empty), assigns it to
    ``pytesseract.pytesseract.tesseract_cmd``. When it is unset or empty the
    function returns without importing pytesseract.
    """
    if not TESSERACT_CMD:
        return

    # Imported here rather than at module level, so pytesseract is only
    # loaded when a Tesseract path is actually configured.
    import pytesseract

    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
|
||||
|
||||
def validate_paths() -> bool:
    """Validate that required system paths exist.

    Checks that ``POPPLER_PATH`` exists and, when ``TESSERACT_CMD`` is set,
    that it is either an existing filesystem path or a command that can be
    resolved on the system PATH. Every problem found is printed to stdout
    together with a download hint.

    Returns:
        True when no issues were found, False otherwise.
    """
    issues = []

    # Check Poppler
    if not os.path.exists(POPPLER_PATH):
        issues.append(f"Poppler not found at: {POPPLER_PATH}")
        issues.append(" Download from: https://github.com/oschwartz10612/poppler-windows/releases")

    # Check Tesseract (if specified). The value may be a bare command name
    # such as "tesseract" rather than a full path, so fall back to a PATH
    # lookup before reporting it as missing.
    if TESSERACT_CMD:
        tesseract_found = os.path.exists(TESSERACT_CMD) or shutil.which(TESSERACT_CMD) is not None
        if not tesseract_found:
            issues.append(f"Tesseract not found at: {TESSERACT_CMD}")
            issues.append(" Download from: https://github.com/UB-Mannheim/tesseract/wiki")

    if not issues:
        return True

    print("Configuration Issues Found:")
    for issue in issues:
        print(f" {issue}")
    return False
|
||||
|
||||
# ============================================================================
# Example: Environment Variable Override
# ============================================================================
# You can set these in your environment instead of modifying this file:
#
# Windows:
#   set POPPLER_PATH=C:\poppler\bin
#   set TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
#
# Linux/Mac:
#   export POPPLER_PATH=/usr/bin
#   export TESSERACT_CMD=/usr/bin/tesseract
# ============================================================================
|
||||
Reference in New Issue
Block a user