82 lines
2.5 KiB
Python
82 lines
2.5 KiB
Python
"""
Configuration settings for the invoice extraction system.
"""
|
|
|
|
import os
|
|
import platform
|
|
from pathlib import Path
|
|
from dotenv import load_dotenv
|
|
|
|
# Populate os.environ from the project's .env file.
# This module is expected at packages/shared/shared/config.py, so the ancestors
# of its directory are probed farthest-first; the first one holding a .env wins.
_config_dir = Path(__file__).parent
_env_search_roots = (
    _config_dir.parent.parent.parent,
    _config_dir.parent.parent,
    _config_dir.parent,
)
for _candidate in _env_search_roots:
    _env_path = _candidate / '.env'
    if not _env_path.exists():
        continue
    load_dotenv(dotenv_path=_env_path)
    break
else:
    # None of the probed directories had a .env: fall back to python-dotenv's
    # own default discovery.
    load_dotenv()
|
|
|
|
# Project-wide rendering resolution. Keep this equal to the DPI used when the
# model was trained; a mismatch costs model performance.
DEFAULT_DPI: int = 150
|
|
|
|
|
|
def _is_wsl() -> bool:
    """Return True when the process runs under Windows Subsystem for Linux.

    Detection order:
      1. Anything other than a Linux platform is never WSL.
      2. A non-empty ``WSL_DISTRO_NAME`` environment variable means WSL.
      3. Otherwise ``/proc/version`` is read and searched
         (case-insensitively) for the word "microsoft".

    A missing or unreadable ``/proc/version`` is treated as "not WSL".
    """
    on_linux = platform.system() == 'Linux'
    if not on_linux:
        return False

    distro_name = os.environ.get('WSL_DISTRO_NAME')
    if distro_name:
        return True

    try:
        with open('/proc/version', 'r') as version_file:
            kernel_banner = version_file.read()
    except (FileNotFoundError, PermissionError):
        return False
    return 'microsoft' in kernel_banner.lower()
|
|
|
|
|
|
# PostgreSQL connection settings.
# Every field is read from the process environment; host, port, database and
# user fall back to built-in defaults, the password does not.
DATABASE = dict(
    host=os.environ.get('DB_HOST', '192.168.68.31'),
    port=int(os.environ.get('DB_PORT', '5432')),
    database=os.environ.get('DB_NAME', 'docmaster'),
    user=os.environ.get('DB_USER', 'docmaster'),
    password=os.environ.get('DB_PASSWORD'),  # deliberately no default (security)
)
|
|
|
|
# Fail fast at import time when the database password is missing or empty.
if not DATABASE['password']:
    raise ValueError(
        "DB_PASSWORD environment variable is not set. Please create a .env file "
        "based on .env.example and set DB_PASSWORD."
    )
|
|
|
|
# Connection string for psycopg2
def get_db_connection_string() -> str:
    """Build a PostgreSQL connection URI from the ``DATABASE`` settings.

    The user, password and database name are percent-encoded so that
    characters with special meaning inside a URI (``@``, ``:``, ``/``, ``#``,
    ``?``, ``%`` ...) cannot corrupt the result. Values consisting only of
    unreserved characters are emitted unchanged.

    Returns:
        A URI of the form ``postgresql://user:password@host:port/database``.
    """
    # Function-local import so the module's top-level imports stay untouched.
    from urllib.parse import quote

    def _encode(value) -> str:
        # safe='' also escapes '/', which quote() leaves untouched by default.
        return quote(str(value), safe='')

    user = _encode(DATABASE['user'])
    password = _encode(DATABASE['password'])
    database = _encode(DATABASE['database'])
    return (
        f"postgresql://{user}:{password}"
        f"@{DATABASE['host']}:{DATABASE['port']}/{database}"
    )
|
|
|
|
|
|
# Storage layout.
# The data directories are built under STORAGE_BASE_PATH (default
# ~/invoice-data/data, with "~" expanded); reports use the relative path
# 'reports' so they stay in the project directory.
_storage_base = os.path.expanduser(
    os.environ.get('STORAGE_BASE_PATH', '~/invoice-data/data')
)

PATHS = {
    _key: f'{_storage_base}/{_subdir}'
    for _key, _subdir in (
        ('csv_dir', 'structured_data'),
        ('pdf_dir', 'raw_pdfs'),
        ('output_dir', 'datasets'),
    )
}
PATHS['reports_dir'] = 'reports'  # not placed under the storage base
|
|
|
|
# Settings for the auto-labeling step. The DPI follows DEFAULT_DPI; the three
# split ratios (train / val / test) are 0.8 / 0.1 / 0.1.
AUTOLABEL = dict(
    workers=2,
    dpi=DEFAULT_DPI,
    min_confidence=0.5,
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
    max_records_per_report=10000,
)
|