fix: change default OCR language from English to Swedish

The project targets Swedish invoice extraction. The PaddleOCR `sv` model provides
better recognition of Swedish-specific characters (å, ä, ö) than the English model.
This commit is contained in:
Yaojia Wang
2026-02-12 23:19:51 +01:00
parent 58d36c8927
commit d8f2acb762
6 changed files with 8 additions and 8 deletions

View File

@@ -54,8 +54,8 @@ def main():
)
parser.add_argument(
'--lang',
default='en',
help='OCR language (default: en)'
default='sv',
help='OCR language (default: sv)'
)
parser.add_argument(
'--gpu',

View File

@@ -85,7 +85,7 @@ class FieldExtractor:
def __init__(
self,
ocr_lang: str = 'en',
ocr_lang: str = 'sv',
use_gpu: bool = False,
bbox_padding: float = 0.1,
dpi: int = 300,

View File

@@ -209,7 +209,7 @@ class InferencePipeline:
self,
model_path: str | Path,
confidence_threshold: float = 0.5,
ocr_lang: str = 'en',
ocr_lang: str = 'sv',
use_gpu: bool = False,
dpi: int = 300,
enable_fallback: bool = True,

View File

@@ -37,7 +37,7 @@ class AutoLabelService:
def ocr_engine(self) -> OCREngine:
"""Lazy initialization of OCR engine."""
if self._ocr_engine is None:
self._ocr_engine = OCREngine(lang="en")
self._ocr_engine = OCREngine(lang="sv")
return self._ocr_engine
def auto_label_document(

View File

@@ -58,7 +58,7 @@ class OCREngine:
def __init__(
self,
lang: str = "en",
lang: str = "sv",
det_model_dir: str | None = None,
rec_model_dir: str | None = None,
use_doc_orientation_classify: bool = True,
@@ -387,7 +387,7 @@ class OCREngine:
def extract_ocr_tokens(
image_path: str | Path,
lang: str = "en",
lang: str = "sv",
page_no: int = 0
) -> list[OCRToken]:
"""

View File

@@ -48,7 +48,7 @@ def _init_gpu_worker(gpu_id: int = 0) -> None:
from paddleocr import PaddleOCR
# PaddleOCR 3.x init - minimal params, GPU controlled via paddle.set_device
_ocr_instance = PaddleOCR(lang="en")
_ocr_instance = PaddleOCR(lang="sv")
_gpu_initialized = True
logger.info(f"GPU worker initialized on GPU {gpu_id} in process {os.getpid()}")