fix: change default OCR language from English to Swedish

The project targets Swedish invoice extraction. PaddleOCR's sv model provides
better recognition of Swedish-specific characters (å, ä, ö).
This commit is contained in:
Yaojia Wang
2026-02-12 23:19:51 +01:00
parent 58d36c8927
commit d8f2acb762
6 changed files with 8 additions and 8 deletions

View File

@@ -54,8 +54,8 @@ def main():
) )
parser.add_argument( parser.add_argument(
'--lang', '--lang',
default='en', default='sv',
help='OCR language (default: en)' help='OCR language (default: sv)'
) )
parser.add_argument( parser.add_argument(
'--gpu', '--gpu',

View File

@@ -85,7 +85,7 @@ class FieldExtractor:
def __init__( def __init__(
self, self,
ocr_lang: str = 'en', ocr_lang: str = 'sv',
use_gpu: bool = False, use_gpu: bool = False,
bbox_padding: float = 0.1, bbox_padding: float = 0.1,
dpi: int = 300, dpi: int = 300,

View File

@@ -209,7 +209,7 @@ class InferencePipeline:
self, self,
model_path: str | Path, model_path: str | Path,
confidence_threshold: float = 0.5, confidence_threshold: float = 0.5,
ocr_lang: str = 'en', ocr_lang: str = 'sv',
use_gpu: bool = False, use_gpu: bool = False,
dpi: int = 300, dpi: int = 300,
enable_fallback: bool = True, enable_fallback: bool = True,

View File

@@ -37,7 +37,7 @@ class AutoLabelService:
def ocr_engine(self) -> OCREngine: def ocr_engine(self) -> OCREngine:
"""Lazy initialization of OCR engine.""" """Lazy initialization of OCR engine."""
if self._ocr_engine is None: if self._ocr_engine is None:
self._ocr_engine = OCREngine(lang="en") self._ocr_engine = OCREngine(lang="sv")
return self._ocr_engine return self._ocr_engine
def auto_label_document( def auto_label_document(

View File

@@ -58,7 +58,7 @@ class OCREngine:
def __init__( def __init__(
self, self,
lang: str = "en", lang: str = "sv",
det_model_dir: str | None = None, det_model_dir: str | None = None,
rec_model_dir: str | None = None, rec_model_dir: str | None = None,
use_doc_orientation_classify: bool = True, use_doc_orientation_classify: bool = True,
@@ -387,7 +387,7 @@ class OCREngine:
def extract_ocr_tokens( def extract_ocr_tokens(
image_path: str | Path, image_path: str | Path,
lang: str = "en", lang: str = "sv",
page_no: int = 0 page_no: int = 0
) -> list[OCRToken]: ) -> list[OCRToken]:
""" """

View File

@@ -48,7 +48,7 @@ def _init_gpu_worker(gpu_id: int = 0) -> None:
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
# PaddleOCR 3.x init - minimal params, GPU controlled via paddle.set_device # PaddleOCR 3.x init - minimal params, GPU controlled via paddle.set_device
_ocr_instance = PaddleOCR(lang="en") _ocr_instance = PaddleOCR(lang="sv")
_gpu_initialized = True _gpu_initialized = True
logger.info(f"GPU worker initialized on GPU {gpu_id} in process {os.getpid()}") logger.info(f"GPU worker initialized on GPU {gpu_id} in process {os.getpid()}")