fix: change default OCR language from English to Swedish
The project targets Swedish invoice extraction. PaddleOCR's `sv` model provides better recognition of Swedish-specific characters (å, ä, ö) than the previous English (`en`) default.
This commit is contained in:
@@ -54,8 +54,8 @@ def main():
|
||||
)
|
||||
parser.add_argument(
|
||||
'--lang',
|
||||
default='en',
|
||||
help='OCR language (default: en)'
|
||||
default='sv',
|
||||
help='OCR language (default: sv)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--gpu',
|
||||
|
||||
@@ -85,7 +85,7 @@ class FieldExtractor:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ocr_lang: str = 'en',
|
||||
ocr_lang: str = 'sv',
|
||||
use_gpu: bool = False,
|
||||
bbox_padding: float = 0.1,
|
||||
dpi: int = 300,
|
||||
|
||||
@@ -209,7 +209,7 @@ class InferencePipeline:
|
||||
self,
|
||||
model_path: str | Path,
|
||||
confidence_threshold: float = 0.5,
|
||||
ocr_lang: str = 'en',
|
||||
ocr_lang: str = 'sv',
|
||||
use_gpu: bool = False,
|
||||
dpi: int = 300,
|
||||
enable_fallback: bool = True,
|
||||
|
||||
@@ -37,7 +37,7 @@ class AutoLabelService:
|
||||
def ocr_engine(self) -> OCREngine:
|
||||
"""Lazy initialization of OCR engine."""
|
||||
if self._ocr_engine is None:
|
||||
self._ocr_engine = OCREngine(lang="en")
|
||||
self._ocr_engine = OCREngine(lang="sv")
|
||||
return self._ocr_engine
|
||||
|
||||
def auto_label_document(
|
||||
|
||||
@@ -58,7 +58,7 @@ class OCREngine:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
lang: str = "en",
|
||||
lang: str = "sv",
|
||||
det_model_dir: str | None = None,
|
||||
rec_model_dir: str | None = None,
|
||||
use_doc_orientation_classify: bool = True,
|
||||
@@ -387,7 +387,7 @@ class OCREngine:
|
||||
|
||||
def extract_ocr_tokens(
|
||||
image_path: str | Path,
|
||||
lang: str = "en",
|
||||
lang: str = "sv",
|
||||
page_no: int = 0
|
||||
) -> list[OCRToken]:
|
||||
"""
|
||||
|
||||
@@ -48,7 +48,7 @@ def _init_gpu_worker(gpu_id: int = 0) -> None:
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
# PaddleOCR 3.x init - minimal params, GPU controlled via paddle.set_device
|
||||
_ocr_instance = PaddleOCR(lang="en")
|
||||
_ocr_instance = PaddleOCR(lang="sv")
|
||||
_gpu_initialized = True
|
||||
logger.info(f"GPU worker initialized on GPU {gpu_id} in process {os.getpid()}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user