WIP

2025-08-13 23:30:22 +02:00
parent 87ba009bd7
commit 02290bb935
4 changed files with 13 additions and 11 deletions
--- a/app/core/ocr.py
+++ b/app/core/ocr.py
@@ -4,20 +4,16 @@ from typing import List


 def extract_text_from_images(images: List[Image.Image]) -> str:
-    """
-    使用Tesseract OCR从一系列图片中提取并合并所有文本。
-    """
-    print("--- [Core OCR] 正在从图片中提取文本用于向量化...")
+    print("--- [Core OCR] Extracting text...")
    full_text = []
    for img in images:
        try:
-            # lang='chi_sim+eng' 表示同时识别简体中文和英文
            text = pytesseract.image_to_string(img, lang='chi_sim+eng')
            full_text.append(text)
        except Exception as e:
-            print(f"--- [Core OCR] 单页处理失败: {e}")
+            print(f"--- [Core OCR] Processing image failed: {e}")
            continue

    combined_text = "\n\n--- Page Break ---\n\n".join(full_text)
-    print("--- [Core OCR] 文本提取成功。")
+    print("--- [Core OCR] Text extraction completed.")
    return combined_text