Prepare vectorization

2025-08-11 16:42:36 +02:00
parent f077c6351d
commit 0c6d008368
5 changed files with 79 additions and 38 deletions
--- a/app/core/ocr.py
+++ b/app/core/ocr.py
@@ -0,0 +1,23 @@
+import pytesseract
+from PIL import Image
+from typing import List
+
+
+def extract_text_from_images(images: List[Image.Image]) -> str:
+    """Run Tesseract OCR over each image and merge the recognised text.
+
+    Each image is recognised with lang='chi_sim+eng' (Simplified Chinese plus
+    English). An image whose OCR call raises is reported on stdout and left
+    out; the remaining texts are joined with a "--- Page Break ---" separator.
+    """
+    print("--- [Core OCR] 正在从图片中提取文本用于向量化...")
+    page_texts = []
+    for page in images:
+        try:
+            page_texts.append(pytesseract.image_to_string(page, lang='chi_sim+eng'))
+        except Exception as err:  # best effort: skip the page, keep going
+            print(f"--- [Core OCR] 单页处理失败: {err}")
+
+    merged = "\n\n--- Page Break ---\n\n".join(page_texts)
+    print("--- [Core OCR] 文本提取成功。")
+    return merged