Files
AmazingDoc/app/core/ocr.py
2025-08-11 16:42:36 +02:00

24 lines
788 B
Python

import pytesseract
from PIL import Image
from typing import List
def extract_text_from_images(images: List[Image.Image], lang: str = 'chi_sim+eng') -> str:
    """Extract text from a series of images with Tesseract OCR and merge it.

    Each image is passed to ``pytesseract.image_to_string`` on its own. The
    per-page results are joined with a ``--- Page Break ---`` separator line
    (surrounded by blank lines).

    OCR is best-effort per page: if processing one image raises, the error is
    printed and that page is skipped, so the result may contain fewer
    segments than there were input images.

    Args:
        images: The page images to run OCR on.
        lang: Tesseract language specification forwarded to
            ``image_to_string``. The default ``'chi_sim+eng'`` recognizes
            Simplified Chinese and English together.

    Returns:
        The text of all successfully processed pages joined by the page-break
        separator; an empty string when there are no images or every page
        failed.
    """
    print("--- [Core OCR] 正在从图片中提取文本用于向量化...")
    page_texts = []
    failed_pages = 0
    for img in images:
        try:
            page_texts.append(pytesseract.image_to_string(img, lang=lang))
        except Exception as e:
            # Deliberately broad: one unreadable page must not abort the
            # whole document, so report it and move on to the next page.
            print(f"--- [Core OCR] 单页处理失败: {e}")
            failed_pages += 1
    combined_text = "\n\n--- Page Break ---\n\n".join(page_texts)
    # Only claim success when every page was actually processed; otherwise
    # say how many pages were dropped so the log is not misleading.
    if failed_pages:
        print(f"--- [Core OCR] 文本提取完成,但有 {failed_pages} 页处理失败。")
    else:
        print("--- [Core OCR] 文本提取成功。")
    return combined_text