WIP
This commit is contained in:
@@ -4,20 +4,16 @@ from typing import List
|
||||
|
||||
|
||||
def extract_text_from_images(images: List[Image.Image]) -> str:
    """Extract text from a sequence of images with Tesseract OCR and merge it.

    Each image is passed to ``pytesseract.image_to_string`` with
    ``lang='chi_sim+eng'`` (Simplified Chinese and English recognized
    together). An image whose OCR call raises is reported on stdout and
    skipped, so one bad page does not abort the whole extraction.

    Args:
        images: The images to run OCR on, in output order.

    Returns:
        The recognized text of every successfully processed image, joined
        with a ``--- Page Break ---`` separator line. Returns an empty
        string when ``images`` is empty or every image failed.
    """
    print("--- [Core OCR] Extracting text...")
    full_text = []
    for img in images:
        try:
            # 'chi_sim+eng' recognizes Simplified Chinese and English together.
            text = pytesseract.image_to_string(img, lang='chi_sim+eng')
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next image. Skipped images contribute nothing to the output, so
            # page breaks count successful pages only.
            print(f"--- [Core OCR] Processing image failed: {e}")
            continue
        else:
            full_text.append(text)

    combined_text = "\n\n--- Page Break ---\n\n".join(full_text)
    print("--- [Core OCR] Text extraction completed.")
    return combined_text
|
||||
|
||||
Reference in New Issue
Block a user