# AmazingDoc/app/core/ocr.py

# app/core/ocr.py
import pytesseract
from PIL import Image


# NOTE: Google's Tesseract OCR engine must be installed on your system before
# this module can be used. See the installation instructions for details.

def extract_text_from_image(image: Image.Image, lang: str = 'chi_sim+eng') -> str:
    """
    Extract text from a Pillow Image object using Tesseract OCR.

    Args:
        image: The Pillow Image object to run OCR on.
        lang: Language specification passed through to
            ``pytesseract.image_to_string``. The default ``'chi_sim+eng'``
            recognizes Simplified Chinese and English together.

    Returns:
        The text string extracted from the image.

    Raises:
        IOError: If anything inside the OCR step raises. The original
            exception is attached as ``__cause__`` so the underlying failure
            remains visible in the traceback.
    """
    try:
        print("--- [Core OCR] 正在从图片中提取文本用于分类...")
        text = pytesseract.image_to_string(image, lang=lang)
        print("--- [Core OCR] 文本提取成功。")
        return text
    except Exception as e:
        print(f"--- [Core OCR] OCR处理失败: {e}")
        # Callers rely on a single IOError type for any OCR failure; chain the
        # original exception explicitly so its type and traceback are kept.
        raise IOError(f"OCR processing failed: {e}") from e