# AmazingDoc/app/core/ocr.py

import pytesseract
from PIL import Image
from typing import List


def extract_text_from_images(
    images: List[Image.Image],
    *,
    lang: str = 'chi_sim+eng',
) -> str:
    """Run OCR over each image and return the recognized text as one string.

    Every image is passed to ``pytesseract.image_to_string``. The per-image
    results are joined, in input order, with a ``--- Page Break ---``
    separator line surrounded by blank lines.

    Processing is best-effort: an image whose OCR call raises is reported
    on stdout and skipped, so it contributes nothing to the result and no
    exception from the OCR call reaches the caller.

    Args:
        images: Images to process.
        lang: Language specification forwarded to pytesseract as its
            ``lang`` argument. Defaults to ``'chi_sim+eng'``, the value
            that used to be hard-coded here.

    Returns:
        The joined text of every image that was processed successfully,
        or an empty string if there were none.
    """
    print("--- [Core OCR] Extracting text...")
    full_text = []
    for img in images:
        try:
            text = pytesseract.image_to_string(img, lang=lang)
        except Exception as e:
            # Deliberately broad: one unreadable image must not abort the
            # remaining ones.
            print(f"--- [Core OCR] Processing image failed: {e}")
            continue
        full_text.append(text)

    combined_text = "\n\n--- Page Break ---\n\n".join(full_text)
    print("--- [Core OCR] Text extraction completed.")
    return combined_text