20 lines
604 B
Python
20 lines
604 B
Python
import pytesseract
|
|
from PIL import Image
|
|
from typing import List
|
|
|
|
|
|
def extract_text_from_images(images: List[Image.Image]) -> str:
    """Run OCR over each image and return all recognized text as one string.

    Each image is passed to ``pytesseract.image_to_string`` with the
    ``chi_sim+eng`` language setting. Processing is best-effort: when OCR
    raises for an image, the error is printed and that image contributes
    nothing to the result, so the output may contain fewer sections than
    there were input images.

    Args:
        images: Images to recognize, processed in the order given.

    Returns:
        The recognized texts joined by a ``--- Page Break ---`` separator
        line (surrounded by blank lines). An empty string when no image
        produced text.
    """
    print("--- [Core OCR] Extracting text...")

    page_separator = "\n\n--- Page Break ---\n\n"
    recognized_pages = []

    for page_image in images:
        try:
            page_text = pytesseract.image_to_string(page_image, lang='chi_sim+eng')
        except Exception as e:
            # Best-effort: report the failure and move on to the next image.
            print(f"--- [Core OCR] Processing image failed: {e}")
        else:
            recognized_pages.append(page_text)

    # Join before announcing completion so a join failure is not preceded
    # by a misleading "completed" message.
    combined_text = page_separator.join(recognized_pages)
    print("--- [Core OCR] Text extraction completed.")
    return combined_text
|