Init project

2025-08-11 00:07:41 +02:00
parent 840daf2d08
commit 0a80400720
23 changed files with 660 additions and 0 deletions
--- a/app/core/ocr.py
+++ b/app/core/ocr.py
@@ -0,0 +1,27 @@
+# app/core/ocr.py
+import pytesseract
+from PIL import Image
+
+
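+# NOTE: Google's Tesseract OCR engine (with the chi_sim and eng language data)
+# must be installed on the system first; see the project's installation instructions.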
+
+def extract_text_from_image(image: Image.Image) -> str:
+    """
+    使用Tesseract OCR从Pillow Image对象中提取文本。
+
+    参数:
+        image: Pillow Image对象。
+
+    返回:
+        从图片中提取出的字符串文本。
+    """
+    try:
+        print("--- [Core OCR] 正在从图片中提取文本用于分类...")
+        # lang='chi_sim+eng' 表示同时识别简体中文和英文
+        text = pytesseract.image_to_string(image, lang='chi_sim+eng')
+        print("--- [Core OCR] 文本提取成功。")
+        return text
+    except Exception as e:
+        print(f"--- [Core OCR] OCR处理失败: {e}")
+        raise IOError(f"OCR processing failed: {e}")