Init

2025-10-26 20:41:11 +01:00
commit dafa86c588
11 changed files with 1171 additions and 0 deletions
--- a/scripts/01_process_invoices.py
+++ b/scripts/01_process_invoices.py
@@ -0,0 +1,106 @@
+# --- scripts/01_process_invoices.py ---
+
+import os
+import sys
+import json
+import pytesseract
+import pandas as pd
+from pdf2image import convert_from_path
+import cv2
+import numpy as np
+import shutil
+
+# Put the parent of this script's directory first on sys.path so that `config` can be imported below
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from config import POPPLER_PATH, apply_tesseract_path
+
+# Apply Tesseract path from config (helper lives in config; presumably it points pytesseract at the binary - verify there)
+apply_tesseract_path()
+
+# Project path setup: BASE_DIR is the parent of this script's directory; the data folders live under BASE_DIR/data
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+RAW_DIR = os.path.join(BASE_DIR, "data", "raw_invoices")
+IMG_DIR = os.path.join(BASE_DIR, "data", "processed_images")
+OCR_DIR = os.path.join(BASE_DIR, "data", "ocr_results")
+
+# Create the output directories when the module is loaded (exist_ok=True: no error if they already exist)
+os.makedirs(IMG_DIR, exist_ok=True)
+os.makedirs(OCR_DIR, exist_ok=True)
+
+def process_invoices():
+    print(f"开始处理 {RAW_DIR} 中的发票...")
+    
+    for filename in os.listdir(RAW_DIR):
+        filepath = os.path.join(RAW_DIR, filename)
+        base_name = os.path.splitext(filename)[0]
+        img_path = os.path.join(IMG_DIR, f"{base_name}.png")
+        json_path = os.path.join(OCR_DIR, f"{base_name}.json")
+
+        # 防止重复处理
+        if os.path.exists(img_path) and os.path.exists(json_path):
+            continue
+
+        try:
+            # 1. 加载图像 (PDF 或 图片)
+            if filename.lower().endswith(".pdf"):
+                images = convert_from_path(filepath, poppler_path=POPPLER_PATH)
+                if not images:
+                    print(f"警告: 无法从 {filename} 提取图像。")
+                    continue
+                img_pil = images[0] # 取第一页
+                img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
+            else:
+                img = cv2.imread(filepath)
+                if img is None:
+                    print(f"警告: 无法读取图像文件 {filename}。")
+                    continue
+            
+            img_h, img_w = img.shape[:2]
+            
+            # 2. 保存统一格式的图像
+            cv2.imwrite(img_path, img)
+            
+            # 3. 运行 Tesseract OCR
+            ocr_data_df = pytesseract.image_to_data(
+                img, 
+                lang='swe', # 确保已安装瑞典语包
+                output_type=pytesseract.Output.DATAFRAME
+            )
+            
+            # 4. 清理 OCR 结果
+            ocr_data_df = ocr_data_df[ocr_data_df.conf > 0]
+            ocr_data_df.dropna(subset=['text'], inplace=True)
+            ocr_data_df['text'] = ocr_data_df['text'].astype(str).str.strip()
+            ocr_data_df = ocr_data_df[ocr_data_df['text'] != ""]
+            
+            # 5. 转换为您在另一脚本中使用的 JSON 格式 (包含 text_boxes)
+            text_boxes = []
+            for i, row in ocr_data_df.iterrows():
+                text_boxes.append({
+                    "text": row["text"],
+                    "bbox": {
+                        "x_min": row["left"],
+                        "y_min": row["top"],
+                        "x_max": row["left"] + row["width"],
+                        "y_max": row["top"] + row["height"]
+                    },
+                    "confidence": row["conf"] / 100.0
+                })
+            
+            output_json = {
+                "image_name": f"{base_name}.png",
+                "width": img_w,
+                "height": img_h,
+                "text_boxes": text_boxes
+            }
+
+            with open(json_path, 'w', encoding='utf-8') as f:
+                json.dump(output_json, f, indent=4, ensure_ascii=False)
+            
+            print(f"已处理: {filename}")
+            
+        except Exception as e:
+            print(f"处理 {filename} 时出错: {e}")
+
+if __name__ == "__main__":  # run the batch only when executed as a script, not when imported
+    process_invoices()