# invoice-master/scripts/01_process_invoices.py

# --- scripts/01_process_invoices.py ---

import os
import sys
import json
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
import cv2
import numpy as np
import shutil

# Add parent directory to path to import config.
# The directory one level above this script's folder is inserted at the front
# of sys.path, so the `config` import below is resolved from there first.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import POPPLER_PATH, apply_tesseract_path

# Apply Tesseract path from config.
# NOTE(review): presumably this points pytesseract at the Tesseract
# executable; confirm against config.apply_tesseract_path.
apply_tesseract_path()

# Project layout: BASE_DIR is the parent of the folder containing this script,
# and every input/output folder lives under BASE_DIR/data.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)
RAW_DIR, IMG_DIR, OCR_DIR = (
    os.path.join(BASE_DIR, "data", _subdir)
    for _subdir in ("raw_invoices", "processed_images", "ocr_results")
)

# Create the output directories up front (no error if they already exist).
for _output_dir in (IMG_DIR, OCR_DIR):
    os.makedirs(_output_dir, exist_ok=True)

def _load_invoice_image(filepath):
    """Load an invoice file as a BGR image array.

    Args:
        filepath: Path to a PDF, or to an image file readable by OpenCV.

    Returns:
        The image as a BGR numpy array, or ``None`` when nothing could be
        loaded (a PDF that yields no pages, or a file ``cv2.imread`` rejects).
    """
    if not filepath.lower().endswith(".pdf"):
        return cv2.imread(filepath)

    # Only the first page is used, so ask Poppler to render just that page
    # instead of rasterising the whole document.
    pages = convert_from_path(
        filepath,
        poppler_path=POPPLER_PATH,
        first_page=1,
        last_page=1,
    )
    if not pages:
        return None
    # The rendered page is in RGB order; OpenCV works with BGR.
    return cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)


def _extract_text_boxes(img):
    """Run Tesseract on a BGR image and return the recognised words.

    Args:
        img: Image as a BGR numpy array.

    Returns:
        A list of dicts with keys ``text``, ``bbox`` (``x_min``, ``y_min``,
        ``x_max``, ``y_max``) and ``confidence`` (Tesseract's ``conf``
        divided by 100). All values are native Python types so the result
        can be passed straight to ``json``.
    """
    # pytesseract wraps numpy arrays with PIL, which interprets them as RGB,
    # so the OpenCV BGR channels must be swapped back before OCR.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    ocr_df = pytesseract.image_to_data(
        rgb,
        lang='swe',  # requires the Swedish language pack to be installed
        output_type=pytesseract.Output.DATAFRAME,
    )

    # Keep rows with positive confidence and non-empty text. The explicit
    # copy avoids mutating a slice of the original frame.
    ocr_df = ocr_df[ocr_df["conf"] > 0].dropna(subset=["text"]).copy()
    ocr_df["text"] = ocr_df["text"].astype(str).str.strip()
    ocr_df = ocr_df[ocr_df["text"] != ""]

    columns = ["text", "left", "top", "width", "height", "conf"]
    return [
        {
            "text": text,
            "bbox": {
                "x_min": int(left),
                "y_min": int(top),
                "x_max": int(left) + int(width),
                "y_max": int(top) + int(height),
            },
            "confidence": float(conf) / 100.0,
        }
        for text, left, top, width, height, conf
        in ocr_df[columns].itertuples(index=False, name=None)
    ]


def process_invoices():
    """OCR every invoice in RAW_DIR and write a PNG plus a JSON result.

    For each regular file in RAW_DIR (processed in sorted order):
      1. load it (first page of a PDF, or an image file),
      2. save it as ``<base_name>.png`` in IMG_DIR,
      3. run Tesseract and write ``<base_name>.json`` to OCR_DIR with the
         image name, size and a ``text_boxes`` list.

    Files whose PNG and JSON both already exist are skipped. A failure on
    one file is reported and does not stop the rest of the batch.

    Returns:
        None.
    """
    print(f"开始处理 {RAW_DIR} 中的发票...")

    # The output directories are created elsewhere, but the input directory
    # may legitimately be missing; report it instead of crashing.
    if not os.path.isdir(RAW_DIR):
        print(f"警告: 输入目录不存在: {RAW_DIR}")
        return

    for filename in sorted(os.listdir(RAW_DIR)):
        filepath = os.path.join(RAW_DIR, filename)
        if not os.path.isfile(filepath):
            # Sub-directories are not invoices.
            continue

        base_name = os.path.splitext(filename)[0]
        img_path = os.path.join(IMG_DIR, f"{base_name}.png")
        json_path = os.path.join(OCR_DIR, f"{base_name}.json")

        # Skip files that were already fully processed.
        if os.path.exists(img_path) and os.path.exists(json_path):
            continue

        # Best-effort batch: one bad file must not abort the whole run, so
        # any per-file error is reported and the loop moves on.
        try:
            # 1. Load the image (PDF or image file).
            img = _load_invoice_image(filepath)
            if img is None:
                if filename.lower().endswith(".pdf"):
                    print(f"警告: 无法从 {filename} 提取图像。")
                else:
                    print(f"警告: 无法读取图像文件 {filename}。")
                continue

            img_h, img_w = img.shape[:2]

            # 2. Save the image in a uniform format. imwrite signals some
            # failures only through its return value.
            if not cv2.imwrite(img_path, img):
                print(f"警告: 无法保存图像 {img_path}。")
                continue

            # 3./4. Run OCR and clean up the result.
            text_boxes = _extract_text_boxes(img)

            # 5. Build the JSON structure (with text_boxes) used downstream.
            output_json = {
                "image_name": f"{base_name}.png",
                "width": img_w,
                "height": img_h,
                "text_boxes": text_boxes
            }

            # Serialise before opening the file so a serialisation error
            # cannot leave a partial JSON that the skip check would accept.
            payload = json.dumps(output_json, indent=4, ensure_ascii=False)
            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(payload)

            print(f"已处理: {filename}")

        except Exception as e:
            print(f"处理 {filename} 时出错: {e}")

# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    process_invoices()