Files
invoice-master/scripts/01_process_invoices.py
Yaojia Wang dafa86c588 Init
2025-10-26 20:41:11 +01:00

106 lines
3.7 KiB
Python

# --- scripts/01_process_invoices.py ---
import os
import sys
import json
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
import cv2
import numpy as np
import shutil
# Add parent directory to path to import config
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import POPPLER_PATH, apply_tesseract_path
# Apply Tesseract path from config
apply_tesseract_path()
# Project paths: BASE_DIR is the parent of the directory that holds this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Input and output folders all live under <BASE_DIR>/data.
RAW_DIR, IMG_DIR, OCR_DIR = (
    os.path.join(BASE_DIR, "data", sub_dir)
    for sub_dir in ("raw_invoices", "processed_images", "ocr_results")
)
# Create the output directories up front (the input directory is not created).
os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(OCR_DIR, exist_ok=True)
def _load_invoice_image(filepath, filename):
    """Load one invoice file as a BGR image array.

    Files whose name ends in ``.pdf`` (case-insensitive) are rasterized with
    pdf2image and only the first page is used; every other file is passed to
    ``cv2.imread``.

    Args:
        filepath: Full path of the file to load.
        filename: Bare file name, used for the extension check and messages.

    Returns:
        The image as an array, or ``None`` (after printing a warning) when no
        image could be obtained.
    """
    if filename.lower().endswith(".pdf"):
        # Only page 1 is consumed, so ask pdf2image to rasterize just that
        # page instead of converting the whole document.
        pages = convert_from_path(
            filepath,
            poppler_path=POPPLER_PATH,
            first_page=1,
            last_page=1,
        )
        if not pages:
            print(f"警告: 无法从 {filename} 提取图像。")
            return None
        # pdf2image yields PIL images (RGB); OpenCV expects BGR channel order.
        return cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)

    img = cv2.imread(filepath)
    if img is None:
        print(f"警告: 无法读取图像文件 {filename}")
    return img


def _ocr_text_boxes(img):
    """Run Tesseract on ``img`` and return the cleaned list of text boxes.

    Rows with ``conf <= 0``, missing text, or text that is empty after
    stripping whitespace are discarded.

    Returns:
        A list of dicts with keys ``text``, ``bbox`` (``x_min``, ``y_min``,
        ``x_max``, ``y_max``) and ``confidence`` (``conf`` divided by 100).
    """
    ocr_df = pytesseract.image_to_data(
        img,
        lang='swe',  # requires the Swedish language pack to be installed
        output_type=pytesseract.Output.DATAFRAME,
    )

    # Filter once and take an explicit copy so the column assignment below
    # operates on an independent frame rather than on a slice of the original.
    keep = (ocr_df["conf"] > 0) & ocr_df["text"].notna()
    ocr_df = ocr_df[keep].copy()
    ocr_df["text"] = ocr_df["text"].astype(str).str.strip()
    ocr_df = ocr_df[ocr_df["text"] != ""]

    # Coerce to builtin int/float so JSON serialization never depends on the
    # scalar types pandas hands back for a row.
    return [
        {
            "text": row.text,
            "bbox": {
                "x_min": int(row.left),
                "y_min": int(row.top),
                "x_max": int(row.left) + int(row.width),
                "y_max": int(row.top) + int(row.height),
            },
            "confidence": float(row.conf) / 100.0,
        }
        for row in ocr_df.itertuples(index=False)
    ]


def process_invoices():
    """OCR every invoice in ``RAW_DIR`` and write an image plus a JSON file.

    For each entry in ``RAW_DIR`` the function:

    1. loads it as an image (first page only for PDFs),
    2. saves it as ``<base_name>.png`` in ``IMG_DIR``,
    3. runs Tesseract OCR and cleans the result,
    4. writes ``<base_name>.json`` in ``OCR_DIR`` containing the image name,
       its width/height and the list of ``text_boxes``.

    Entries whose PNG and JSON outputs both already exist are skipped. A
    failure on one file is reported and does not stop the batch. If
    ``RAW_DIR`` does not exist, a message is printed and nothing is processed.
    """
    print(f"开始处理 {RAW_DIR} 中的发票...")

    # Only the output directories are created at import time; a missing input
    # directory would otherwise surface as an unhandled FileNotFoundError.
    if not os.path.isdir(RAW_DIR):
        print(f"错误: 输入目录不存在: {RAW_DIR}")
        return

    # Sorted so that the processing order (and the log) is reproducible.
    for filename in sorted(os.listdir(RAW_DIR)):
        filepath = os.path.join(RAW_DIR, filename)
        base_name = os.path.splitext(filename)[0]
        img_path = os.path.join(IMG_DIR, f"{base_name}.png")
        json_path = os.path.join(OCR_DIR, f"{base_name}.json")

        # Avoid re-processing files that already have both outputs.
        if os.path.exists(img_path) and os.path.exists(json_path):
            continue

        # Broad catch is deliberate: this is the per-file boundary of a batch
        # job, so one bad invoice must not abort the remaining ones.
        try:
            # 1. Load the image (PDF or raster image).
            img = _load_invoice_image(filepath, filename)
            if img is None:
                continue
            img_h, img_w = img.shape[:2]

            # 2. Save the image in a uniform format.
            cv2.imwrite(img_path, img)

            # 3-4. Run Tesseract OCR and clean the result.
            text_boxes = _ocr_text_boxes(img)

            # 5. Build the JSON structure (with text_boxes) that the
            #    downstream script consumes.
            output_json = {
                "image_name": f"{base_name}.png",
                "width": img_w,
                "height": img_h,
                "text_boxes": text_boxes,
            }

            # Serialize before opening the file: if encoding fails, no
            # truncated JSON is left behind to be mistaken for a finished
            # result by the "already processed" check above.
            payload = json.dumps(output_json, indent=4, ensure_ascii=False)
            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(payload)

            print(f"已处理: {filename}")
        except Exception as e:
            print(f"处理 {filename} 时出错: {e}")
# Script entry point: run the batch only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    process_invoices()