# --- scripts/01_process_invoices.py ---
"""Convert raw invoices (PDF or image) into PNG images plus OCR JSON files.

For every file in ``data/raw_invoices`` the script writes
``data/processed_images/<name>.png`` and ``data/ocr_results/<name>.json``.
Files whose two outputs already exist are skipped.
"""
import os
import sys
import json
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
import cv2
import numpy as np
import shutil

# Add parent directory to path to import config
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import POPPLER_PATH, apply_tesseract_path

# Apply Tesseract path from config
apply_tesseract_path()

# Project path setup
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
RAW_DIR = os.path.join(BASE_DIR, "data", "raw_invoices")
IMG_DIR = os.path.join(BASE_DIR, "data", "processed_images")
OCR_DIR = os.path.join(BASE_DIR, "data", "ocr_results")

# Create output directories
os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(OCR_DIR, exist_ok=True)


def _load_image(filepath, filename):
    """Load an invoice as a BGR image array, or return None if it cannot be read.

    PDFs are rasterised with pdf2image (first page only); any other file is
    handed to ``cv2.imread``. A warning is printed when nothing could be
    loaded.
    """
    if filename.lower().endswith(".pdf"):
        # Only the first page is used, so only that page is rendered.
        pages = convert_from_path(
            filepath,
            poppler_path=POPPLER_PATH,
            first_page=1,
            last_page=1,
        )
        if not pages:
            print(f"警告: 无法从 {filename} 提取图像。")
            return None
        return cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)

    img = cv2.imread(filepath)
    if img is None:
        print(f"警告: 无法读取图像文件 {filename}。")
    return img


def _run_ocr(img):
    """Run Tesseract on ``img`` and return the cleaned per-token DataFrame.

    Rows with ``conf <= 0`` or missing text are dropped, the text is stripped,
    and rows whose text is empty after stripping are removed.
    """
    ocr_df = pytesseract.image_to_data(
        img,
        lang='swe',  # requires the Swedish language pack to be installed
        output_type=pytesseract.Output.DATAFRAME
    )
    # Take an explicit copy after filtering so the column assignment below
    # modifies an independent frame rather than a view of the raw OCR output.
    ocr_df = ocr_df[ocr_df["conf"] > 0].dropna(subset=["text"]).copy()
    ocr_df["text"] = ocr_df["text"].astype(str).str.strip()
    return ocr_df[ocr_df["text"] != ""]


def _build_text_boxes(ocr_df):
    """Convert cleaned OCR rows into the list of ``text_boxes`` dictionaries."""
    columns = ["text", "left", "top", "width", "height", "conf"]
    rows = ocr_df[columns].itertuples(index=False, name=None)
    # Values are cast to built-in int/float so json.dump never sees NumPy
    # scalar types, regardless of how pandas typed the columns.
    return [
        {
            "text": text,
            "bbox": {
                "x_min": int(left),
                "y_min": int(top),
                "x_max": int(left) + int(width),
                "y_max": int(top) + int(height)
            },
            "confidence": float(conf) / 100.0
        }
        for text, left, top, width, height, conf in rows
    ]


def _write_json(json_path, payload):
    """Write ``payload`` to ``json_path`` via a temporary file.

    The skip check in ``process_invoices`` treats an existing JSON file as a
    finished result, so the file is only moved into place once it has been
    written completely.
    """
    tmp_path = json_path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)
    os.replace(tmp_path, json_path)


def process_invoices():
    """Process every file in ``RAW_DIR`` into a PNG image and an OCR JSON file.

    Files that already have both outputs are skipped. A failure on one file is
    reported and does not stop the remaining files from being processed.
    """
    print(f"开始处理 {RAW_DIR} 中的发票...")

    if not os.path.isdir(RAW_DIR):
        print(f"错误: 输入目录不存在: {RAW_DIR}")
        return

    for filename in sorted(os.listdir(RAW_DIR)):
        filepath = os.path.join(RAW_DIR, filename)
        base_name = os.path.splitext(filename)[0]

        img_path = os.path.join(IMG_DIR, f"{base_name}.png")
        json_path = os.path.join(OCR_DIR, f"{base_name}.json")

        # Avoid reprocessing files that already have both outputs
        if os.path.exists(img_path) and os.path.exists(json_path):
            continue

        try:
            # 1. Load the image (PDF or image file)
            img = _load_image(filepath, filename)
            if img is None:
                continue

            img_h, img_w = img.shape[:2]

            # 2. Save the image in a uniform format; without the image the
            #    JSON would reference a file that does not exist.
            if not cv2.imwrite(img_path, img):
                print(f"警告: 无法保存图像 {img_path}。")
                continue

            # 3-4. Run Tesseract OCR and clean the results
            ocr_df = _run_ocr(img)

            # 5. Convert to the JSON format used by the other script
            #    (including text_boxes)
            output_json = {
                "image_name": f"{base_name}.png",
                "width": img_w,
                "height": img_h,
                "text_boxes": _build_text_boxes(ocr_df)
            }
            _write_json(json_path, output_json)

            print(f"已处理: {filename}")

        except Exception as e:
            # Per-file boundary: report the failure and continue with the next file.
            print(f"处理 {filename} 时出错: {e}")


if __name__ == "__main__":
    process_invoices()