# --- scripts/01_process_invoices.py ---
import os
|
|
import sys
|
|
import json
|
|
import pytesseract
|
|
import pandas as pd
|
|
from pdf2image import convert_from_path
|
|
import cv2
|
|
import numpy as np
|
|
import shutil
|
|
|
|
# Add the parent of this script's directory to sys.path so that the
# project-level `config` module can be imported when the script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import POPPLER_PATH, apply_tesseract_path  # noqa: E402  (must follow the sys.path tweak)

# Apply the Tesseract path from config before any OCR call is made.
# NOTE(review): apply_tesseract_path is defined in config and not visible here;
# presumably it points pytesseract at the Tesseract executable — confirm.
apply_tesseract_path()
|
|
|
|
# Project path setup. BASE_DIR is the parent of the directory that holds this
# file; all data folders live under BASE_DIR/data.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
RAW_DIR = os.path.join(BASE_DIR, "data", "raw_invoices")      # input files
IMG_DIR = os.path.join(BASE_DIR, "data", "processed_images")  # PNG output
OCR_DIR = os.path.join(BASE_DIR, "data", "ocr_results")       # JSON output

# Create the output directories up front (no error if they already exist).
os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(OCR_DIR, exist_ok=True)
|
|
|
|
def _load_invoice_image(filepath, filename):
    """Load an invoice as a BGR image array; return None (after warning) on failure."""
    if filename.lower().endswith(".pdf"):
        # Only the first page is used, so ask Poppler to render just that page
        # instead of rasterising the whole document.
        pages = convert_from_path(
            filepath,
            poppler_path=POPPLER_PATH,
            first_page=1,
            last_page=1,
        )
        if not pages:
            print(f"警告: 无法从 {filename} 提取图像。")
            return None
        # pdf2image yields PIL images (RGB); the rest of the pipeline uses
        # OpenCV's BGR channel order.
        return cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)

    img = cv2.imread(filepath)
    if img is None:
        print(f"警告: 无法读取图像文件 {filename}。")
    return img


def _extract_text_boxes(img):
    """Run Tesseract on a BGR image and return the cleaned list of text boxes."""
    # pytesseract interprets numpy arrays as RGB, so undo OpenCV's BGR order.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    frame = pytesseract.image_to_data(
        rgb,
        lang='swe',  # requires the Swedish language pack to be installed
        output_type=pytesseract.Output.DATAFRAME,
    )

    # Keep rows with positive confidence and real text. The explicit copy
    # avoids assigning into a view of the original frame (SettingWithCopy).
    frame = frame[frame["conf"] > 0].dropna(subset=["text"]).copy()
    frame["text"] = frame["text"].astype(str).str.strip()
    frame = frame[frame["text"] != ""]

    # Cast to built-in int/float so the result is always JSON-serialisable.
    return [
        {
            "text": row.text,
            "bbox": {
                "x_min": int(row.left),
                "y_min": int(row.top),
                "x_max": int(row.left + row.width),
                "y_max": int(row.top + row.height),
            },
            "confidence": float(row.conf) / 100.0,
        }
        for row in frame.itertuples(index=False)
    ]


def _write_json_atomically(json_path, payload):
    """Write payload as JSON so that json_path never holds a partial file."""
    # Serialise first: a serialisation error must not leave a truncated file
    # behind, because the "already processed" check only tests for existence.
    text = json.dumps(payload, indent=4, ensure_ascii=False)
    tmp_path = json_path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(text)
    os.replace(tmp_path, json_path)


def process_invoices() -> None:
    """Convert every invoice in RAW_DIR into a PNG plus a JSON file of OCR boxes.

    For each entry in RAW_DIR (PDFs use their first page; anything else is read
    with OpenCV) the image is saved to IMG_DIR as ``<base_name>.png`` and the
    Tesseract results are saved to OCR_DIR as ``<base_name>.json`` with the keys
    ``image_name``, ``width``, ``height`` and ``text_boxes``.

    Entries whose PNG and JSON both already exist are skipped. A failure on one
    file is printed and does not stop the batch. Returns nothing.
    """
    print(f"开始处理 {RAW_DIR} 中的发票...")

    # RAW_DIR is never created by this script, so guard against it missing
    # rather than letting os.listdir raise.
    if not os.path.isdir(RAW_DIR):
        print(f"错误: 输入目录不存在: {RAW_DIR}")
        return

    # Sorted for a deterministic processing order across runs and platforms.
    for filename in sorted(os.listdir(RAW_DIR)):
        filepath = os.path.join(RAW_DIR, filename)
        base_name = os.path.splitext(filename)[0]
        img_path = os.path.join(IMG_DIR, f"{base_name}.png")
        json_path = os.path.join(OCR_DIR, f"{base_name}.json")

        # Avoid reprocessing files that already have both outputs.
        if os.path.exists(img_path) and os.path.exists(json_path):
            continue

        # Per-file boundary: report the error and carry on with the next file.
        try:
            # 1. Load the image (PDF or raster image).
            img = _load_invoice_image(filepath, filename)
            if img is None:
                continue

            img_h, img_w = img.shape[:2]

            # 2. Save the image in a uniform format. cv2.imwrite signals
            #    failure through its return value, not an exception.
            if not cv2.imwrite(img_path, img):
                raise OSError(f"cv2.imwrite failed for {img_path}")

            # 3-5. Run OCR, clean the results and build the JSON structure
            #      (with text_boxes) consumed by the downstream script.
            output_json = {
                "image_name": f"{base_name}.png",
                "width": img_w,
                "height": img_h,
                "text_boxes": _extract_text_boxes(img),
            }

            _write_json_atomically(json_path, output_json)

            print(f"已处理: {filename}")

        except Exception as e:
            print(f"处理 {filename} 时出错: {e}")
|
|
|
if __name__ == "__main__":
    # Run the batch only when executed as a script, not when imported.
    process_invoices()