Init
This commit is contained in:
106
scripts/01_process_invoices.py
Normal file
106
scripts/01_process_invoices.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# --- scripts/01_process_invoices.py ---
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import pytesseract
|
||||
import pandas as pd
|
||||
from pdf2image import convert_from_path
|
||||
import cv2
|
||||
import numpy as np
|
||||
import shutil
|
||||
|
||||
# Project root: the parent of the directory that contains this script.
# It is resolved once and reused for both the import path and the data paths.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the project root at the front of sys.path so `config` can be imported.
sys.path.insert(0, BASE_DIR)
from config import POPPLER_PATH, apply_tesseract_path

# Apply the Tesseract path from config.
apply_tesseract_path()

# Project data directories: raw input, normalised images, OCR output.
RAW_DIR, IMG_DIR, OCR_DIR = (
    os.path.join(BASE_DIR, "data", sub_dir)
    for sub_dir in ("raw_invoices", "processed_images", "ocr_results")
)

# Create the output directories if they do not exist yet.
for _out_dir in (IMG_DIR, OCR_DIR):
    os.makedirs(_out_dir, exist_ok=True)
|
||||
|
||||
def _load_invoice_image(filepath, filename):
    """Load one invoice file as a BGR OpenCV image.

    Files whose name ends in ``.pdf`` (case-insensitive) are rasterised with
    pdf2image and only their first page is used. Any other file is read with
    ``cv2.imread``.

    Args:
        filepath: Full path of the file to load.
        filename: Bare file name, used for the extension check and messages.

    Returns:
        The image as a NumPy array in BGR channel order, or ``None`` (after
        printing a warning) when no image could be obtained.
    """
    if filename.lower().endswith(".pdf"):
        # Only the first page is used, so render just that page instead of
        # rasterising the whole document.
        pages = convert_from_path(
            filepath,
            poppler_path=POPPLER_PATH,
            first_page=1,
            last_page=1,
        )
        if not pages:
            print(f"警告: 无法从 {filename} 提取图像。")
            return None
        return cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)

    img = cv2.imread(filepath)
    if img is None:
        print(f"警告: 无法读取图像文件 {filename}。")
    return img


def _build_text_boxes(ocr_df):
    """Turn the Tesseract ``image_to_data`` DataFrame into text-box dicts.

    Rows with a confidence of 0 or below, a missing text value, or text that
    is empty after stripping whitespace are discarded.

    Args:
        ocr_df: DataFrame returned by ``pytesseract.image_to_data`` with
            ``output_type=Output.DATAFRAME``.

    Returns:
        A list of dicts with the keys ``text``, ``bbox`` (``x_min``,
        ``y_min``, ``x_max``, ``y_max``) and ``confidence`` (0-1 scale).
    """
    # Take an explicit copy after filtering so the column assignment below
    # operates on an independent frame rather than on a slice of the original.
    words = ocr_df[ocr_df["conf"] > 0].dropna(subset=["text"]).copy()
    words["text"] = words["text"].astype(str).str.strip()
    words = words[words["text"] != ""]

    text_boxes = []
    for row in words.itertuples(index=False):
        # Convert to built-in int/float explicitly so that JSON serialisation
        # does not depend on pandas handing back native Python numbers.
        left, top = int(row.left), int(row.top)
        text_boxes.append({
            "text": row.text,
            "bbox": {
                "x_min": left,
                "y_min": top,
                "x_max": left + int(row.width),
                "y_max": top + int(row.height),
            },
            "confidence": float(row.conf) / 100.0,
        })
    return text_boxes


def process_invoices():
    """Run OCR over every file in RAW_DIR and store the results.

    For each file the function writes a PNG copy to IMG_DIR and a JSON file
    to OCR_DIR containing the image name, its width and height, and the
    recognised text boxes. Files whose PNG and JSON outputs both already
    exist are skipped. A failure on one file is reported and does not stop
    the remaining files from being processed.
    """
    print(f"开始处理 {RAW_DIR} 中的发票...")

    # Sort the listing so the processing order is deterministic.
    for filename in sorted(os.listdir(RAW_DIR)):
        filepath = os.path.join(RAW_DIR, filename)
        base_name = os.path.splitext(filename)[0]
        img_path = os.path.join(IMG_DIR, f"{base_name}.png")
        json_path = os.path.join(OCR_DIR, f"{base_name}.json")

        # Avoid re-processing files that already have both outputs.
        if os.path.exists(img_path) and os.path.exists(json_path):
            continue

        try:
            # 1. Load the image (from a PDF or an image file).
            img = _load_invoice_image(filepath, filename)
            if img is None:
                continue

            img_h, img_w = img.shape[:2]

            # 2. Save the image in the unified PNG format. cv2.imwrite
            #    reports failure through its return value, not an exception.
            if not cv2.imwrite(img_path, img):
                raise OSError(f"cv2.imwrite failed for {img_path}")

            # 3. Run Tesseract OCR (the Swedish language pack must be
            #    installed). pytesseract assumes RGB channel order, while
            #    OpenCV images are BGR, so convert first.
            ocr_df = pytesseract.image_to_data(
                cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                lang='swe',
                output_type=pytesseract.Output.DATAFRAME,
            )

            # 4./5. Clean the OCR rows and convert them to the JSON layout
            #       (with text_boxes) consumed by the other script.
            output_json = {
                "image_name": f"{base_name}.png",
                "width": img_w,
                "height": img_h,
                "text_boxes": _build_text_boxes(ocr_df),
            }

            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(output_json, f, indent=4, ensure_ascii=False)

            print(f"已处理: {filename}")

        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"处理 {filename} 时出错: {e}")
|
||||
|
||||
# Script entry point: process the invoices only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_invoices()
|
||||
Reference in New Issue
Block a user