Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
55
scripts/run_autolabel.sh
Normal file
55
scripts/run_autolabel.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/bash
# Auto-labeling launcher.
# Usage: bash scripts/run_autolabel.sh
#
# Each setting below may be overridden with an environment variable of the
# same name: CSV_FILE, PDF_DIR, OUTPUT_DIR, REPORT_FILE, DPI, MIN_CONFIDENCE.

set -e

# Project root: the parent of the directory holding this script. All relative
# paths used below are resolved from here.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Stop early when the virtual environment is missing; otherwise activate it.
if [[ ! -f "venv/bin/activate" ]]; then
  echo "错误: 虚拟环境不存在,请先运行 setup_wsl.sh"
  exit 1
fi
. venv/bin/activate

# Default parameters (only assigned when the variable is unset or empty).
: "${CSV_FILE:=data/structured_data/invoices.csv}"
: "${PDF_DIR:=data/raw_pdfs}"
: "${OUTPUT_DIR:=data/dataset}"
: "${REPORT_FILE:=reports/autolabel_report.jsonl}"
: "${DPI:=300}"
: "${MIN_CONFIDENCE:=0.7}"

# Show the effective configuration, followed by one blank line.
banner_rule="=========================================="
printf '%s\n' \
  "$banner_rule" \
  "自动标注配置" \
  "$banner_rule" \
  "CSV 文件: $CSV_FILE" \
  "PDF 目录: $PDF_DIR" \
  "输出目录: $OUTPUT_DIR" \
  "报告文件: $REPORT_FILE" \
  "DPI: $DPI" \
  "最小置信度: $MIN_CONFIDENCE" \
  "$banner_rule" \
  ""

# Create the directories the run writes into: the report's parent directory
# first, then the dataset output directory.
for needed_dir in "$(dirname "$REPORT_FILE")" "$OUTPUT_DIR"; do
  mkdir -p "$needed_dir"
done

# Run the auto-labeling CLI with the settings collected above.
autolabel_args=(
  --csv "$CSV_FILE"
  --pdf-dir "$PDF_DIR"
  --output "$OUTPUT_DIR"
  --report "$REPORT_FILE"
  --dpi "$DPI"
  --min-confidence "$MIN_CONFIDENCE"
  --verbose
)
python -m src.cli.autolabel "${autolabel_args[@]}"

printf '\n%s\n' "完成! 数据集已生成到: $OUTPUT_DIR"
|
||||
67
scripts/run_train.sh
Normal file
67
scripts/run_train.sh
Normal file
@@ -0,0 +1,67 @@
|
||||
#!/bin/bash
# Training launcher.
# Usage: bash scripts/run_train.sh
#
# Each setting below may be overridden with an environment variable of the
# same name: DATA_YAML, MODEL, EPOCHS, BATCH_SIZE, IMG_SIZE, DEVICE.
# DEVICE is forced to "cpu" when no working nvidia-smi is found.

set -e

# Project root: the parent of the directory holding this script.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Activate the virtual environment, or stop if it has not been created yet.
if [ -f "venv/bin/activate" ]; then
  source venv/bin/activate
else
  echo "错误: 虚拟环境不存在,请先运行 setup_wsl.sh"
  exit 1
fi

# Default parameters.
DATA_YAML="${DATA_YAML:-data/dataset/dataset.yaml}"
MODEL="${MODEL:-yolov8s.pt}"
EPOCHS="${EPOCHS:-100}"
BATCH_SIZE="${BATCH_SIZE:-16}"
IMG_SIZE="${IMG_SIZE:-1280}"
DEVICE="${DEVICE:-0}"

# The dataset config must exist before training can start.
if [ ! -f "$DATA_YAML" ]; then
  echo "错误: 数据集配置文件不存在: $DATA_YAML"
  echo "请先运行自动标注: bash scripts/run_autolabel.sh"
  exit 1
fi

# Probe the GPU *before* printing the configuration so the banner shows the
# device that is actually used. Running the query inside the `if` condition
# keeps a failing nvidia-smi (installed but unable to reach a driver) from
# aborting the script under `set -e`; that case falls back to CPU as well.
gpu_status=""
if command -v nvidia-smi &> /dev/null \
  && gpu_status="$(nvidia-smi --query-gpu=name,memory.used,memory.total --format=csv,noheader)"; then
  gpu_available=1
else
  gpu_available=0
  DEVICE="cpu"
fi

# Show the effective configuration.
echo "=========================================="
echo "训练配置"
echo "=========================================="
echo "数据集: $DATA_YAML"
echo "基础模型: $MODEL"
echo "Epochs: $EPOCHS"
echo "Batch Size: $BATCH_SIZE"
echo "图像尺寸: $IMG_SIZE"
echo "设备: $DEVICE"
echo "=========================================="
echo ""

# Report the outcome of the GPU probe.
if [ "$gpu_available" -eq 1 ]; then
  echo "GPU 状态:"
  printf '%s\n' "$gpu_status"
  echo ""
else
  echo "警告: 未检测到 GPU,将使用 CPU 训练 (较慢)"
fi

# Run the training CLI.
python -m src.cli.train \
  --data "$DATA_YAML" \
  --model "$MODEL" \
  --epochs "$EPOCHS" \
  --batch "$BATCH_SIZE" \
  --imgsz "$IMG_SIZE" \
  --device "$DEVICE"

echo ""
echo "训练完成! 模型保存在: runs/train/invoice_fields/weights/"
|
||||
80
scripts/setup_wsl.sh
Normal file
80
scripts/setup_wsl.sh
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/bin/bash
# WSL environment setup script.
# Usage: bash scripts/setup_wsl.sh
#
# Installs system packages with apt (needs sudo), creates ./venv in the
# project root, installs requirements.txt into it and verifies the key imports.

set -e

# Work from the project root (parent of this script's directory) so that
# venv/ and requirements.txt resolve no matter where the script is invoked
# from. The run_*.sh launchers look for venv/bin/activate under that root.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

echo "=========================================="
echo "Invoice Master POC v2 - WSL 安装脚本"
echo "=========================================="

# Refuse to run outside WSL (detected via "microsoft" in /proc/version).
if ! grep -qi microsoft /proc/version 2>/dev/null; then
  echo "警告: 未检测到 WSL 环境,请在 WSL 中运行此脚本"
  echo "提示: 在 Windows 终端中输入 'wsl' 进入 WSL"
  exit 1
fi

echo ""
echo "[1/5] 更新系统包..."
sudo apt update

echo ""
echo "[2/5] 安装系统依赖..."
# NOTE(review): libgl1-mesa-glx appears to be a transitional package that
# newer Ubuntu/Debian releases no longer ship - confirm against the target
# distro before relying on this list.
sudo apt install -y \
  python3.10 \
  python3.10-venv \
  python3-pip \
  libgl1-mesa-glx \
  libglib2.0-0 \
  libsm6 \
  libxrender1 \
  libxext6 \
  libgomp1

echo ""
echo "[3/5] 创建 Python 虚拟环境..."
# Test for the activation script rather than the bare directory, so a
# half-created venv is rebuilt instead of being skipped and then failing at
# `source` below.
if [ -f "venv/bin/activate" ]; then
  echo "虚拟环境已存在,跳过创建"
else
  # Use the interpreter installed in step 2, not whatever `python3` resolves to.
  python3.10 -m venv venv
fi

echo ""
echo "[4/5] 激活虚拟环境并安装依赖..."
source venv/bin/activate
pip install --upgrade pip

echo ""
echo "安装 Python 依赖包..."
pip install -r requirements.txt

echo ""
echo "[5/5] 验证安装..."
# Each import check aborts the script (set -e) if the package is unusable.
python3 -c "import fitz; print(f'PyMuPDF: {fitz.version}')"
python3 -c "from ultralytics import YOLO; print('Ultralytics: OK')"
python3 -c "from paddleocr import PaddleOCR; print('PaddleOCR: OK')"

echo ""
echo "=========================================="
echo "安装完成!"
echo "=========================================="
echo ""
echo "使用方法:"
echo " 1. 激活虚拟环境: source venv/bin/activate"
echo " 2. 运行自动标注: python -m src.cli.autolabel --help"
echo " 3. 训练模型: python -m src.cli.train --help"
echo " 4. 推理: python -m src.cli.infer --help"
echo ""

# GPU check. The query runs inside the `if` condition so that an nvidia-smi
# that is installed but failing does not turn a finished installation into a
# non-zero exit under `set -e`.
echo "检查 GPU 支持..."
if command -v nvidia-smi &> /dev/null \
  && gpu_info="$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader)"; then
  echo "检测到 NVIDIA GPU:"
  printf '%s\n' "$gpu_info"
  echo ""
  echo "提示: 运行以下命令启用 GPU 加速:"
  echo " pip install paddlepaddle-gpu"
else
  echo "未检测到 GPU,将使用 CPU 模式"
fi
|
||||
Reference in New Issue
Block a user