Files
invoice-master-poc-v2/scripts/run_train.sh
Yaojia Wang 8938661850 Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 17:44:14 +01:00

68 lines
1.6 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Training launcher: validates the environment, then runs
# `python -m src.cli.train` from the project root.
#
# Usage: bash scripts/run_train.sh
#
# Optional environment overrides (defaults in parentheses):
#   DATA_YAML   dataset config file passed as --data   (data/dataset/dataset.yaml)
#   MODEL       base model passed as --model           (yolov8s.pt)
#   EPOCHS      value passed as --epochs               (100)
#   BATCH_SIZE  value passed as --batch                (16)
#   IMG_SIZE    value passed as --imgsz                (1280)
#   DEVICE      value passed as --device               (0)
#               Forced to "cpu" when no GPU can be queried through nvidia-smi.
#
# Exit status: 1 if the virtual environment or the dataset config is missing;
# otherwise the script stops with the status of the first failing command.
set -e

# Project root = parent of the directory that contains this script, so the
# relative paths below work no matter where the script is invoked from.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Activate the virtual environment.
if [[ -f "venv/bin/activate" ]]; then
  # The activate script only exists after setup, so linters cannot follow it.
  # shellcheck disable=SC1091
  source venv/bin/activate
else
  echo "错误: 虚拟环境不存在,请先运行 setup_wsl.sh" >&2
  exit 1
fi

# Default parameters (each can be overridden from the environment).
DATA_YAML="${DATA_YAML:-data/dataset/dataset.yaml}"
MODEL="${MODEL:-yolov8s.pt}"
EPOCHS="${EPOCHS:-100}"
BATCH_SIZE="${BATCH_SIZE:-16}"
IMG_SIZE="${IMG_SIZE:-1280}"
DEVICE="${DEVICE:-0}"

# Make sure the dataset config exists before doing anything expensive.
if [[ ! -f "$DATA_YAML" ]]; then
  echo "错误: 数据集配置文件不存在: $DATA_YAML" >&2
  echo "请先运行自动标注: bash scripts/run_autolabel.sh" >&2
  exit 1
fi

# Probe the GPU *before* printing the configuration so the summary shows the
# device that is really used. The query is part of an `||` list, so a
# nvidia-smi that is installed but fails does not abort the script under
# `set -e`; it is handled exactly like a missing GPU.
gpu_status=""
if command -v nvidia-smi &> /dev/null; then
  gpu_status="$(nvidia-smi --query-gpu=name,memory.used,memory.total --format=csv,noheader)" \
    || gpu_status=""
fi
if [[ -z "$gpu_status" ]]; then
  DEVICE="cpu"
fi

# Show the effective configuration.
echo "=========================================="
echo "训练配置"
echo "=========================================="
echo "数据集: $DATA_YAML"
echo "基础模型: $MODEL"
echo "Epochs: $EPOCHS"
echo "Batch Size: $BATCH_SIZE"
echo "图像尺寸: $IMG_SIZE"
echo "设备: $DEVICE"
echo "=========================================="
echo ""

# Report the GPU status, or warn that training falls back to the CPU.
if [[ -n "$gpu_status" ]]; then
  echo "GPU 状态:"
  printf '%s\n' "$gpu_status"
  echo ""
else
  echo "警告: 未检测到 GPU将使用 CPU 训练 (较慢)" >&2
fi

# Run the training. With `set -e`, a failing run stops the script here, so the
# completion message below is only printed after a successful exit.
python -m src.cli.train \
  --data "$DATA_YAML" \
  --model "$MODEL" \
  --epochs "$EPOCHS" \
  --batch "$BATCH_SIZE" \
  --imgsz "$IMG_SIZE" \
  --device "$DEVICE"

echo ""
echo "训练完成! 模型保存在: runs/train/invoice_fields/weights/"