Features: - Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations - Flexible date matching: year-month match, nearby date tolerance - PDF text extraction with PyMuPDF - OCR support for scanned documents (PaddleOCR) - YOLO training and inference pipeline - 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
68 lines
1.6 KiB
Bash
68 lines
1.6 KiB
Bash
#!/bin/bash
# Training launcher script.
# Usage: bash scripts/run_train.sh
#
# Each setting can be overridden through an environment variable of the
# same name:
#   DATA_YAML   dataset configuration file (default: data/dataset/dataset.yaml)
#   MODEL       base model                 (default: yolov8s.pt)
#   EPOCHS      number of epochs           (default: 100)
#   BATCH_SIZE  batch size                 (default: 16)
#   IMG_SIZE    image size                 (default: 1280)
#   DEVICE      device passed to trainer   (default: 0); forced to "cpu"
#               when no usable GPU is detected via nvidia-smi
#
# Exit status: 1 when the virtual environment or the dataset configuration
# file is missing; otherwise a failing command aborts the script (set -e).

set -e

# Project root: the parent of the directory that contains this script.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Activate the virtual environment.
if [[ ! -f "venv/bin/activate" ]]; then
  echo "错误: 虚拟环境不存在,请先运行 setup_wsl.sh" >&2
  exit 1
fi
# shellcheck source=/dev/null
source venv/bin/activate

# Default parameters (environment variables take precedence).
DATA_YAML="${DATA_YAML:-data/dataset/dataset.yaml}"
MODEL="${MODEL:-yolov8s.pt}"
EPOCHS="${EPOCHS:-100}"
BATCH_SIZE="${BATCH_SIZE:-16}"
IMG_SIZE="${IMG_SIZE:-1280}"
DEVICE="${DEVICE:-0}"

# Make sure the dataset configuration exists.
if [[ ! -f "$DATA_YAML" ]]; then
  echo "错误: 数据集配置文件不存在: $DATA_YAML" >&2
  echo "请先运行自动标注: bash scripts/run_autolabel.sh" >&2
  exit 1
fi

# Probe the GPU before printing the configuration so that the device shown
# below is the one actually passed to the trainer. The query runs inside an
# `if` condition, so a failing nvidia-smi does not abort the script under
# `set -e`; it is handled the same way as a missing nvidia-smi.
gpu_ok=false
gpu_status=""
if command -v nvidia-smi &> /dev/null \
  && gpu_status="$(nvidia-smi --query-gpu=name,memory.used,memory.total --format=csv,noheader)"; then
  gpu_ok=true
else
  DEVICE="cpu"
fi

# Show the effective configuration.
echo "=========================================="
echo "训练配置"
echo "=========================================="
echo "数据集: $DATA_YAML"
echo "基础模型: $MODEL"
echo "Epochs: $EPOCHS"
echo "Batch Size: $BATCH_SIZE"
echo "图像尺寸: $IMG_SIZE"
echo "设备: $DEVICE"
echo "=========================================="
echo ""

# Report the GPU status, or warn about the CPU fallback.
if [[ "$gpu_ok" == true ]]; then
  echo "GPU 状态:"
  printf '%s\n' "$gpu_status"
  echo ""
else
  echo "警告: 未检测到 GPU,将使用 CPU 训练 (较慢)" >&2
fi

# Run training.
python -m src.cli.train \
  --data "$DATA_YAML" \
  --model "$MODEL" \
  --epochs "$EPOCHS" \
  --batch "$BATCH_SIZE" \
  --imgsz "$IMG_SIZE" \
  --device "$DEVICE"

echo ""
echo "训练完成! 模型保存在: runs/train/invoice_fields/weights/"