Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
56 lines
1.4 KiB
Bash
56 lines
1.4 KiB
Bash
#!/bin/bash
# Auto-labeling runner script.
#
# Usage: bash scripts/run_autolabel.sh
#
# Each setting can be overridden with an environment variable of the same
# name; the value after ":-" below is used when the variable is unset or empty:
#   CSV_FILE        CSV file passed to --csv
#   PDF_DIR         directory passed to --pdf-dir
#   OUTPUT_DIR      directory passed to --output (created if missing)
#   REPORT_FILE     file passed to --report (its parent dir is created)
#   DPI             value passed to --dpi
#   MIN_CONFIDENCE  value passed to --min-confidence
#
# The script changes into the project root first, so relative paths are
# resolved against that directory.

# Abort on the first failing command.
# NOTE(review): `set -u` is intentionally not enabled because the sourced
# venv activate script is outside this file and may read unset variables.
set -e

# Project root: the parent of the directory containing this script.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Activate the virtual environment; refuse to run without it.
if [[ -f "venv/bin/activate" ]]; then
  # shellcheck disable=SC1091  # activate script is generated, not in the repo
  source venv/bin/activate
else
  # Diagnostics belong on stderr so they are not mixed into normal output.
  echo "错误: 虚拟环境不存在,请先运行 setup_wsl.sh" >&2
  exit 1
fi

# Default parameters (each overridable from the environment).
CSV_FILE="${CSV_FILE:-data/structured_data/invoices.csv}"
PDF_DIR="${PDF_DIR:-data/raw_pdfs}"
OUTPUT_DIR="${OUTPUT_DIR:-data/dataset}"
REPORT_FILE="${REPORT_FILE:-reports/autolabel_report.jsonl}"
DPI="${DPI:-300}"
MIN_CONFIDENCE="${MIN_CONFIDENCE:-0.7}"

# Show the effective configuration.
echo "=========================================="
echo "自动标注配置"
echo "=========================================="
echo "CSV 文件: $CSV_FILE"
echo "PDF 目录: $PDF_DIR"
echo "输出目录: $OUTPUT_DIR"
echo "报告文件: $REPORT_FILE"
echo "DPI: $DPI"
echo "最小置信度: $MIN_CONFIDENCE"
echo "=========================================="
echo ""

# Create the directories the run writes into.
mkdir -p -- "$(dirname -- "$REPORT_FILE")"
mkdir -p -- "$OUTPUT_DIR"

# Run auto-labeling. The arguments are kept in an array so each value stays a
# single word and no fragile backslash line continuations are needed.
autolabel_args=(
  --csv "$CSV_FILE"
  --pdf-dir "$PDF_DIR"
  --output "$OUTPUT_DIR"
  --report "$REPORT_FILE"
  --dpi "$DPI"
  --min-confidence "$MIN_CONFIDENCE"
  --verbose
)
python -m src.cli.autolabel "${autolabel_args[@]}"

echo ""
echo "完成! 数据集已生成到: $OUTPUT_DIR"