#!/bin/bash
#
# Auto-labeling runner script.
#
# Usage:
#   bash scripts/run_autolabel.sh
#
# Every setting below can be overridden through an environment variable of
# the same name; the value shown is the default used when it is unset/empty:
#   CSV_FILE        data/structured_data/invoices.csv
#   PDF_DIR         data/raw_pdfs
#   OUTPUT_DIR      data/dataset
#   REPORT_FILE     reports/autolabel_report.jsonl
#   DPI             300
#   MIN_CONFIDENCE  0.7
#
# Example:
#   DPI=150 MIN_CONFIDENCE=0.8 bash scripts/run_autolabel.sh

# Abort on the first failing command. `-u` is intentionally not enabled
# because the sourced venv activate script is outside this file's control.
set -e

# Project root: the parent of the directory that contains this script.
# All relative paths below are resolved against it.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Activate the virtual environment, or bail out if it has not been created.
if [ -f "venv/bin/activate" ]; then
  # shellcheck source=/dev/null
  source venv/bin/activate
else
  # Diagnostics belong on stderr so they are not mixed into captured stdout.
  echo "错误: 虚拟环境不存在,请先运行 setup_wsl.sh" >&2
  exit 1
fi

# Default parameters (each may be overridden from the environment).
CSV_FILE="${CSV_FILE:-data/structured_data/invoices.csv}"
PDF_DIR="${PDF_DIR:-data/raw_pdfs}"
OUTPUT_DIR="${OUTPUT_DIR:-data/dataset}"
REPORT_FILE="${REPORT_FILE:-reports/autolabel_report.jsonl}"
DPI="${DPI:-300}"
MIN_CONFIDENCE="${MIN_CONFIDENCE:-0.7}"

# Show the effective configuration before starting.
echo "=========================================="
echo "自动标注配置"
echo "=========================================="
echo "CSV 文件: $CSV_FILE"
echo "PDF 目录: $PDF_DIR"
echo "输出目录: $OUTPUT_DIR"
echo "报告文件: $REPORT_FILE"
echo "DPI: $DPI"
echo "最小置信度: $MIN_CONFIDENCE"
echo "=========================================="
echo ""

# Create the directories the run writes into: the report file's parent
# directory and the dataset output directory.
mkdir -p "$(dirname "$REPORT_FILE")"
mkdir -p "$OUTPUT_DIR"

# Run the auto-labeling CLI module with the resolved settings.
python -m src.cli.autolabel \
  --csv "$CSV_FILE" \
  --pdf-dir "$PDF_DIR" \
  --output "$OUTPUT_DIR" \
  --report "$REPORT_FILE" \
  --dpi "$DPI" \
  --min-confidence "$MIN_CONFIDENCE" \
  --verbose

echo ""
echo "完成! 数据集已生成到: $OUTPUT_DIR"