Yaojia Wang
2025-10-26 20:41:11 +01:00
commit dafa86c588
11 changed files with 1171 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
{
"permissions": {
"allow": [
"Bash(mkdir:*)",
"Bash(python:*)",
"Bash(pip install:*)",
"Bash(dir:*)",
"Bash(tesseract:*)",
"Bash(nvidia-smi:*)",
"Bash(pip uninstall:*)",
"Bash(for img in ../../images/train/*.jpg)",
"Bash(do basename=\"$img##*/\")",
"Bash(labelname=\"$basename%.jpg.txt\")",
"Bash(if [ -f \"../../temp_visual_labels/$labelname\" ])",
"Bash(then cp \"../../temp_visual_labels/$labelname\" .)",
"Bash(fi)",
"Bash(done)",
"Bash(awk:*)",
"Bash(chcp 65001)",
"Bash(PYTHONIOENCODING=utf-8 python:*)",
"Bash(timeout 600 tail:*)",
"Bash(cat:*)",
"Bash(powershell -Command \"$response = Invoke-WebRequest -Uri ''http://127.0.0.1:8000/extract_invoice/'' -Method POST -Form @{file=Get-Item ''data\\processed_images\\4BC5E5B3-E561-4A73-BC9C-46D4F08F89C3.png''} -UseBasicParsing; $response.Content\")"
],
"deny": [],
"ask": []
}
}

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
/data
/.idea
/__pycache__/

137
config.py Normal file
View File

@@ -0,0 +1,137 @@
"""
Configuration file for system-dependent paths and settings.
This file contains paths that may vary between different systems.
Copy this file and modify the paths according to your local installation.
"""
import os
from pathlib import Path
# ============================================================================
# System Paths - Modify these according to your installation
# ============================================================================
# Poppler path (required for PDF to image conversion)
# Download from: https://github.com/oschwartz10612/poppler-windows/releases
# Example: r"C:\poppler-23.11.0\bin"
POPPLER_PATH = os.getenv("POPPLER_PATH", r"C:\Program Files\poppler-25.07.0\Library\bin")
# Tesseract path (optional - only needed if not in system PATH)
# Download from: https://github.com/UB-Mannheim/tesseract/wiki
# Example: r"C:\Program Files\Tesseract-OCR\tesseract.exe"
TESSERACT_CMD = os.getenv("TESSERACT_CMD", None)
# ============================================================================
# Project Paths - Generally don't need to modify these
# ============================================================================
# Project root directory
PROJECT_ROOT = Path(__file__).parent.absolute()
# Data directories
DATA_DIR = PROJECT_ROOT / "data"
RAW_INVOICES_DIR = DATA_DIR / "raw_invoices"
PROCESSED_IMAGES_DIR = DATA_DIR / "processed_images"
OCR_RESULTS_DIR = DATA_DIR / "ocr_results"
# YOLO dataset directories
YOLO_DATASET_DIR = DATA_DIR / "yolo_dataset"
YOLO_TEMP_IMAGES_DIR = YOLO_DATASET_DIR / "temp_all_images"
YOLO_TEMP_LABELS_DIR = YOLO_DATASET_DIR / "temp_all_labels"
YOLO_TRAIN_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "train"
YOLO_TRAIN_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "train"
YOLO_VAL_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "val"
YOLO_VAL_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "val"
# Model directories
MODELS_DIR = PROJECT_ROOT / "models"
DEFAULT_MODEL_PATH = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "best.pt"
# ============================================================================
# OCR Settings
# ============================================================================
# Tesseract language (Swedish + English)
TESSERACT_LANG = "swe" # Ensure Swedish language pack is installed
# OCR confidence threshold (0-100)
OCR_CONFIDENCE_THRESHOLD = 0
# ============================================================================
# Training Settings
# ============================================================================
# YOLO model size: n (nano), s (small), m (medium), l (large), x (xlarge)
YOLO_MODEL_SIZE = "n"
# Training epochs
TRAINING_EPOCHS = 100
# Batch size
BATCH_SIZE = 16
# Image size for training
IMAGE_SIZE = 640
# Validation split ratio (0.0 to 1.0)
VALIDATION_SPLIT = 0.2
# Random seed for reproducibility
RANDOM_SEED = 42
# ============================================================================
# API Settings (for main.py FastAPI server)
# ============================================================================
# API host
API_HOST = "127.0.0.1"
# API port
API_PORT = 8000
# ============================================================================
# Helper Functions
# ============================================================================
def apply_tesseract_path():
"""Apply Tesseract path if configured."""
if TESSERACT_CMD:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
def validate_paths():
"""Validate that required system paths exist."""
issues = []
# Check Poppler
if not os.path.exists(POPPLER_PATH):
issues.append(f"Poppler not found at: {POPPLER_PATH}")
issues.append(" Download from: https://github.com/oschwartz10612/poppler-windows/releases")
# Check Tesseract (if specified)
if TESSERACT_CMD and not os.path.exists(TESSERACT_CMD):
issues.append(f"Tesseract not found at: {TESSERACT_CMD}")
issues.append(" Download from: https://github.com/UB-Mannheim/tesseract/wiki")
if issues:
print("Configuration Issues Found:")
for issue in issues:
print(f" {issue}")
return False
return True
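# Example usage (a minimal sketch): call validate_paths() at startup and bail
# out early if a system dependency is missing.
#
#   import sys
#   from config import validate_paths
#   if not validate_paths():
#       sys.exit(1)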
# ============================================================================
# Example: Environment Variable Override
# ============================================================================
# You can set these in your environment instead of modifying this file:
#
# Windows:
# set POPPLER_PATH=C:\poppler\bin
# set TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
#
# Linux/Mac:
# export POPPLER_PATH=/usr/bin
# export TESSERACT_CMD=/usr/bin/tesseract
# ============================================================================

17
extraction_results.json Normal file
View File

@@ -0,0 +1,17 @@
[
{
"image": "data\\processed_images\\20250917.03.1.011299_c328f5a8-06f9-4093-85b5-e3a40f24bd30_page_1.jpg",
"fields": {},
"all_detections": []
},
{
"image": "data\\processed_images\\64A80892-8A9E-454C-9AEB-B740E8C3ACB3_page_1.jpg",
"fields": {},
"all_detections": []
},
{
"image": "data\\processed_images\\9fb6129f-671d-4aa1-9bad-096e84e6ded3_page_1.jpg",
"fields": {},
"all_detections": []
}
]

45
requirements.txt Normal file
View File

@@ -0,0 +1,45 @@
# Core dependencies
ultralytics>=8.0.0 # YOLOv8
pytesseract>=0.3.10 # Tesseract OCR Python wrapper
# Image processing
pdf2image>=1.16.0 # PDF to image conversion
Pillow>=10.0.0 # Image manipulation
opencv-python>=4.8.0 # Computer vision
# Data handling
numpy>=1.24.0
pandas>=2.0.0
scikit-learn>=1.3.0 # For DBSCAN clustering in 02_create_labels.py
# API dependencies (for main.py)
fastapi>=0.104.0 # FastAPI web framework
uvicorn>=0.24.0 # ASGI server
python-multipart>=0.0.6 # For file upload support
# System utilities
# IMPORTANT: Requires system-level installation of:
#
# 1. Tesseract OCR:
# - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki
# After installation, add to PATH or set: pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# - Linux: sudo apt-get install tesseract-ocr tesseract-ocr-swe tesseract-ocr-eng
# - macOS: brew install tesseract tesseract-lang
#
# 2. Poppler (for pdf2image):
# - Windows: Download from https://github.com/oschwartz10612/poppler-windows/releases
# - Linux: sudo apt-get install poppler-utils
# - macOS: brew install poppler
#
# 3. Swedish language data for Tesseract:
# After installing Tesseract, you may need to download Swedish language files (swe.traineddata)
# from https://github.com/tesseract-ocr/tessdata
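#
# To verify that Tesseract can see the Swedish pack, run:
#   tesseract --list-langs
# and check that "swe" appears in the output.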
# Optional: GPU support
# torch>=2.0.0 # PyTorch with CUDA support
# torchvision>=0.15.0
# Development tools (optional)
# jupyter>=1.0.0
# matplotlib>=3.7.0
# seaborn>=0.12.0

106
scripts/01_process_invoices.py Normal file
View File

@@ -0,0 +1,106 @@
# --- scripts/01_process_invoices.py ---
import os
import sys
import json
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
import cv2
import numpy as np
# Add parent directory to path to import config
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import POPPLER_PATH, apply_tesseract_path
# Apply Tesseract path from config
apply_tesseract_path()
# Project path setup
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
RAW_DIR = os.path.join(BASE_DIR, "data", "raw_invoices")
IMG_DIR = os.path.join(BASE_DIR, "data", "processed_images")
OCR_DIR = os.path.join(BASE_DIR, "data", "ocr_results")
# Create the output directories
os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(OCR_DIR, exist_ok=True)
def process_invoices():
print(f"开始处理 {RAW_DIR} 中的发票...")
for filename in os.listdir(RAW_DIR):
filepath = os.path.join(RAW_DIR, filename)
base_name = os.path.splitext(filename)[0]
img_path = os.path.join(IMG_DIR, f"{base_name}.png")
json_path = os.path.join(OCR_DIR, f"{base_name}.json")
# Skip files that were already processed
if os.path.exists(img_path) and os.path.exists(json_path):
continue
try:
# 1. Load the image (PDF or image file)
if filename.lower().endswith(".pdf"):
images = convert_from_path(filepath, poppler_path=POPPLER_PATH)
if not images:
print(f"警告: 无法从 {filename} 提取图像。")
continue
img_pil = images[0]  # take the first page
img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
else:
img = cv2.imread(filepath)
if img is None:
print(f"警告: 无法读取图像文件 {filename}")
continue
img_h, img_w = img.shape[:2]
# 2. Save the image in a uniform format
cv2.imwrite(img_path, img)
# 3. Run Tesseract OCR
ocr_data_df = pytesseract.image_to_data(
img,
lang='swe',  # make sure the Swedish language pack is installed
output_type=pytesseract.Output.DATAFRAME
)
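# The DATAFRAME output exposes Tesseract's TSV columns; the ones used
# below are: left, top, width, height, conf, text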
# 4. Clean up the OCR results
ocr_data_df = ocr_data_df[ocr_data_df.conf > 0]
ocr_data_df.dropna(subset=['text'], inplace=True)
ocr_data_df['text'] = ocr_data_df['text'].astype(str).str.strip()
ocr_data_df = ocr_data_df[ocr_data_df['text'] != ""]
# 5. Convert to the JSON format used by the downstream scripts (with text_boxes)
text_boxes = []
for i, row in ocr_data_df.iterrows():
text_boxes.append({
"text": row["text"],
"bbox": {
"x_min": row["left"],
"y_min": row["top"],
"x_max": row["left"] + row["width"],
"y_max": row["top"] + row["height"]
},
"confidence": row["conf"] / 100.0
})
output_json = {
"image_name": f"{base_name}.png",
"width": img_w,
"height": img_h,
"text_boxes": text_boxes
}
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(output_json, f, indent=4, ensure_ascii=False)
print(f"已处理: {filename}")
except Exception as e:
print(f"处理 {filename} 时出错: {e}")
if __name__ == "__main__":
process_invoices()

223
scripts/02_create_labels.py Normal file
View File

@@ -0,0 +1,223 @@
# --- scripts/02_create_labels.py ---
#
# **IMPORTANT**: this script is for training the "stage one" region detector.
# It emits only 1 class (class_id = 0): the *entire* "payment_slip" region.
# It does not use your advanced classify_text logic; instead it relies on
# heuristics (keywords, lines) to find the large region.
#
import os
import pandas as pd
import numpy as np
import cv2
import re
import shutil
import json
from sklearn.cluster import DBSCAN
# --- Anchor dictionary (technique 2) ---
# These words are used to *locate* the payment-slip region, not to extract fields
KEYWORDS = [
"Bankgirot", "PlusGirot", "OCR-nummer", "Att betala", "Mottagare",
"Betalningsmottagare", "Tillhanda senast", "Förfallodag", "Belopp",
"BG-nr", "PG-nr", "Meddelande", "OCR-kod", "Inbetalningskort", "Betalningsavi"
]
KEYWORDS_REGEX = '|'.join(KEYWORDS)
REGEX_PATTERNS = {
'bg_pg': r'(\b\d{2,4}[- ]\d{4}\b)|(\b\d{2,7}[- ]\d\b)', # Bankgiro/PlusGiro
'long_num': r'\b\d{10,}\b', # likely an OCR number
'machine_code': r'#[0-9\s>#]+#' # machine-readable code line
}
FULL_REGEX = f"({KEYWORDS_REGEX})|{REGEX_PATTERNS['bg_pg']}|{REGEX_PATTERNS['long_num']}|{REGEX_PATTERNS['machine_code']}"
# --- Path setup ---
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
OCR_DIR = os.path.join(BASE_DIR, "data", "ocr_results")
IMG_DIR = os.path.join(BASE_DIR, "data", "processed_images")
# Use temp directories for initial label generation (before train/val split)
LABEL_DIR = os.path.join(BASE_DIR, "data", "yolo_dataset", "temp_all_labels")
IMAGE_DIR = os.path.join(BASE_DIR, "data", "yolo_dataset", "temp_all_images")
os.makedirs(LABEL_DIR, exist_ok=True)
os.makedirs(IMAGE_DIR, exist_ok=True)
def find_text_anchors(text_boxes, img_height):
"""技术一 (位置) + 技术二 (词典)"""
anchors = []
if not text_boxes:
return []
# Technique 1: only search the lower part of the page (from 40% down)
page_midpoint = img_height * 0.4
for box in text_boxes:
# Check the position
if box["bbox"]["y_min"] > page_midpoint:
# Check the text content
if re.search(FULL_REGEX, box["text"], re.IGNORECASE):
bbox = box["bbox"]
anchors.append(pd.Series({
'left': bbox["x_min"],
'top': bbox["y_min"],
'width': bbox["x_max"] - bbox["x_min"],
'height': bbox["y_max"] - bbox["y_min"]
}))
return anchors
def find_visual_anchors(image, img_height, img_width):
"""技术三 (视觉锚点 - 找线)"""
anchors = []
try:
# 1. Only look at the lower half of the page
crop_y_start = img_height // 2
crop = image[crop_y_start:, :]
gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)[1]
# 2. Find long horizontal lines
min_line_length = img_width // 4
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_line_length, 1))
detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
contours, _ = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for c in contours:
x, y, w, h = cv2.boundingRect(c)
if w > min_line_length: # make sure the line is long enough
original_y = y + crop_y_start # convert back to full-image coordinates
anchors.append(pd.Series({
'left': x, 'top': original_y, 'width': w, 'height': h
}))
except Exception as e:
print(f"查找视觉锚点时出错: {e}")
return anchors
def cluster_anchors(all_anchors, img_height):
"""技术四 (聚类)"""
if len(all_anchors) < 2:
return all_anchors # too few anchors to cluster
# 1. Collect the center points
points = []
for anchor in all_anchors:
x_center = anchor.left + anchor.width / 2
y_center = anchor.top + anchor.height / 2
points.append((x_center, y_center))
points = np.array(points)
# 2. Run DBSCAN (eps at 20% of the image height groups anchors that lie close together)
eps_dist = img_height * 0.2
clustering = DBSCAN(eps=eps_dist, min_samples=2).fit(points)
labels = clustering.labels_
if len(labels) == 0 or np.all(labels == -1):
return all_anchors # clustering failed
# 3. Find the largest cluster
unique_labels, counts = np.unique(labels[labels != -1], return_counts=True)
if len(counts) == 0:
return all_anchors # noise only
largest_cluster_label = unique_labels[np.argmax(counts)]
# 4. Keep only the anchors that belong to the largest cluster
main_cluster_anchors = [
anchor for i, anchor in enumerate(all_anchors)
if labels[i] == largest_cluster_label
]
return main_cluster_anchors
def create_labels():
print("开始生成弱标签 (用于区域检测器)...")
processed_count = 0
skipped_count = 0
for ocr_filename in os.listdir(OCR_DIR):
if not ocr_filename.endswith(".json"):
continue
base_name = os.path.splitext(ocr_filename)[0]
json_path = os.path.join(OCR_DIR, ocr_filename)
img_path = os.path.join(IMG_DIR, f"{base_name}.png")
if not os.path.exists(img_path):
continue
try:
# 1. Load the data
image = cv2.imread(img_path)
img_h, img_w = image.shape[:2]
with open(json_path, 'r', encoding='utf-8') as f:
ocr_data = json.load(f)
text_boxes = ocr_data.get("text_boxes", [])
# 2. Find all anchors
text_anchors = find_text_anchors(text_boxes, img_h)
visual_anchors = find_visual_anchors(image, img_h, img_w)
all_anchors = text_anchors + visual_anchors
if not all_anchors:
print(f"SKIPPING: {base_name} (未找到任何锚点)")
skipped_count += 1
continue
# 3. Cluster the anchors
final_anchors = cluster_anchors(all_anchors, img_h)
if not final_anchors:
print(f"SKIPPING: {base_name} (未找到有效聚类)")
skipped_count += 1
continue
# 4. Aggregate the coordinates
min_x = min(a.left for a in final_anchors)
min_y = min(a.top for a in final_anchors)
max_x = max(a.left + a.width for a in final_anchors)
max_y = max(a.top + a.height for a in final_anchors)
# 5. Add padding
padding = 10
min_x = max(0, min_x - padding)
min_y = max(0, min_y - padding)
max_x = min(img_w, max_x + padding)
max_y = min(img_h, max_y + padding)
# 6. Convert to YOLO format (class x_center y_center width height, all normalized)
box_w = max_x - min_x
box_h = max_y - min_y
x_center = (min_x + box_w / 2) / img_w
y_center = (min_y + box_h / 2) / img_h
norm_w = box_w / img_w
norm_h = box_h / img_h
# ** Key point: the class ID is always 0 **
class_id = 0 # the only class: 'payment_slip'
yolo_label = f"{class_id} {x_center} {y_center} {norm_w} {norm_h}\n"
# 7. Save the label and the image
label_path = os.path.join(LABEL_DIR, f"{base_name}.txt")
with open(label_path, 'w', encoding='utf-8') as f:
f.write(yolo_label)
shutil.copy(
img_path,
os.path.join(IMAGE_DIR, f"{base_name}.png")
)
processed_count += 1
if processed_count % 20 == 0:
print(f"已生成 {processed_count} 个标签...")
except Exception as e:
print(f"处理 {base_name} 时出错: {e}")
skipped_count += 1
print("--- 弱标签生成完成 ---")
print(f"成功生成: {processed_count}")
print(f"跳过: {skipped_count}")
if __name__ == "__main__":
create_labels()

123
scripts/03_split_dataset.py Normal file
View File

@@ -0,0 +1,123 @@
"""
Dataset Split Script - Step 3
Splits images and labels into training and validation sets
"""
import shutil
import random
from pathlib import Path
import os
# Paths
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
YOLO_DATASET_DIR = Path(BASE_DIR) / "data" / "yolo_dataset"
TEMP_IMAGES_DIR = YOLO_DATASET_DIR / "temp_all_images"
TEMP_LABELS_DIR = YOLO_DATASET_DIR / "temp_all_labels"
TRAIN_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "train"
VAL_IMAGES_DIR = YOLO_DATASET_DIR / "images" / "val"
TRAIN_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "train"
VAL_LABELS_DIR = YOLO_DATASET_DIR / "labels" / "val"
# Configuration
VALIDATION_SPLIT = 0.2 # 20% for validation
RANDOM_SEED = 42
def split_dataset(val_split=VALIDATION_SPLIT, seed=RANDOM_SEED):
"""
Split dataset into training and validation sets
Args:
val_split: Fraction of data to use for validation (0.0 to 1.0)
seed: Random seed for reproducibility
"""
print("="*60)
print("Splitting Dataset into Train/Val Sets")
print("="*60)
print(f"Validation split: {val_split*100:.1f}%")
print(f"Random seed: {seed}\n")
# Check if temp directories exist
if not TEMP_IMAGES_DIR.exists() or not TEMP_LABELS_DIR.exists():
print(f"Error: temporary directories not found under {YOLO_DATASET_DIR}")
print("Please run 02_create_labels.py first")
return
# Get all image files
image_files = list(TEMP_IMAGES_DIR.glob("*.jpg")) + list(TEMP_IMAGES_DIR.glob("*.png"))
if not image_files:
print(f"No image files found in {TEMP_IMAGES_DIR}")
return
# Filter images that have corresponding labels
valid_pairs = []
for image_file in image_files:
label_file = TEMP_LABELS_DIR / (image_file.stem + ".txt")
if label_file.exists():
valid_pairs.append({
"image": image_file,
"label": label_file
})
if not valid_pairs:
print("No valid image-label pairs found")
return
print(f"Found {len(valid_pairs)} image-label pair(s)")
# Shuffle and split
random.seed(seed)
random.shuffle(valid_pairs)
split_index = int(len(valid_pairs) * (1 - val_split))
train_pairs = valid_pairs[:split_index]
val_pairs = valid_pairs[split_index:]
print(f"\nSplit results:")
print(f" Training set: {len(train_pairs)} samples")
print(f" Validation set: {len(val_pairs)} samples")
print()
# Clear existing train/val directories
for directory in [TRAIN_IMAGES_DIR, VAL_IMAGES_DIR, TRAIN_LABELS_DIR, VAL_LABELS_DIR]:
if directory.exists():
shutil.rmtree(directory)
directory.mkdir(parents=True, exist_ok=True)
# Copy training files
print("Copying training files...")
for pair in train_pairs:
shutil.copy(pair["image"], TRAIN_IMAGES_DIR / pair["image"].name)
shutil.copy(pair["label"], TRAIN_LABELS_DIR / pair["label"].name)
print(f" Copied {len(train_pairs)} image-label pairs to train/")
# Copy validation files
print("Copying validation files...")
for pair in val_pairs:
shutil.copy(pair["image"], VAL_IMAGES_DIR / pair["image"].name)
shutil.copy(pair["label"], VAL_LABELS_DIR / pair["label"].name)
print(f" Copied {len(val_pairs)} image-label pairs to val/")
print("\n" + "="*60)
print("Dataset split complete!")
print(f"\nDataset structure:")
print(f" {TRAIN_IMAGES_DIR}")
print(f" {TRAIN_LABELS_DIR}")
print(f" {VAL_IMAGES_DIR}")
print(f" {VAL_LABELS_DIR}")
print(f"\nNext step: Run 04_train_yolo.py to train the model")
print("="*60)
def main():
"""Main function"""
split_dataset()
if __name__ == "__main__":
main()

230
scripts/04_train_yolo.py Normal file
View File

@@ -0,0 +1,230 @@
"""
YOLO Training Script - Step 4
Trains YOLOv8 model on the prepared invoice dataset
"""
from pathlib import Path
from ultralytics import YOLO
import torch
import os
# Paths
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATASET_YAML = Path(BASE_DIR) / "data" / "yolo_dataset" / "dataset.yaml"
MODELS_DIR = Path(BASE_DIR) / "models"
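# dataset.yaml is expected to look roughly like this (a sketch -- adjust the
# paths to your layout; the single class comes from 02_create_labels.py):
#
#   path: data/yolo_dataset
#   train: images/train
#   val: images/val
#   names:
#     0: payment_slip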
# Training configuration
MODEL_SIZE = "n" # Options: n (nano), s (small), m (medium), l (large), x (xlarge)
EPOCHS = 100
BATCH_SIZE = 16
IMAGE_SIZE = 640
DEVICE = 0 if torch.cuda.is_available() else "cpu" # use the first GPU when CUDA is available
# Create models directory
MODELS_DIR.mkdir(exist_ok=True)
def train_model(
model_size=MODEL_SIZE,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
img_size=IMAGE_SIZE,
device=DEVICE
):
"""
Train YOLOv8 model on invoice dataset
Args:
model_size: Size of YOLO model (n, s, m, l, x)
epochs: Number of training epochs
batch_size: Batch size for training
img_size: Input image size
device: Device to use for training (cuda or cpu)
"""
print("="*60)
print("YOLOv8 Invoice Detection Training")
print("="*60)
# Check if dataset.yaml exists
if not DATASET_YAML.exists():
print(f"Error: {DATASET_YAML} not found")
print("Please ensure the dataset.yaml file exists")
return
# Print configuration
print(f"\nConfiguration:")
print(f" Model: YOLOv8{model_size}")
print(f" Epochs: {epochs}")
print(f" Batch size: {batch_size}")
print(f" Image size: {img_size}")
print(f" Device: {device}")
print(f" Dataset config: {DATASET_YAML}")
print()
# Initialize model
print(f"Loading YOLOv8{model_size} model...")
model = YOLO(f"yolov8{model_size}.pt") # Load pretrained model
# Print device info
if device == 0:
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
print("Using CPU (training will be slower)")
print("\nStarting training...")
print("-" * 60)
# Train the model
results = model.train(
data=str(DATASET_YAML),
epochs=epochs,
imgsz=img_size,
batch=batch_size,
device=device,
project=str(MODELS_DIR),
name="payment_slip_detector_v1",
exist_ok=True,
patience=20, # Early stopping patience
save=True,
save_period=10, # Save checkpoint every 10 epochs
verbose=True,
plots=True # Generate training plots
)
print("\n" + "="*60)
print("Training complete!")
print("="*60)
# Print results
best_model_path = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "best.pt"
last_model_path = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "last.pt"
print(f"\nTrained models saved to:")
print(f" Best model: {best_model_path}")
print(f" Last model: {last_model_path}")
print(f"\nTraining plots saved to:")
print(f" {MODELS_DIR / 'payment_slip_detector_v1'}")
print("\n" + "="*60)
def validate_model(model_path=None):
"""
Validate trained model on validation set
Args:
model_path: Path to model weights (default: best.pt from last training)
"""
if model_path is None:
model_path = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "best.pt"
if not Path(model_path).exists():
print(f"Error: Model not found at {model_path}")
print("Please train a model first")
return
print("="*60)
print("Validating Model")
print("="*60)
print(f"Model: {model_path}\n")
# Load model
model = YOLO(str(model_path))
# Validate
results = model.val(data=str(DATASET_YAML))
print("\n" + "="*60)
print("Validation complete!")
print("="*60)
def predict_sample(model_path=None, image_path=None, conf_threshold=0.25):
"""
Run prediction on a sample image
Args:
model_path: Path to model weights
image_path: Path to image to predict on
conf_threshold: Confidence threshold for detections
"""
if model_path is None:
model_path = MODELS_DIR / "payment_slip_detector_v1" / "weights" / "best.pt"
if not Path(model_path).exists():
print(f"Error: Model not found at {model_path}")
return
if image_path is None:
# Try to get a sample from validation set
val_images_dir = Path("data/yolo_dataset/images/val")
sample_images = list(val_images_dir.glob("*.jpg")) + list(val_images_dir.glob("*.png"))
if sample_images:
image_path = sample_images[0]
else:
print("No sample images found")
return
print("="*60)
print("Running Prediction")
print("="*60)
print(f"Model: {model_path}")
print(f"Image: {image_path}")
print(f"Confidence threshold: {conf_threshold}\n")
# Load model
model = YOLO(str(model_path))
# Predict
results = model.predict(
source=str(image_path),
conf=conf_threshold,
save=True,
project=str(MODELS_DIR / "predictions"),
name="sample"
)
print(f"\nPrediction saved to: {MODELS_DIR / 'predictions' / 'sample'}")
print("="*60)
def main():
"""Main training function"""
import argparse
parser = argparse.ArgumentParser(description="Train YOLOv8 on invoice dataset")
parser.add_argument("--mode", type=str, default="train", choices=["train", "validate", "predict"],
help="Mode: train, validate, or predict")
parser.add_argument("--model-size", type=str, default=MODEL_SIZE,
choices=["n", "s", "m", "l", "x"],
help="YOLO model size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="Number of training epochs")
parser.add_argument("--batch-size", type=int, default=BATCH_SIZE,
help="Batch size")
parser.add_argument("--img-size", type=int, default=IMAGE_SIZE,
help="Image size")
parser.add_argument("--model-path", type=str, default=None,
help="Path to model weights (for validate/predict)")
parser.add_argument("--image-path", type=str, default=None,
help="Path to image (for predict)")
args = parser.parse_args()
if args.mode == "train":
train_model(
model_size=args.model_size,
epochs=args.epochs,
batch_size=args.batch_size,
img_size=args.img_size
)
elif args.mode == "validate":
validate_model(model_path=args.model_path)
elif args.mode == "predict":
predict_sample(model_path=args.model_path, image_path=args.image_path)
if __name__ == "__main__":
main()

237
scripts/main.py Normal file
View File

@@ -0,0 +1,237 @@
# --- main.py (upgraded) ---
from fastapi import FastAPI, UploadFile, File, HTTPException
from ultralytics import YOLO
import cv2
import numpy as np
import pytesseract
import re
import io
import os
from contextlib import asynccontextmanager
# --- Configuration ---
# TODO: make sure this path points to your best trained model
# This file lives in scripts/, so step up one level to the project root; the
# run name matches the output of 04_train_yolo.py
MODEL_PATH = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"models", "payment_slip_detector_v1", "weights", "best.pt"
)
# Registry that holds models loaded at FastAPI startup
ml_models = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
# Load the model at startup
print(MODEL_PATH)
if not os.path.exists(MODEL_PATH):
print(f"警告: 找不到模型 {MODEL_PATH}。API 将无法工作。")
ml_models["yolo"] = None
else:
# 加载您的 "payment_slip" 区域检测器
ml_models["yolo"] = YOLO(MODEL_PATH)
print("YOLOv8 区域检测模型加载成功。")
yield
# Release the models
ml_models.clear()
app = FastAPI(lifespan=lifespan)
# --- Luhn (Modulus 10) check ---
def luhn_validate(number_str: str, expected_check_digit: str) -> bool:
"""
Validate a digit string with the Modulus 10 (Luhn) algorithm.
(Weights run 2, 1, 2, 1, ... from right to left over the payload; the
separately supplied check digit itself would carry weight 1.)
"""
try:
digits = [int(d) for d in number_str]
# The check digit is passed in separately, so the rightmost payload digit
# must carry weight 2 for standard Luhn (starting at weight 1 fails on the
# sample OCR-rad below)
weights = [2, 1] * (len(digits) // 2 + 1)
weights = weights[:len(digits)] # trim the weight list to the payload length
sum_val = 0
# Accumulate from right to left
for d, w in zip(reversed(digits), weights):
product = d * w
if product >= 10:
sum_val += (product // 10) + (product % 10)
else:
sum_val += product
calculated_check_digit = (10 - (sum_val % 10)) % 10
return str(calculated_check_digit) == expected_check_digit
except Exception:
return False
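# Worked example using the OCR-rad sample below: for payload "460300" with
# check digit "7", the reversed digits 0,0,3,0,6,4 weighted 2,1,2,1,2,1 give
# 0+0+6+0+3+4 = 13, and (10 - 13 % 10) % 10 = 7, so
# luhn_validate("460300", "7") returns True.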
# --- Extraction logic ---
def parse_ocr_rad(ocr_text: str) -> dict:
"""
Plan A: try to parse the machine-readable code line (OCR-rad)
Example: # 400299582421 # 4603 00 7 > 48180020 #14#
"""
# Strip all whitespace to simplify matching
text_no_space = re.sub(r'\s+', '', ocr_text)
# A more robust regular expression:
# group 1: the OCR number (between #...#)
# groups 2+3: the amount digits (kronor + öre), concatenated below
# group 4: the check digit (one digit)
# group 5: the account number (between >...#)
rad_regex = r'#([\d>]+)#(\d{2,})(\d)(\d{1})>([\d]+)#'
match = re.search(rad_regex, text_no_space)
if not match:
return None # no machine-readable line found
try:
ocr_num = match.group(1).replace(">", "") # strip > (if present)
amount_base = match.group(2) + match.group(3) # "46030" + "0" = "460300"
check_digit = match.group(4) # "7"
account = match.group(5) # "48180020"
# Run the Luhn check
if luhn_validate(amount_base, check_digit):
# The check passed! This is high-confidence data
amount_kronor = amount_base[:-2]
amount_ore = amount_base[-2:]
return {
"source": "OCR-rad (High Confidence)",
"ocr_number": ocr_num,
"amount_due": f"{amount_kronor}.{amount_ore}",
"bankgiro_plusgiro": account,
"due_date": None # 机读码行通常不包含日期
}
else:
# The check failed!
print(f"Luhn check failed: base={amount_base}, expected={check_digit}")
return None
except Exception as e:
print(f"解析 OCR-rad 时出错: {e}")
return None
def parse_human_readable(ocr_text: str) -> dict:
"""
Plan B: fall back to the human-readable area
(This uses the simple regexes from the earlier version; you could also
swap in your more elaborate classify_text logic.)
"""
data = {"source": "Human-Readable (Fallback)"}
# Look for BG/PG (Bankgiro/PlusGiro)
bg_match = re.search(r'Bankgiro\D*(\d{2,4}[- ]\d{4})', ocr_text, re.IGNORECASE)
pg_match = re.search(r'PlusGiro\D*(\d{2,7}[- ]\d)', ocr_text, re.IGNORECASE)
if bg_match:
data["bankgiro_plusgiro"] = bg_match.group(1).replace(" ", "")
elif pg_match:
data["bankgiro_plusgiro"] = pg_match.group(1).replace(" ", "")
else:
# Fallback search
bg_pg_alt = re.search(r'(\b\d{2,4}[- ]\d{4}\b)|(\b\d{2,7}[- ]\d\b)', ocr_text)
if bg_pg_alt:
data["bankgiro_plusgiro"] = bg_pg_alt.group(0).replace(" ", "")
# Look for the OCR number
ocr_match = re.search(r'(OCR|Fakturanummer|Referens)\D*(\d[\d\s]{5,}\d)', ocr_text, re.IGNORECASE)
if ocr_match:
data["ocr_number"] = re.sub(r'\s', '', ocr_match.group(2))
# Look for the amount
amount_match = re.search(r'(Att betala|Belopp)\D*([\d\s,.]+)\s*(kr|SEK)?', ocr_text, re.IGNORECASE)
if amount_match:
amount_str = amount_match.group(2).strip().replace(" ", "").replace(",", ".")
if amount_str.count('.') > 1:
amount_str = amount_str.replace(".", "", amount_str.count('.') - 1)
data["amount_due"] = amount_str
# Look for the due date
date_match = re.search(r'(senast|Förfallodag)\D*(\d{4}[- ]\d{2}[- ]\d{2})', ocr_text, re.IGNORECASE)
if date_match:
data["due_date"] = date_match.group(2).replace(" ", "-")
return data
def extract_info_from_crop(crop_image: np.ndarray) -> dict:
"""
Main extraction function: runs Plan A, then Plan B
"""
try:
# 1. Preprocess and run OCR (grab all text)
gray = cv2.cvtColor(crop_image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# psm 6 assumes a single uniform text block, which suits the OCR-rad
ocr_text = pytesseract.image_to_string(thresh, lang='swe', config='--psm 6')
# 2. --- Plan A: try to parse the machine-readable line ---
rad_data = parse_ocr_rad(ocr_text)
if rad_data:
# Plan A succeeded!
return rad_data
# 3. --- Plan B: Plan A failed, fall back to the human-readable area ---
# Re-run OCR with psm 3 (automatic layout), which suits that area better
ocr_text_human = pytesseract.image_to_string(thresh, lang='swe', config='--psm 3')
human_data = parse_human_readable(ocr_text_human)
# Even if the fallback finds nothing, return the (possibly partial) dict
return human_data
except Exception as e:
return {"error": f"提取时出错: {e}"}
@app.post("/extract_invoice/")
async def extract_invoice_data(file: UploadFile = File(...)):
if ml_models.get("yolo") is None:
raise HTTPException(status_code=503, detail="Model not loaded. Check the model path.")
try:
contents = await file.read()
nparr = np.frombuffer(contents, np.uint8)
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
if img is None:
raise HTTPException(status_code=400, detail="无法解码图像文件。")
except Exception as e:
raise HTTPException(status_code=400, detail=f"读取文件时出错: {e}")
# 1. Run YOLO detection (stage one)
results = ml_models["yolo"](img, verbose=False)
all_extractions = []
if not results or not results[0].boxes:
return {"message": "未在图像中检测到支付凭证区域。"}
for res in results:
# Iterate over all detected slips (usually just one)
for box_coords in res.boxes.xyxy.cpu().numpy().astype(int):
xmin, ymin, xmax, ymax = box_coords
# 2. Crop the image
crop = img[ymin:ymax, xmin:xmax]
# 3. Run the "Plan A/B" extraction on the crop (stage two)
extracted_data = extract_info_from_crop(crop)
extracted_data["bounding_box"] = [xmin, ymin, xmax, ymax]
all_extractions.append(extracted_data)
if not all_extractions:
return {"message": "检测到支付凭证, 但未能提取任何信息。"}
return {"invoice_extractions": all_extractions}
if __name__ == "__main__":
import uvicorn
print(f"--- 启动 FastAPI 服务 ---")
print(f"加载模型: {MODEL_PATH}")
print(f"访问 http://127.0.0.1:8000/docs 查看 API 文档")
uvicorn.run(app, host="127.0.0.1", port=8000)

14
test_api.py Normal file
View File

@@ -0,0 +1,14 @@
import requests
import json
# Test the API with the problematic invoice
url = "http://127.0.0.1:8000/extract_invoice/"
file_path = r"data\processed_images\4BC5E5B3-E561-4A73-BC9C-46D4F08F89C3.png"
with open(file_path, 'rb') as f:
files = {'file': f}
response = requests.post(url, files=files)
print("Status Code:", response.status_code)
print("\nResponse JSON:")
print(json.dumps(response.json(), indent=2, ensure_ascii=False))
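# An equivalent request with curl (assuming the server from scripts/main.py is
# running locally):
#   curl -X POST -F "file=@data/processed_images/4BC5E5B3-E561-4A73-BC9C-46D4F08F89C3.png" http://127.0.0.1:8000/extract_invoice/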