This commit is contained in:
2025-08-13 23:30:22 +02:00
parent 87ba009bd7
commit 02290bb935
4 changed files with 13 additions and 11 deletions

View File

@@ -15,11 +15,14 @@ The document's primary language is '{language}'.
## Instructions: ## Instructions:
Carefully analyze the invoice image and extract the following fields according to these specific rules. Do not invent information. If a field is not found or is unclear, follow the specific instruction for that field. Carefully analyze the invoice image and extract the following fields according to these specific rules. Do not invent information. If a field is not found or is unclear, follow the specific instruction for that field.
- `date`: Extract in YYYY-MM-DD format. If unclear, leave as an empty string. - `invoice_date`: The invoice date. Extract in YYYY-MM-DD format. If unclear, leave as an empty string.
- `invoice_due_date`: The invoice due date. Extract in YYYY-MM-DD format. If unclear, leave as an empty string.
- `invoice_number`: If not found or unclear, leave as an empty string. - `invoice_number`: If not found or unclear, leave as an empty string.
- `ocr_number`: The OCR number from the invoice. If not found or unclear, leave as an empty string.
- `supplier_number`: This is the organisation number. If not found or unclear, leave as an empty string. - `supplier_number`: This is the organisation number. If not found or unclear, leave as an empty string.
- `biller_name`: This is the sender's name. If not found or unclear, leave as an empty string. - `biller_name`: This is the sender's name. If not found or unclear, leave as an empty string.
- `amount`: Extract the final total amount and format it to a decimal number. If not present, leave as null. - `amount`: Extract the final total amount and format it to a decimal number. If not present, leave as null.
- `tax_exclusive_amount`: Extract the amount excluding taxes and format it to a decimal number. If not present, leave as null.
- `customer_name`: This is the receiver's name. Ensure it is a name and clear any special characters. If not found or unclear, leave as an empty string. - `customer_name`: This is the receiver's name. Ensure it is a name and clear any special characters. If not found or unclear, leave as an empty string.
- `customer_address`: This is the receiver's full address. Put it in one line. If not found or unclear, leave as an empty string. - `customer_address`: This is the receiver's full address. Put it in one line. If not found or unclear, leave as an empty string.
- `customer_address_line`: This is only the street address line from the receiver's address. If not found or unclear, leave as an empty string. - `customer_address_line`: This is only the street address line from the receiver's address. If not found or unclear, leave as an empty string.
@@ -33,7 +36,7 @@ Carefully analyze the invoice image and extract the following fields according t
- `bank_giro`: If found, extract the bank giro number. It often follows patterns like 'ddd-dddd', 'dddd-dddd', or 'dddddddd #41#'. If not found or unclear, leave as an empty string. - `bank_giro`: If found, extract the bank giro number. It often follows patterns like 'ddd-dddd', 'dddd-dddd', or 'dddddddd #41#'. If not found or unclear, leave as an empty string.
- `plus_giro`: If found, extract the plus giro number. It often follows patterns like 'ddddddd-d #16#', 'ddddddd-d', or 'ddd dd dd-d'. If not found or unclear, leave as an empty string. - `plus_giro`: If found, extract the plus giro number. It often follows patterns like 'ddddddd-d #16#', 'ddddddd-d', or 'ddd dd dd-d'. If not found or unclear, leave as an empty string.
- `customer_ssn`: If found, extract the customer social security number (personnummer). It follows the pattern 'YYYYMMDD-XXXX' or 'YYMMDD-XXXX'. If not found or unclear, leave as an empty string. - `customer_ssn`: If found, extract the customer social security number (personnummer). It follows the pattern 'YYYYMMDD-XXXX' or 'YYMMDD-XXXX'. If not found or unclear, leave as an empty string.
- `line_items`: Extract all line items from the invoice. For each item, extract the `description`, `quantity`, `unit_price`, and `total_price`. If a value is not present, leave it as null. - `line_items`: Extract all line items from the invoice. For each item, extract the `description`, `quantity`, `unit_price`, and `total_price`. A list of all line items from the invoice. Make sure all of them are extracted. If a value is not present, leave it as null.
## Example: ## Example:
If the invoice shows a line item "Consulting Services | 2 hours | $100.00/hr | $200.00", the output for that line item should be: If the invoice shows a line item "Consulting Services | 2 hours | $100.00/hr | $200.00", the output for that line item should be:

View File

@@ -4,20 +4,16 @@ from typing import List
def extract_text_from_images(images: List[Image.Image]) -> str: def extract_text_from_images(images: List[Image.Image]) -> str:
""" print("--- [Core OCR] Extracting text...")
使用Tesseract OCR从一系列图片中提取并合并所有文本。
"""
print("--- [Core OCR] 正在从图片中提取文本用于向量化...")
full_text = [] full_text = []
for img in images: for img in images:
try: try:
# lang='chi_sim+eng' 表示同时识别简体中文和英文
text = pytesseract.image_to_string(img, lang='chi_sim+eng') text = pytesseract.image_to_string(img, lang='chi_sim+eng')
full_text.append(text) full_text.append(text)
except Exception as e: except Exception as e:
print(f"--- [Core OCR] 单页处理失败: {e}") print(f"--- [Core OCR] Processing image failed: {e}")
continue continue
combined_text = "\n\n--- Page Break ---\n\n".join(full_text) combined_text = "\n\n--- Page Break ---\n\n".join(full_text)
print("--- [Core OCR] 文本提取成功。") print("--- [Core OCR] Text extraction completed.")
return combined_text return combined_text

View File

@@ -32,11 +32,14 @@ class LineItem(BaseModel):
class InvoiceInfo(BaseModel): class InvoiceInfo(BaseModel):
"""Defines the detailed, structured information to be extracted from an invoice.""" """Defines the detailed, structured information to be extracted from an invoice."""
date: Optional[str] = Field("", description="Extract in YYYY-MM-DD format. If unclear, leave as an empty string.") invoice_date: Optional[str] = Field("", description="The invoice date. Extract in YYYY-MM-DD format. If unclear, leave as an empty string.")
invoice_due_date: Optional[str] = Field("", description="The invoice due date.Extract in YYYY-MM-DD format. If unclear, leave as an empty string.")
invoice_number: Optional[str] = Field("", description="If not found or unclear, leave as an empty string.") invoice_number: Optional[str] = Field("", description="If not found or unclear, leave as an empty string.")
ocr_number: Optional[str] = Field("", description="The OCR number from the invoice. If not found or unclear, leave as an empty string.")
supplier_number: Optional[str] = Field("", description="It's the organisation number. If not found or unclear, leave as an empty string.") supplier_number: Optional[str] = Field("", description="It's the organisation number. If not found or unclear, leave as an empty string.")
biller_name: Optional[str] = Field("", description="It's the sender's name. If not found or unclear, leave as an empty string.") biller_name: Optional[str] = Field("", description="It's the sender's name. If not found or unclear, leave as an empty string.")
amount: Optional[float] = Field(None, description="Extract and format to decimal. If not present, leave as null.") amount: Optional[float] = Field(None, description="Extract and format to decimal. If not present, leave as null.")
tax_exclusive_amount: Optional[float] = Field(None, description="Extract the the amount excluding taxes and format it to a decimal number. If not present, leave as null.")
customer_name: Optional[str] = Field("", description="It's the receiver's name. Clean any special chars from the name. If not found or unclear, leave as an empty string.") customer_name: Optional[str] = Field("", description="It's the receiver's name. Clean any special chars from the name. If not found or unclear, leave as an empty string.")
customer_address: Optional[str] = Field("", description="It's the receiver's address. Put it in one line. If not found or unclear, leave as an empty string.") customer_address: Optional[str] = Field("", description="It's the receiver's address. Put it in one line. If not found or unclear, leave as an empty string.")
customer_address_line: Optional[str] = Field("", description="It's the receiver's address line, not the whole address. If not found or unclear, leave as an empty string.") customer_address_line: Optional[str] = Field("", description="It's the receiver's address line, not the whole address. If not found or unclear, leave as an empty string.")
@@ -50,4 +53,4 @@ class InvoiceInfo(BaseModel):
bank_giro: Optional[str] = Field("", description="BankGiro number, e.g., '123-4567'. If not found, leave as an empty string.") bank_giro: Optional[str] = Field("", description="BankGiro number, e.g., '123-4567'. If not found, leave as an empty string.")
plus_giro: Optional[str] = Field("", description="PlusGiro number, e.g., '123456-7'. If not found, leave as an empty string.") plus_giro: Optional[str] = Field("", description="PlusGiro number, e.g., '123456-7'. If not found, leave as an empty string.")
customer_ssn: Optional[str] = Field("", description="Customer's social security number, e.g., 'YYYYMMDD-XXXX'. If not found, leave as an empty string.") customer_ssn: Optional[str] = Field("", description="Customer's social security number, e.g., 'YYYYMMDD-XXXX'. If not found, leave as an empty string.")
line_items: List[LineItem] = Field([], description="A list of all line items from the invoice.") line_items: List[LineItem] = Field([], description="A list of all line items from the invoice. Make sure all of them are extracted.")

Binary file not shown.