Add payment line parser and fix OCR override from payment_line

- Add MachineCodeParser for Swedish invoice payment line parsing
- Fix OCR Reference extraction by normalizing account number spaces
- Add cross-validation tests for pipeline and field_extractor
- Update UI layout for compact upload and full-width results

Key changes:
- machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713")
- pipeline.py: Override OCR and Amount with values from payment_line; Bankgiro/Plusgiro (BG/PG) are compared only, not overridden
- field_extractor.py: Improve invoice number normalization
- app.py: Responsive UI layout changes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-01-21 21:47:02 +01:00
parent e9460e9f34
commit 4ea4bc96d4
33 changed files with 7530 additions and 562 deletions

View File

@@ -81,6 +81,9 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
- Bankgiro
- Plusgiro
- Amount
- supplier_org_number (Swedish organization number)
- customer_number
- payment_line (machine-readable payment code)
""",
version="1.0.0",
lifespan=lifespan,
@@ -161,17 +164,11 @@ def get_html_ui() -> str:
}
.main-content {
display: grid;
grid-template-columns: 1fr 1fr;
display: flex;
flex-direction: column;
gap: 20px;
}
@media (max-width: 900px) {
.main-content {
grid-template-columns: 1fr;
}
}
.card {
background: white;
border-radius: 16px;
@@ -188,14 +185,28 @@ def get_html_ui() -> str:
gap: 10px;
}
.upload-card {
display: flex;
align-items: center;
gap: 20px;
flex-wrap: wrap;
}
.upload-card h2 {
margin-bottom: 0;
white-space: nowrap;
}
.upload-area {
border: 3px dashed #ddd;
border-radius: 12px;
padding: 40px;
border: 2px dashed #ddd;
border-radius: 10px;
padding: 15px 25px;
text-align: center;
cursor: pointer;
transition: all 0.3s;
background: #fafafa;
flex: 1;
min-width: 200px;
}
.upload-area:hover, .upload-area.dragover {
@@ -209,17 +220,21 @@ def get_html_ui() -> str:
}
.upload-icon {
font-size: 48px;
margin-bottom: 15px;
font-size: 24px;
display: inline;
margin-right: 8px;
}
.upload-area p {
color: #666;
margin-bottom: 10px;
margin: 0;
display: inline;
}
.upload-area small {
color: #999;
display: block;
margin-top: 5px;
}
#file-input {
@@ -237,10 +252,10 @@ def get_html_ui() -> str:
.btn {
display: inline-block;
padding: 14px 28px;
padding: 12px 24px;
border: none;
border-radius: 10px;
font-size: 1rem;
font-size: 0.9rem;
font-weight: 600;
cursor: pointer;
transition: all 0.3s;
@@ -251,8 +266,6 @@ def get_html_ui() -> str:
.btn-primary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
width: 100%;
margin-top: 20px;
}
.btn-primary:hover:not(:disabled) {
@@ -267,22 +280,21 @@ def get_html_ui() -> str:
.loading {
display: none;
text-align: center;
padding: 20px;
align-items: center;
gap: 10px;
}
.loading.active {
display: block;
display: flex;
}
.spinner {
width: 40px;
height: 40px;
border: 4px solid #f3f3f3;
border-top: 4px solid #667eea;
width: 24px;
height: 24px;
border: 3px solid #f3f3f3;
border-top: 3px solid #667eea;
border-radius: 50%;
animation: spin 1s linear infinite;
margin: 0 auto 15px;
}
@keyframes spin {
@@ -331,7 +343,7 @@ def get_html_ui() -> str:
.fields-grid {
display: grid;
grid-template-columns: repeat(2, 1fr);
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 12px;
}
@@ -380,6 +392,84 @@ def get_html_ui() -> str:
margin-top: 15px;
}
.cross-validation {
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 10px;
padding: 15px;
margin-top: 20px;
}
.cross-validation h3 {
margin: 0 0 10px 0;
color: #334155;
font-size: 1rem;
}
.cv-status {
font-weight: 600;
padding: 8px 12px;
border-radius: 6px;
margin-bottom: 10px;
display: inline-block;
}
.cv-status.valid {
background: #dcfce7;
color: #166534;
}
.cv-status.invalid {
background: #fef3c7;
color: #92400e;
}
.cv-details {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 10px;
}
.cv-item {
background: white;
border: 1px solid #e2e8f0;
border-radius: 6px;
padding: 6px 12px;
font-size: 0.85rem;
display: flex;
align-items: center;
gap: 6px;
}
.cv-item.match {
border-color: #86efac;
background: #f0fdf4;
}
.cv-item.mismatch {
border-color: #fca5a5;
background: #fef2f2;
}
.cv-icon {
font-weight: bold;
}
.cv-item.match .cv-icon {
color: #16a34a;
}
.cv-item.mismatch .cv-icon {
color: #dc2626;
}
.cv-summary {
margin-top: 10px;
font-size: 0.8rem;
color: #64748b;
}
.error-message {
background: #fee2e2;
color: #991b1b;
@@ -405,33 +495,35 @@ def get_html_ui() -> str:
</header>
<div class="main-content">
<div class="card">
<h2>📤 Upload Document</h2>
<!-- Upload Section - Compact -->
<div class="card upload-card">
<h2>📤 Upload</h2>
<div class="upload-area" id="upload-area">
<div class="upload-icon">📁</div>
<p>Drag & drop your file here</p>
<p>or <strong>click to browse</strong></p>
<small>Supports PDF, PNG, JPG (max 50MB)</small>
<span class="upload-icon">📁</span>
<p>Drag & drop or <strong>click to browse</strong></p>
<small>PDF, PNG, JPG (max 50MB)</small>
<input type="file" id="file-input" accept=".pdf,.png,.jpg,.jpeg">
<div class="file-name" id="file-name" style="display: none;"></div>
</div>
<div class="file-name" id="file-name" style="display: none;"></div>
<button class="btn btn-primary" id="submit-btn" disabled>
🚀 Extract Fields
🚀 Extract
</button>
<div class="loading" id="loading">
<div class="spinner"></div>
<p>Processing document...</p>
<p>Processing...</p>
</div>
</div>
<!-- Results Section - Full Width -->
<div class="card">
<h2>📊 Extraction Results</h2>
<div id="placeholder" style="text-align: center; padding: 40px; color: #999;">
<div style="font-size: 64px; margin-bottom: 15px;">🔍</div>
<div id="placeholder" style="text-align: center; padding: 30px; color: #999;">
<div style="font-size: 48px; margin-bottom: 10px;">🔍</div>
<p>Upload a document to see extraction results</p>
</div>
@@ -445,6 +537,8 @@ def get_html_ui() -> str:
<div class="processing-time" id="processing-time"></div>
<div class="cross-validation" id="cross-validation" style="display: none;"></div>
<div class="error-message" id="error-message" style="display: none;"></div>
<div class="visualization" id="visualization" style="display: none;">
@@ -566,7 +660,11 @@ def get_html_ui() -> str:
const fieldsGrid = document.getElementById('fields-grid');
fieldsGrid.innerHTML = '';
const fieldOrder = ['InvoiceNumber', 'InvoiceDate', 'InvoiceDueDate', 'OCR', 'Amount', 'Bankgiro', 'Plusgiro'];
const fieldOrder = [
'InvoiceNumber', 'InvoiceDate', 'InvoiceDueDate', 'OCR',
'Amount', 'Bankgiro', 'Plusgiro',
'supplier_org_number', 'customer_number', 'payment_line'
];
fieldOrder.forEach(field => {
const value = result.fields[field];
@@ -588,6 +686,45 @@ def get_html_ui() -> str:
document.getElementById('processing-time').textContent =
`⏱️ Processed in ${result.processing_time_ms.toFixed(0)}ms`;
// Cross-validation results
const cvDiv = document.getElementById('cross-validation');
if (result.cross_validation) {
const cv = result.cross_validation;
let cvHtml = '<h3>🔍 Cross-Validation (Payment Line)</h3>';
cvHtml += `<div class="cv-status ${cv.is_valid ? 'valid' : 'invalid'}">`;
cvHtml += cv.is_valid ? '✅ Valid' : '⚠️ Mismatch Detected';
cvHtml += '</div>';
cvHtml += '<div class="cv-details">';
if (cv.payment_line_ocr) {
const matchIcon = cv.ocr_match === true ? '' : (cv.ocr_match === false ? '' : '');
cvHtml += `<div class="cv-item ${cv.ocr_match === true ? 'match' : (cv.ocr_match === false ? 'mismatch' : '')}">`;
cvHtml += `<span class="cv-icon">${matchIcon}</span> OCR: ${cv.payment_line_ocr}</div>`;
}
if (cv.payment_line_amount) {
const matchIcon = cv.amount_match === true ? '' : (cv.amount_match === false ? '' : '');
cvHtml += `<div class="cv-item ${cv.amount_match === true ? 'match' : (cv.amount_match === false ? 'mismatch' : '')}">`;
cvHtml += `<span class="cv-icon">${matchIcon}</span> Amount: ${cv.payment_line_amount}</div>`;
}
if (cv.payment_line_account) {
const accountType = cv.payment_line_account_type === 'bankgiro' ? 'Bankgiro' : 'Plusgiro';
const matchField = cv.payment_line_account_type === 'bankgiro' ? cv.bankgiro_match : cv.plusgiro_match;
const matchIcon = matchField === true ? '' : (matchField === false ? '' : '');
cvHtml += `<div class="cv-item ${matchField === true ? 'match' : (matchField === false ? 'mismatch' : '')}">`;
cvHtml += `<span class="cv-icon">${matchIcon}</span> ${accountType}: ${cv.payment_line_account}</div>`;
}
cvHtml += '</div>';
if (cv.details && cv.details.length > 0) {
cvHtml += '<div class="cv-summary">' + cv.details[cv.details.length - 1] + '</div>';
}
cvDiv.innerHTML = cvHtml;
cvDiv.style.display = 'block';
} else {
cvDiv.style.display = 'none';
}
// Visualization
if (result.visualization_url) {
const vizDiv = document.getElementById('visualization');
@@ -608,7 +745,19 @@ def get_html_ui() -> str:
}
function formatFieldName(name) {
return name.replace(/([A-Z])/g, ' $1').trim();
const nameMap = {
'InvoiceNumber': 'Invoice Number',
'InvoiceDate': 'Invoice Date',
'InvoiceDueDate': 'Due Date',
'OCR': 'OCR Reference',
'Amount': 'Amount',
'Bankgiro': 'Bankgiro',
'Plusgiro': 'Plusgiro',
'supplier_org_number': 'Supplier Org Number',
'customer_number': 'Customer Number',
'payment_line': 'Payment Line'
};
return nameMap[name] || name.replace(/([A-Z])/g, ' $1').replace(/_/g, ' ').trim();
}
</script>
</body>