Files
Yaojia Wang 26d0c41203 feat: integrate invoice-master-poc-v2 inference API
Rewrite OcrService to match the actual inference API response format
(nested status/result structure with PascalCase/snake_case field names).
Register IOcrService in DI with typed HttpClient and Polly v8 resilience
(retry, timeout, circuit breaker via AddStandardResilienceHandler).

Key changes:
- Fix response model to match real API (InferenceApiResponse)
- Map correct field names (InvoiceNumber, InvoiceDueDate, OCR, Amount, etc.)
- Add extract_line_items=true for VAT summary extraction
- Copy stream before sending to avoid disposal conflicts with retries
- Add JsonException handling for malformed responses
- Remove sensitive data from error logs
- Add 35 unit tests covering field mapping, VAT parsing, error handling,
  decimal/date formats, and content type detection
2026-02-13 10:30:50 +01:00

807 lines
27 KiB
C#

using System.Net;
using System.Text;
using System.Text.Json;
using FiscalFlow.Core.Interfaces;
using FiscalFlow.Infrastructure.Services;
using FluentAssertions;
using Microsoft.Extensions.Logging;
using Moq;
using Xunit;
namespace FiscalFlow.UnitTests.Services;
public sealed class OcrServiceTests
{
/// <summary>Logger mock whose <c>Object</c> is passed to every <c>OcrService</c> built by the tests.</summary>
private readonly Mock<ILogger<OcrService>> _loggerMock;

/// <summary>Initializes the logger mock used by the tests in this class.</summary>
public OcrServiceTests() => _loggerMock = new();
/// <summary>
/// Sends a fully populated "success" payload (fields, per-field confidences and a VAT summary)
/// through <c>ExtractAsync</c> and checks every value exposed on the returned result.
/// </summary>
[Fact]
public async Task ExtractAsync_SuccessfulExtraction_ReturnsCompleteOcrResult()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "Processed document abc123",
        result = new
        {
            document_id = "abc123",
            success = true,
            document_type = "invoice",
            fields = new Dictionary<string, string?>
            {
                ["InvoiceNumber"] = "INV-001",
                ["InvoiceDate"] = "2024-01-15",
                ["InvoiceDueDate"] = "2024-02-15",
                ["OCR"] = "123456789",
                ["Bankgiro"] = "1234-5678",
                ["Plusgiro"] = null, // serialized as JSON null; expected to stay null on the result
                ["Amount"] = "12500.00",
                ["SupplierName"] = "Test AB",
                ["supplier_org_number"] = "556677-8899",
                ["customer_number"] = "C-001",
                ["payment_line"] = "# 123456789 # 12500 00 #",
                ["Currency"] = "SEK",
            },
            confidence = new Dictionary<string, decimal>
            {
                ["InvoiceNumber"] = 0.95m,
                ["InvoiceDate"] = 0.88m,
                ["Amount"] = 0.92m,
            },
            detections = new List<object>(),
            processing_time_ms = 1234.5,
            errors = new List<string>(),
            vat_summary = new
            {
                breakdowns = new[]
                {
                    new
                    {
                        rate = 25.0m,
                        vat_amount = "2500.00",
                        base_amount = "10000.00",
                        source = "regex",
                    },
                },
                total_excl_vat = "10000.00",
                total_vat = "2500.00",
                total_incl_vat = "12500.00",
                confidence = 0.85m,
            },
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeTrue();
    // (0.95 + 0.88 + 0.92) / 3 = 0.91666..., expected as 0.9167.
    result.Confidence.Should().Be(0.9167m);
    result.ProcessingTimeMs.Should().Be(1234.5);
    result.DocumentType.Should().Be("invoice");
    result.FieldConfidences.Should().ContainKey("InvoiceNumber").WhoseValue.Should().Be(0.95m);
    result.FieldConfidences.Should().ContainKey("InvoiceDate").WhoseValue.Should().Be(0.88m);
    result.FieldConfidences.Should().ContainKey("Amount").WhoseValue.Should().Be(0.92m);

    var data = result.Data;
    data.Should().NotBeNull();
    data!.InvoiceNumber.Should().Be("INV-001");
    data.InvoiceDate.Should().Be(new DateTime(2024, 1, 15));
    data.DueDate.Should().Be(new DateTime(2024, 2, 15));
    data.OcrNumber.Should().Be("123456789");
    data.Bankgiro.Should().Be("1234-5678");
    data.Plusgiro.Should().BeNull();
    data.AmountTotal.Should().Be(12500.00m);
    data.SupplierName.Should().Be("Test AB");
    data.SupplierOrgNumber.Should().Be("556677-8899");
    data.CustomerNumber.Should().Be("C-001");
    data.PaymentLine.Should().Be("# 123456789 # 12500 00 #");
    data.Currency.Should().Be("SEK");
    data.AmountVat.Should().Be(2500.00m);
    data.VatRate.Should().Be(25);
}
/// <summary>
/// Checks that each key of the API <c>fields</c> dictionary (PascalCase and snake_case alike)
/// ends up on the matching property of the extracted invoice data.
/// </summary>
[Fact]
public async Task ExtractAsync_FieldMapping_MapsAllFieldsCorrectly()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>
            {
                ["InvoiceNumber"] = "TEST-123",
                ["InvoiceDate"] = "2024-03-20",
                ["InvoiceDueDate"] = "2024-04-20",
                ["OCR"] = "987654321",
                ["Bankgiro"] = "9999-8888",
                ["Plusgiro"] = "123456-7",
                ["Amount"] = "5000.00",
                ["supplier_org_number"] = "111222-3333",
                ["customer_number"] = "CUST-999",
                ["payment_line"] = "Payment reference line",
            },
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 500.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    var data = result.Data;
    data.Should().NotBeNull();
    data!.InvoiceNumber.Should().Be("TEST-123");
    data.InvoiceDate.Should().Be(new DateTime(2024, 3, 20));
    data.DueDate.Should().Be(new DateTime(2024, 4, 20));
    data.OcrNumber.Should().Be("987654321");
    data.Bankgiro.Should().Be("9999-8888");
    data.Plusgiro.Should().Be("123456-7");
    data.AmountTotal.Should().Be(5000.00m);
    data.SupplierOrgNumber.Should().Be("111222-3333");
    data.CustomerNumber.Should().Be("CUST-999");
    data.PaymentLine.Should().Be("Payment reference line");
}
/// <summary>
/// With a VAT summary that carries <c>total_vat</c> and a single 25% breakdown, expects
/// <c>AmountVat</c> to equal the total (1250.50) and <c>VatRate</c> to be 25.
/// </summary>
[Fact]
public async Task ExtractAsync_VatSummaryParsing_PopulatesVatFieldsFromTotalVat()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
            vat_summary = new
            {
                total_vat = "1250.50",
                breakdowns = new[]
                {
                    new
                    {
                        rate = 25.0m,
                        vat_amount = "1250.50",
                        base_amount = "5000.00",
                        source = "table",
                    },
                },
            },
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    var data = result.Data;
    data.Should().NotBeNull();
    data!.AmountVat.Should().Be(1250.50m);
    data.VatRate.Should().Be(25);
}
/// <summary>
/// With a VAT summary that has no <c>total_vat</c>, expects <c>AmountVat</c> to be the sum of the
/// breakdown VAT amounts (1000.00 + 240.00 = 1240.00).
/// </summary>
[Fact]
public async Task ExtractAsync_VatSummaryWithoutTotalVat_SumsFromBreakdowns()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
            vat_summary = new
            {
                breakdowns = new[]
                {
                    new
                    {
                        rate = 25.0m,
                        vat_amount = "1000.00",
                        base_amount = "4000.00",
                        source = "table",
                    },
                    new
                    {
                        rate = 12.0m,
                        vat_amount = "240.00",
                        base_amount = "2000.00",
                        source = "table",
                    },
                },
            },
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    var data = result.Data;
    data.Should().NotBeNull();
    data!.AmountVat.Should().Be(1240.00m);
    // NOTE(review): 25 is both the first and the highest rate here, so this assertion
    // cannot tell which of the two the service picks — confirm against OcrService.
    data.VatRate.Should().Be(25);
}
/// <summary>
/// Expects the overall confidence to be the mean of the per-field scores rounded to four decimals.
/// </summary>
[Fact]
public async Task ExtractAsync_ConfidenceCalculation_AveragesFieldConfidencesAndRoundsToFourDecimals()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>
            {
                ["Field1"] = 0.123456789m,
                ["Field2"] = 0.987654321m,
                ["Field3"] = 0.555555555m,
            },
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    // Sum is 1.666666665; divided by 3 gives 0.555555555, i.e. 0.5556 at four decimals.
    result.Confidence.Should().Be(0.5556m);
}
/// <summary>
/// Expects an overall confidence of zero when the API returns an empty confidence dictionary.
/// </summary>
[Fact]
public async Task ExtractAsync_NoConfidenceScores_ReturnsZeroConfidence()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Confidence.Should().Be(0m);
}
/// <summary>
/// Expects an HTTP 500 answer to produce a failed result whose message names the status code.
/// </summary>
[Fact]
public async Task ExtractAsync_HttpError500_ReturnsFailureWithErrorMessage()
{
    // Arrange
    // The string literal selects the raw-body overload, so the body is sent verbatim.
    using var httpClient = CreateHttpClient(HttpStatusCode.InternalServerError, "Internal server error");
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeFalse();
    result.ErrorMessage.Should().Be("OCR API returned InternalServerError");
}
/// <summary>
/// Expects a failed result with a non-empty error message when the API answers 200 with an
/// empty JSON object.
/// </summary>
[Fact]
public async Task ExtractAsync_NullResponse_ReturnsFailure()
{
    // Arrange
    // The body is "{}" (an object with neither status nor result), not the JSON literal null.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, "{}");
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeFalse();
    result.ErrorMessage.Should().NotBeNullOrEmpty();
}
/// <summary>
/// Expects a failed result carrying the API <c>message</c> when <c>result</c> is JSON null.
/// </summary>
[Fact]
public async Task ExtractAsync_NullResult_ReturnsFailureWithMessage()
{
    // Arrange
    var apiResponse = new
    {
        status = "error",
        message = "Processing failed",
        result = (object?)null,
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeFalse();
    result.ErrorMessage.Should().Be("Processing failed");
}
/// <summary>
/// Expects a failed result carrying the API <c>message</c> when <c>status</c> is "error",
/// even though a <c>result</c> object is present.
/// </summary>
[Fact]
public async Task ExtractAsync_ErrorStatus_ReturnsFailureWithMessage()
{
    // Arrange
    var apiResponse = new
    {
        status = "error",
        message = "Document format not supported",
        result = new
        {
            document_id = "test",
            success = false,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 10.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeFalse();
    result.ErrorMessage.Should().Be("Document format not supported");
}
/// <summary>
/// Expects a "partial" status to yield a successful result that still exposes the detected fields.
/// </summary>
[Fact]
public async Task ExtractAsync_PartialStatus_TreatedAsSuccess()
{
    // Arrange
    var apiResponse = new
    {
        status = "partial",
        message = "Some fields not detected",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>
            {
                ["InvoiceNumber"] = "PARTIAL-001",
            },
            confidence = new Dictionary<string, decimal>
            {
                ["InvoiceNumber"] = 0.75m,
            },
            processing_time_ms = 200.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeTrue();
    result.Data.Should().NotBeNull();
    result.Data!.InvoiceNumber.Should().Be("PARTIAL-001");
}
/// <summary>
/// Expects a timeout error result when the handler throws <see cref="TaskCanceledException"/>
/// while the caller's token was never cancelled.
/// </summary>
[Fact]
public async Task ExtractAsync_TaskCanceledException_NotUserCancellation_ReturnsTimeoutError()
{
    // Arrange
    var handler = new TimeoutHttpMessageHandler();
    // Disposing the client also disposes the handler it was constructed with.
    using var httpClient = new HttpClient(handler) { BaseAddress = new Uri("http://localhost") };
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    // CancellationToken.None cannot be cancelled, so the exception is not a user cancellation.
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeFalse();
    result.ErrorMessage.Should().Be("OCR API request timed out");
}
/// <summary>
/// Expects a connection error result when the handler throws <see cref="HttpRequestException"/>.
/// </summary>
[Fact]
public async Task ExtractAsync_HttpRequestException_ReturnsConnectionError()
{
    // Arrange
    var handler = new ConnectionErrorHttpMessageHandler();
    // Disposing the client also disposes the handler it was constructed with.
    using var httpClient = new HttpClient(handler) { BaseAddress = new Uri("http://localhost") };
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.Success.Should().BeFalse();
    result.ErrorMessage.Should().Contain("OCR API connection error");
}
/// <summary>
/// Expects the <c>Amount</c> field to parse to the same decimal for dot and comma decimal
/// separators, space and comma thousands separators, and plain integers.
/// </summary>
/// <param name="input">Raw amount string returned by the API.</param>
/// <param name="expected">
/// Expected parsed amount. The attribute supplies a double literal (attribute arguments cannot
/// be decimal), which xUnit converts to this decimal parameter.
/// </param>
[Theory]
[InlineData("1234.56", 1234.56)]
[InlineData("1 234,56", 1234.56)]
[InlineData("1,234.56", 1234.56)]
[InlineData("1234,56", 1234.56)]
[InlineData("12345", 12345)]
[InlineData("0.99", 0.99)]
public async Task ExtractAsync_DecimalParsing_HandlesVariousFormats(string input, decimal expected)
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>
            {
                ["Amount"] = input,
            },
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Data.Should().NotBeNull();
    result.Data!.AmountTotal.Should().Be(expected);
}
/// <summary>
/// Expects the <c>InvoiceDate</c> field to parse to the same calendar date for ISO, slash- and
/// dot-separated day-first, and slash-separated year-first inputs.
/// </summary>
/// <param name="input">Raw date string returned by the API.</param>
/// <param name="year">Expected year.</param>
/// <param name="month">Expected month.</param>
/// <param name="day">Expected day of month.</param>
[Theory]
[InlineData("2024-01-15", 2024, 1, 15)]
[InlineData("15/01/2024", 2024, 1, 15)]
[InlineData("15.01.2024", 2024, 1, 15)]
[InlineData("2024/01/15", 2024, 1, 15)]
public async Task ExtractAsync_DateParsing_HandlesVariousFormats(string input, int year, int month, int day)
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>
            {
                ["InvoiceDate"] = input,
            },
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Data.Should().NotBeNull();
    result.Data!.InvoiceDate.Should().Be(new DateTime(year, month, day));
}
/// <summary>
/// Expects extraction to succeed for PDF, PNG and JPEG file names in either case, and for an
/// unrecognized extension.
/// </summary>
/// <param name="fileName">File name passed to <c>ExtractAsync</c>.</param>
[Theory]
[InlineData("test.pdf")]
[InlineData("invoice.PDF")]
[InlineData("scan.png")]
[InlineData("photo.PNG")]
[InlineData("image.jpg")]
[InlineData("photo.jpeg")]
[InlineData("document.JPG")]
[InlineData("unknown.txt")]
public async Task ExtractAsync_DifferentFileTypes_ProcessesSuccessfully(string fileName)
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, fileName, CancellationToken.None);

    // Assert
    // NOTE(review): only overall success is checked; the content type sent for each
    // extension is not inspected here.
    result.Should().NotBeNull();
    result.Success.Should().BeTrue();
}
/// <summary>
/// Expects the entries of the result's <c>errors</c> list to be joined with "; " into the
/// error message, even when the status is "success".
/// </summary>
[Fact]
public async Task ExtractAsync_ErrorsInResult_IncludedInErrorMessage()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
            errors = new List<string> { "Low quality image", "Missing page 2" },
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Should().NotBeNull();
    result.ErrorMessage.Should().Be("Low quality image; Missing page 2");
}
/// <summary>
/// Expects the currency to be "SEK" when the API response contains no <c>Currency</c> field.
/// </summary>
[Fact]
public async Task ExtractAsync_DefaultCurrency_SetToSEK()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Data.Should().NotBeNull();
    result.Data!.Currency.Should().Be("SEK");
}
/// <summary>
/// Expects a <c>Currency</c> field supplied by the API ("EUR") to be used as the currency.
/// </summary>
[Fact]
public async Task ExtractAsync_CustomCurrency_OverridesDefault()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>
            {
                ["Currency"] = "EUR",
            },
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };

    // Dispose the client (and the handler it owns) when the test ends.
    using var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream();

    // Act
    var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    result.Data.Should().NotBeNull();
    result.Data!.Currency.Should().Be("EUR");
}
/// <summary>
/// Captures the outgoing request and expects it to target a URI ending in "infer" with
/// multipart/form-data content.
/// </summary>
[Fact]
public async Task ExtractAsync_MultipartFormData_IncludesFileAndExtractLineItemsParameter()
{
    // Arrange
    var apiResponse = new
    {
        status = "success",
        message = "OK",
        result = new
        {
            document_id = "test",
            success = true,
            fields = new Dictionary<string, string?>(),
            confidence = new Dictionary<string, decimal>(),
            processing_time_ms = 100.0,
        },
    };
    var captureHandler = new CaptureHttpMessageHandler(HttpStatusCode.OK, apiResponse);
    // Disposing the client also disposes the capture handler; CapturedRequest stays readable
    // until the end of the test because the using declaration disposes on scope exit.
    using var httpClient = new HttpClient(captureHandler) { BaseAddress = new Uri("http://localhost") };
    var service = new OcrService(httpClient, _loggerMock.Object);
    using var stream = new MemoryStream(Encoding.UTF8.GetBytes("test file content"));

    // Act
    await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

    // Assert
    // NOTE(review): the test name promises the file part and the extract_line_items
    // parameter, but only the URI suffix and the content type are asserted. Inspecting the
    // individual parts needs their names from OcrService — confirm and extend.
    captureHandler.CapturedRequest.Should().NotBeNull();
    captureHandler.CapturedRequest!.RequestUri.Should().NotBeNull();
    captureHandler.CapturedRequest.RequestUri!.ToString().Should().EndWith("infer");
    captureHandler.CapturedRequest.Content.Should().NotBeNull();
    captureHandler.CapturedRequest.Content.Should().BeOfType<MultipartFormDataContent>();
}
/// <summary>
/// Builds an <see cref="HttpClient"/> rooted at http://localhost whose handler answers every
/// request with <paramref name="statusCode"/> and the JSON serialization of
/// <paramref name="response"/>.
/// </summary>
/// <param name="statusCode">Status code the fake handler returns.</param>
/// <param name="response">Object serialized with <see cref="JsonSerializer"/> to form the body.</param>
/// <returns>A client backed by a <c>MockHttpMessageHandler</c>.</returns>
private static HttpClient CreateHttpClient(HttpStatusCode statusCode, object response)
{
    var serializedBody = JsonSerializer.Serialize(response);
    return new HttpClient(new MockHttpMessageHandler(statusCode, serializedBody))
    {
        BaseAddress = new Uri("http://localhost"),
    };
}
/// <summary>
/// Builds an <see cref="HttpClient"/> rooted at http://localhost whose handler answers every
/// request with <paramref name="statusCode"/> and <paramref name="responseBody"/> sent verbatim.
/// </summary>
/// <param name="statusCode">Status code the fake handler returns.</param>
/// <param name="responseBody">Raw response body; it is not serialized or validated here.</param>
/// <returns>A client backed by a <c>MockHttpMessageHandler</c>.</returns>
private static HttpClient CreateHttpClient(HttpStatusCode statusCode, string responseBody)
    => new HttpClient(new MockHttpMessageHandler(statusCode, responseBody))
    {
        BaseAddress = new Uri("http://localhost"),
    };
/// <summary>
/// Handler stub that answers every request with a fixed status code and a fixed body sent as
/// UTF-8 with the application/json media type.
/// </summary>
private sealed class MockHttpMessageHandler : HttpMessageHandler
{
    private readonly HttpStatusCode _statusCode;
    private readonly string _responseBody;

    /// <summary>Stores the status code and body to replay for every request.</summary>
    public MockHttpMessageHandler(HttpStatusCode statusCode, string responseBody)
        => (_statusCode, _responseBody) = (statusCode, responseBody);

    /// <summary>Returns an already completed task holding the canned response; the request is ignored.</summary>
    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        var content = new StringContent(_responseBody, Encoding.UTF8, "application/json");
        return Task.FromResult(new HttpResponseMessage(_statusCode) { Content = content });
    }
}
/// <summary>
/// Handler stub that fails every request by throwing <see cref="TaskCanceledException"/>
/// with the message "Request timed out".
/// </summary>
private sealed class TimeoutHttpMessageHandler : HttpMessageHandler
{
    /// <summary>Throws directly from the call instead of returning a task; both arguments are ignored.</summary>
    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        => throw new TaskCanceledException("Request timed out");
}
/// <summary>
/// Handler stub that fails every request by throwing <see cref="HttpRequestException"/>
/// with the message "Unable to connect to server".
/// </summary>
private sealed class ConnectionErrorHttpMessageHandler : HttpMessageHandler
{
    /// <summary>Throws directly from the call instead of returning a task; both arguments are ignored.</summary>
    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        => throw new HttpRequestException("Unable to connect to server");
}
/// <summary>
/// Handler stub that remembers the most recent request it was given and answers with a fixed
/// status code and the JSON serialization of a supplied object.
/// </summary>
private sealed class CaptureHttpMessageHandler : HttpMessageHandler
{
    private readonly HttpStatusCode _statusCode;
    private readonly string _responseBody;

    /// <summary>The last request passed to <c>SendAsync</c>, or null if none was sent yet.</summary>
    public HttpRequestMessage? CapturedRequest { get; private set; }

    /// <summary>Stores the status code and serializes <paramref name="response"/> once, up front.</summary>
    public CaptureHttpMessageHandler(HttpStatusCode statusCode, object response)
        => (_statusCode, _responseBody) = (statusCode, JsonSerializer.Serialize(response));

    /// <summary>Records the request reference, then returns an already completed canned response.</summary>
    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        CapturedRequest = request;

        var content = new StringContent(_responseBody, Encoding.UTF8, "application/json");
        return Task.FromResult(new HttpResponseMessage(_statusCode) { Content = content });
    }
}
}