From 26d0c412036cc73acc7807b9c63808b4db5c3d17 Mon Sep 17 00:00:00 2001 From: Yaojia Wang Date: Fri, 13 Feb 2026 10:30:50 +0100 Subject: [PATCH] feat: integrate invoice-master-poc-v2 inference API Rewrite OcrService to match the actual inference API response format (nested status/result structure with PascalCase/snake_case field names). Register IOcrService in DI with typed HttpClient and Polly v8 resilience (retry, timeout, circuit breaker via AddStandardResilienceHandler). Key changes: - Fix response model to match real API (InferenceApiResponse) - Map correct field names (InvoiceNumber, InvoiceDueDate, OCR, Amount, etc.) - Add extract_line_items=true for VAT summary extraction - Copy stream before sending to avoid disposal conflicts with retries - Add JsonException handling for malformed responses - Remove sensitive data from error logs - Add 35 unit tests covering field mapping, VAT parsing, error handling, decimal/date formats, and content type detection --- .../FiscalFlow.Core/Interfaces/IOcrService.cs | 5 + .../Extensions/DependencyInjection.cs | 24 +- .../FiscalFlow.Infrastructure.csproj | 7 +- .../Services/OcrService.cs | 278 ++++-- .../FiscalFlow.UnitTests.csproj | 2 + .../Services/OcrServiceTests.cs | 806 ++++++++++++++++++ 6 files changed, 1044 insertions(+), 78 deletions(-) create mode 100644 backend/tests/FiscalFlow.UnitTests/Services/OcrServiceTests.cs diff --git a/backend/src/FiscalFlow.Core/Interfaces/IOcrService.cs b/backend/src/FiscalFlow.Core/Interfaces/IOcrService.cs index 08c945d..5a2be77 100644 --- a/backend/src/FiscalFlow.Core/Interfaces/IOcrService.cs +++ b/backend/src/FiscalFlow.Core/Interfaces/IOcrService.cs @@ -11,6 +11,9 @@ public class OcrResult public string? ErrorMessage { get; set; } public InvoiceData? Data { get; set; } public decimal Confidence { get; set; } + public Dictionary FieldConfidences { get; set; } = new(); + public double ProcessingTimeMs { get; set; } + public string? 
DocumentType { get; set; } } public class InvoiceData @@ -26,5 +29,7 @@ public class InvoiceData public string? OcrNumber { get; set; } public string? Bankgiro { get; set; } public string? Plusgiro { get; set; } + public string? CustomerNumber { get; set; } + public string? PaymentLine { get; set; } public string Currency { get; set; } = "SEK"; } diff --git a/backend/src/FiscalFlow.Infrastructure/Extensions/DependencyInjection.cs b/backend/src/FiscalFlow.Infrastructure/Extensions/DependencyInjection.cs index a82431b..93dc388 100644 --- a/backend/src/FiscalFlow.Infrastructure/Extensions/DependencyInjection.cs +++ b/backend/src/FiscalFlow.Infrastructure/Extensions/DependencyInjection.cs @@ -5,13 +5,14 @@ using FiscalFlow.Infrastructure.Services; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Http.Resilience; namespace FiscalFlow.Infrastructure.Extensions; public static class DependencyInjection { public static IServiceCollection AddInfrastructureServices( - this IServiceCollection services, + this IServiceCollection services, IConfiguration configuration) { services.AddDbContext(options => @@ -25,6 +26,27 @@ public static class DependencyInjection services.AddScoped(); services.AddSingleton(); + services + .AddHttpClient(client => + { + var baseUrl = configuration["Ocr:ApiUrl"] ?? 
"http://localhost:8000/api/v1"; + client.BaseAddress = new Uri(baseUrl.TrimEnd('/') + "/"); + client.Timeout = TimeSpan.FromSeconds(60); + + var apiKey = configuration["Ocr:ApiKey"]; + if (!string.IsNullOrEmpty(apiKey)) + { + client.DefaultRequestHeaders.Add("X-API-Key", apiKey); + } + }) + .AddStandardResilienceHandler(options => + { + options.Retry.MaxRetryAttempts = 3; + options.Retry.Delay = TimeSpan.FromSeconds(2); + options.AttemptTimeout.Timeout = TimeSpan.FromSeconds(30); + options.TotalRequestTimeout.Timeout = TimeSpan.FromSeconds(90); + }); + return services; } } diff --git a/backend/src/FiscalFlow.Infrastructure/FiscalFlow.Infrastructure.csproj b/backend/src/FiscalFlow.Infrastructure/FiscalFlow.Infrastructure.csproj index 110849a..477f7be 100644 --- a/backend/src/FiscalFlow.Infrastructure/FiscalFlow.Infrastructure.csproj +++ b/backend/src/FiscalFlow.Infrastructure/FiscalFlow.Infrastructure.csproj @@ -7,16 +7,15 @@ - + + all runtime; build; native; contentfiles; analyzers - - - + diff --git a/backend/src/FiscalFlow.Infrastructure/Services/OcrService.cs b/backend/src/FiscalFlow.Infrastructure/Services/OcrService.cs index d38c228..715df23 100644 --- a/backend/src/FiscalFlow.Infrastructure/Services/OcrService.cs +++ b/backend/src/FiscalFlow.Infrastructure/Services/OcrService.cs @@ -1,154 +1,286 @@ -using FiscalFlow.Core.Interfaces; -using Microsoft.Extensions.Configuration; -using Microsoft.Extensions.Logging; +using System.Globalization; using System.Net.Http.Headers; -using System.Net.Http.Json; using System.Text.Json; +using System.Text.Json.Serialization; +using FiscalFlow.Core.Interfaces; +using Microsoft.Extensions.Logging; namespace FiscalFlow.Infrastructure.Services; public class OcrService : IOcrService { + private static readonly JsonSerializerOptions JsonOptions = new() + { + PropertyNameCaseInsensitive = true, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + }; + private readonly HttpClient _httpClient; private readonly ILogger 
_logger; - private readonly string _apiUrl; - private readonly string? _apiKey; - public OcrService(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) + public OcrService(HttpClient httpClient, ILogger logger) { - _httpClient = httpClientFactory.CreateClient(); + _httpClient = httpClient; _logger = logger; - _apiUrl = configuration["Ocr:ApiUrl"] ?? "http://localhost:8000/api/v1"; - _apiKey = configuration["Ocr:ApiKey"]; } public async Task ExtractAsync(Stream fileStream, string fileName, CancellationToken cancellationToken = default) { try { - var content = new MultipartFormDataContent(); - var streamContent = new StreamContent(fileStream); - streamContent.Headers.ContentType = new MediaTypeHeaderValue("application/pdf"); + using var streamCopy = new MemoryStream(); + await fileStream.CopyToAsync(streamCopy, cancellationToken); + streamCopy.Position = 0; + + using var content = new MultipartFormDataContent(); + var streamContent = new StreamContent(streamCopy); + streamContent.Headers.ContentType = new MediaTypeHeaderValue(GetContentType(fileName)); content.Add(streamContent, "file", fileName); + content.Add(new StringContent("true"), "extract_line_items"); - var request = new HttpRequestMessage(HttpMethod.Post, $"{_apiUrl}/infer") - { - Content = content - }; - - if (!string.IsNullOrEmpty(_apiKey)) - { - request.Headers.Add("X-API-Key", _apiKey); - } - - var response = await _httpClient.SendAsync(request, cancellationToken); - var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); + var response = await _httpClient.PostAsync("infer", content, cancellationToken); + var responseBody = await response.Content.ReadAsStringAsync(cancellationToken); if (!response.IsSuccessStatusCode) { - _logger.LogError("OCR API error: {StatusCode} - {Content}", response.StatusCode, responseContent); + _logger.LogError("OCR API error: {StatusCode}, response length: {Length} bytes", response.StatusCode, responseBody.Length); 
return new OcrResult { Success = false, - ErrorMessage = $"OCR API returned {response.StatusCode}" + ErrorMessage = $"OCR API returned {response.StatusCode}", }; } - var result = JsonSerializer.Deserialize(responseContent, new JsonSerializerOptions + InferenceApiResponse? apiResponse; + try { - PropertyNameCaseInsensitive = true - }); + apiResponse = JsonSerializer.Deserialize(responseBody, JsonOptions); + } + catch (JsonException ex) + { + _logger.LogError(ex, "Failed to parse OCR API response"); + return new OcrResult + { + Success = false, + ErrorMessage = "Invalid response format from OCR API", + }; + } - if (result == null) + if (apiResponse?.Result == null) { return new OcrResult { Success = false, - ErrorMessage = "Invalid OCR response" + ErrorMessage = apiResponse?.Message ?? "Invalid OCR response", }; } + if (!string.Equals(apiResponse.Status, "success", StringComparison.OrdinalIgnoreCase) && + !string.Equals(apiResponse.Status, "partial", StringComparison.OrdinalIgnoreCase)) + { + return new OcrResult + { + Success = false, + ErrorMessage = apiResponse.Message ?? $"OCR status: {apiResponse.Status}", + }; + } + + var result = apiResponse.Result; + var fieldConfidences = result.Confidence ?? new Dictionary(); + var averageConfidence = fieldConfidences.Count > 0 + ? fieldConfidences.Values.Average() + : 0m; + return new OcrResult { Success = result.Success, - Data = MapToInvoiceData(result.Fields), - Confidence = result.Confidence, - ErrorMessage = result.Error + Data = MapToInvoiceData(result.Fields, result.VatSummary), + Confidence = Math.Round(averageConfidence, 4), + FieldConfidences = fieldConfidences, + ProcessingTimeMs = result.ProcessingTimeMs, + DocumentType = result.DocumentType, + ErrorMessage = result.Errors?.Count > 0 ? 
string.Join("; ", result.Errors) : null, }; } - catch (Exception ex) + catch (TaskCanceledException ex) when (!cancellationToken.IsCancellationRequested) { - _logger.LogError(ex, "Error calling OCR API"); + _logger.LogError(ex, "OCR API request timed out"); return new OcrResult { Success = false, - ErrorMessage = ex.Message + ErrorMessage = "OCR API request timed out", + }; + } + catch (HttpRequestException ex) + { + _logger.LogError(ex, "Error connecting to OCR API"); + return new OcrResult + { + Success = false, + ErrorMessage = $"OCR API connection error: {ex.Message}", }; } } - private static InvoiceData MapToInvoiceData(Dictionary? fields) + private static InvoiceData MapToInvoiceData(Dictionary? fields, VatSummaryResult? vatSummary) { if (fields == null) { return new InvoiceData(); } - return new InvoiceData + var data = new InvoiceData { - SupplierName = GetFieldValue(fields, "supplier_name"), - SupplierOrgNumber = GetFieldValue(fields, "supplier_org_number"), - InvoiceNumber = GetFieldValue(fields, "invoice_number"), - InvoiceDate = ParseDate(GetFieldValue(fields, "invoice_date")), - DueDate = ParseDate(GetFieldValue(fields, "due_date")), - AmountTotal = ParseDecimal(GetFieldValue(fields, "amount_total")), - AmountVat = ParseDecimal(GetFieldValue(fields, "amount_vat")), - VatRate = ParseInt(GetFieldValue(fields, "vat_rate")), - OcrNumber = GetFieldValue(fields, "ocr_number"), - Bankgiro = GetFieldValue(fields, "bankgiro"), - Plusgiro = GetFieldValue(fields, "plusgiro"), - Currency = GetFieldValue(fields, "currency") ?? 
"SEK" + SupplierName = GetField(fields, "SupplierName"), + SupplierOrgNumber = GetField(fields, "supplier_org_number"), + InvoiceNumber = GetField(fields, "InvoiceNumber"), + InvoiceDate = ParseDate(GetField(fields, "InvoiceDate")), + DueDate = ParseDate(GetField(fields, "InvoiceDueDate")), + AmountTotal = ParseDecimal(GetField(fields, "Amount")), + OcrNumber = GetField(fields, "OCR"), + Bankgiro = GetField(fields, "Bankgiro"), + Plusgiro = GetField(fields, "Plusgiro"), + CustomerNumber = GetField(fields, "customer_number"), + PaymentLine = GetField(fields, "payment_line"), + Currency = GetField(fields, "Currency") ?? "SEK", }; + + if (vatSummary != null) + { + data.AmountVat = ParseDecimal(vatSummary.TotalVat); + + if (data.AmountVat == null && vatSummary.Breakdowns?.Count > 0) + { + data.AmountVat = vatSummary.Breakdowns + .Select(b => ParseDecimal(b.VatAmount)) + .Where(v => v.HasValue) + .Aggregate((decimal?)null, (sum, v) => (sum ?? 0) + v); + } + + if (vatSummary.Breakdowns?.Count > 0) + { + var primaryRate = vatSummary.Breakdowns[0].Rate; + data.VatRate = primaryRate.HasValue ? (int)Math.Round(primaryRate.Value) : null; + } + } + + return data; } - private static string? GetFieldValue(Dictionary fields, string key) + private static string? GetField(Dictionary fields, string key) { - return fields.TryGetValue(key, out var field) ? field.Value : null; + return fields.TryGetValue(key, out var value) ? value : null; + } + + private static string GetContentType(string fileName) + { + var extension = Path.GetExtension(fileName)?.ToLowerInvariant(); + return extension switch + { + ".pdf" => "application/pdf", + ".png" => "image/png", + ".jpg" or ".jpeg" => "image/jpeg", + _ => "application/octet-stream", + }; } private static DateTime? ParseDate(string? 
value) { - if (string.IsNullOrEmpty(value)) return null; - if (DateTime.TryParse(value, out var date)) return date; - return null; + if (string.IsNullOrWhiteSpace(value)) return null; + + string[] formats = ["yyyy-MM-dd", "yyyy/MM/dd", "dd/MM/yyyy", "dd.MM.yyyy"]; + if (DateTime.TryParseExact(value, formats, CultureInfo.InvariantCulture, DateTimeStyles.None, out var date)) + { + return date; + } + + return DateTime.TryParse(value, CultureInfo.InvariantCulture, DateTimeStyles.None, out date) ? date : null; } private static decimal? ParseDecimal(string? value) { - if (string.IsNullOrEmpty(value)) return null; - value = value.Replace(",", "").Replace(" ", ""); - if (decimal.TryParse(value, out var result)) return result; - return null; - } + if (string.IsNullOrWhiteSpace(value)) return null; + var cleaned = value.Replace(" ", "").Replace("\u00a0", ""); - private static int? ParseInt(string? value) - { - if (string.IsNullOrEmpty(value)) return null; - if (int.TryParse(value, out var result)) return result; - return null; + if (cleaned.Contains(',') && cleaned.Contains('.')) + { + cleaned = cleaned.Replace(",", ""); + } + else if (cleaned.Contains(',')) + { + cleaned = cleaned.Replace(",", "."); + } + + return decimal.TryParse(cleaned, NumberStyles.Number, CultureInfo.InvariantCulture, out var result) + ? result + : null; } } -public class OcrApiResponse +internal sealed class InferenceApiResponse { + public string? Status { get; set; } + public string? Message { get; set; } + public InferenceApiResult? Result { get; set; } +} + +internal sealed class InferenceApiResult +{ + [JsonPropertyName("document_id")] + public string? DocumentId { get; set; } + public bool Success { get; set; } - public string? Error { get; set; } - public Dictionary? Fields { get; set; } + + [JsonPropertyName("document_type")] + public string? DocumentType { get; set; } + + public Dictionary? Fields { get; set; } + + public Dictionary? Confidence { get; set; } + + public List? 
Detections { get; set; } + + [JsonPropertyName("processing_time_ms")] + public double ProcessingTimeMs { get; set; } + + public List? Errors { get; set; } + + [JsonPropertyName("vat_summary")] + public VatSummaryResult? VatSummary { get; set; } +} + +internal sealed class DetectionItem +{ + public string? Field { get; set; } + public decimal Confidence { get; set; } + public List? Bbox { get; set; } +} + +internal sealed class VatSummaryResult +{ + public List? Breakdowns { get; set; } + + [JsonPropertyName("total_excl_vat")] + public string? TotalExclVat { get; set; } + + [JsonPropertyName("total_vat")] + public string? TotalVat { get; set; } + + [JsonPropertyName("total_incl_vat")] + public string? TotalInclVat { get; set; } + public decimal Confidence { get; set; } } -public class OcrField +internal sealed class VatBreakdownResult { - public string? Value { get; set; } - public decimal Confidence { get; set; } + public decimal? Rate { get; set; } + + [JsonPropertyName("base_amount")] + public string? BaseAmount { get; set; } + + [JsonPropertyName("vat_amount")] + public string? VatAmount { get; set; } + + public string? 
Source { get; set; } } diff --git a/backend/tests/FiscalFlow.UnitTests/FiscalFlow.UnitTests.csproj b/backend/tests/FiscalFlow.UnitTests/FiscalFlow.UnitTests.csproj index f8d208e..7454ad8 100644 --- a/backend/tests/FiscalFlow.UnitTests/FiscalFlow.UnitTests.csproj +++ b/backend/tests/FiscalFlow.UnitTests/FiscalFlow.UnitTests.csproj @@ -5,6 +5,7 @@ + @@ -22,6 +23,7 @@ + diff --git a/backend/tests/FiscalFlow.UnitTests/Services/OcrServiceTests.cs b/backend/tests/FiscalFlow.UnitTests/Services/OcrServiceTests.cs new file mode 100644 index 0000000..29718c1 --- /dev/null +++ b/backend/tests/FiscalFlow.UnitTests/Services/OcrServiceTests.cs @@ -0,0 +1,806 @@ +using System.Net; +using System.Text; +using System.Text.Json; +using FiscalFlow.Core.Interfaces; +using FiscalFlow.Infrastructure.Services; +using FluentAssertions; +using Microsoft.Extensions.Logging; +using Moq; +using Xunit; + +namespace FiscalFlow.UnitTests.Services; + +public sealed class OcrServiceTests +{ + private readonly Mock> _loggerMock; + + public OcrServiceTests() + { + _loggerMock = new Mock>(); + } + + [Fact] + public async Task ExtractAsync_SuccessfulExtraction_ReturnsCompleteOcrResult() + { + // Arrange + var apiResponse = new + { + status = "success", + message = "Processed document abc123", + result = new + { + document_id = "abc123", + success = true, + document_type = "invoice", + fields = new Dictionary + { + ["InvoiceNumber"] = "INV-001", + ["InvoiceDate"] = "2024-01-15", + ["InvoiceDueDate"] = "2024-02-15", + ["OCR"] = "123456789", + ["Bankgiro"] = "1234-5678", + ["Plusgiro"] = null, + ["Amount"] = "12500.00", + ["SupplierName"] = "Test AB", + ["supplier_org_number"] = "556677-8899", + ["customer_number"] = "C-001", + ["payment_line"] = "# 123456789 # 12500 00 #", + ["Currency"] = "SEK", + }, + confidence = new Dictionary + { + ["InvoiceNumber"] = 0.95m, + ["InvoiceDate"] = 0.88m, + ["Amount"] = 0.92m, + }, + detections = new List(), + processing_time_ms = 1234.5, + errors = new List(), + 
vat_summary = new + { + breakdowns = new[] + { + new + { + rate = 25.0m, + vat_amount = "2500.00", + base_amount = "10000.00", + source = "regex", + }, + }, + total_excl_vat = "10000.00", + total_vat = "2500.00", + total_incl_vat = "12500.00", + confidence = 0.85m, + }, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeTrue(); + result.Confidence.Should().Be(0.9167m); + result.ProcessingTimeMs.Should().Be(1234.5); + result.DocumentType.Should().Be("invoice"); + result.FieldConfidences.Should().ContainKey("InvoiceNumber").WhoseValue.Should().Be(0.95m); + result.FieldConfidences.Should().ContainKey("InvoiceDate").WhoseValue.Should().Be(0.88m); + result.FieldConfidences.Should().ContainKey("Amount").WhoseValue.Should().Be(0.92m); + + var data = result.Data; + data.Should().NotBeNull(); + data!.InvoiceNumber.Should().Be("INV-001"); + data.InvoiceDate.Should().Be(new DateTime(2024, 1, 15)); + data.DueDate.Should().Be(new DateTime(2024, 2, 15)); + data.OcrNumber.Should().Be("123456789"); + data.Bankgiro.Should().Be("1234-5678"); + data.Plusgiro.Should().BeNull(); + data.AmountTotal.Should().Be(12500.00m); + data.SupplierName.Should().Be("Test AB"); + data.SupplierOrgNumber.Should().Be("556677-8899"); + data.CustomerNumber.Should().Be("C-001"); + data.PaymentLine.Should().Be("# 123456789 # 12500 00 #"); + data.Currency.Should().Be("SEK"); + data.AmountVat.Should().Be(2500.00m); + data.VatRate.Should().Be(25); + } + + [Fact] + public async Task ExtractAsync_FieldMapping_MapsAllFieldsCorrectly() + { + // Arrange + var apiResponse = new + { + status = "success", + message = "OK", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary + { + 
["InvoiceNumber"] = "TEST-123", + ["InvoiceDate"] = "2024-03-20", + ["InvoiceDueDate"] = "2024-04-20", + ["OCR"] = "987654321", + ["Bankgiro"] = "9999-8888", + ["Plusgiro"] = "123456-7", + ["Amount"] = "5000.00", + ["supplier_org_number"] = "111222-3333", + ["customer_number"] = "CUST-999", + ["payment_line"] = "Payment reference line", + }, + confidence = new Dictionary(), + processing_time_ms = 500.0, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + var data = result.Data; + data.Should().NotBeNull(); + data!.InvoiceNumber.Should().Be("TEST-123"); + data.InvoiceDate.Should().Be(new DateTime(2024, 3, 20)); + data.DueDate.Should().Be(new DateTime(2024, 4, 20)); + data.OcrNumber.Should().Be("987654321"); + data.Bankgiro.Should().Be("9999-8888"); + data.Plusgiro.Should().Be("123456-7"); + data.AmountTotal.Should().Be(5000.00m); + data.SupplierOrgNumber.Should().Be("111222-3333"); + data.CustomerNumber.Should().Be("CUST-999"); + data.PaymentLine.Should().Be("Payment reference line"); + } + + [Fact] + public async Task ExtractAsync_VatSummaryParsing_PopulatesVatFieldsFromTotalVat() + { + // Arrange + var apiResponse = new + { + status = "success", + message = "OK", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary(), + confidence = new Dictionary(), + processing_time_ms = 100.0, + vat_summary = new + { + total_vat = "1250.50", + breakdowns = new[] + { + new + { + rate = 25.0m, + vat_amount = "1250.50", + base_amount = "5000.00", + source = "table", + }, + }, + }, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await 
service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + var data = result.Data; + data.Should().NotBeNull(); + data!.AmountVat.Should().Be(1250.50m); + data.VatRate.Should().Be(25); + } + + [Fact] + public async Task ExtractAsync_VatSummaryWithoutTotalVat_SumsFromBreakdowns() + { + // Arrange + var apiResponse = new + { + status = "success", + message = "OK", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary(), + confidence = new Dictionary(), + processing_time_ms = 100.0, + vat_summary = new + { + breakdowns = new[] + { + new + { + rate = 25.0m, + vat_amount = "1000.00", + base_amount = "4000.00", + source = "table", + }, + new + { + rate = 12.0m, + vat_amount = "240.00", + base_amount = "2000.00", + source = "table", + }, + }, + }, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + var data = result.Data; + data.Should().NotBeNull(); + data!.AmountVat.Should().Be(1240.00m); + data.VatRate.Should().Be(25); + } + + [Fact] + public async Task ExtractAsync_ConfidenceCalculation_AveragesFieldConfidencesAndRoundsToFourDecimals() + { + // Arrange + var apiResponse = new + { + status = "success", + message = "OK", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary(), + confidence = new Dictionary + { + ["Field1"] = 0.123456789m, + ["Field2"] = 0.987654321m, + ["Field3"] = 0.555555555m, + }, + processing_time_ms = 100.0, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + 
result.Confidence.Should().Be(0.5556m); + } + + [Fact] + public async Task ExtractAsync_NoConfidenceScores_ReturnsZeroConfidence() + { + // Arrange + var apiResponse = new + { + status = "success", + message = "OK", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary(), + confidence = new Dictionary(), + processing_time_ms = 100.0, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Confidence.Should().Be(0m); + } + + [Fact] + public async Task ExtractAsync_HttpError500_ReturnsFailureWithErrorMessage() + { + // Arrange + var httpClient = CreateHttpClient(HttpStatusCode.InternalServerError, "Internal server error"); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeFalse(); + result.ErrorMessage.Should().Be("OCR API returned InternalServerError"); + } + + [Fact] + public async Task ExtractAsync_NullResponse_ReturnsFailure() + { + // Arrange + var httpClient = CreateHttpClient(HttpStatusCode.OK, "{}"); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeFalse(); + result.ErrorMessage.Should().NotBeNullOrEmpty(); + } + + [Fact] + public async Task ExtractAsync_NullResult_ReturnsFailureWithMessage() + { + // Arrange + var apiResponse = new + { + status = "error", + message = "Processing failed", + result = (object?)null, + }; + + var 
httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeFalse(); + result.ErrorMessage.Should().Be("Processing failed"); + } + + [Fact] + public async Task ExtractAsync_ErrorStatus_ReturnsFailureWithMessage() + { + // Arrange + var apiResponse = new + { + status = "error", + message = "Document format not supported", + result = new + { + document_id = "test", + success = false, + fields = new Dictionary(), + confidence = new Dictionary(), + processing_time_ms = 10.0, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeFalse(); + result.ErrorMessage.Should().Be("Document format not supported"); + } + + [Fact] + public async Task ExtractAsync_PartialStatus_TreatedAsSuccess() + { + // Arrange + var apiResponse = new + { + status = "partial", + message = "Some fields not detected", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary + { + ["InvoiceNumber"] = "PARTIAL-001", + }, + confidence = new Dictionary + { + ["InvoiceNumber"] = 0.75m, + }, + processing_time_ms = 200.0, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeTrue(); + 
result.Data.Should().NotBeNull(); + result.Data!.InvoiceNumber.Should().Be("PARTIAL-001"); + } + + [Fact] + public async Task ExtractAsync_TaskCanceledException_NotUserCancellation_ReturnsTimeoutError() + { + // Arrange + var handler = new TimeoutHttpMessageHandler(); + var httpClient = new HttpClient(handler) { BaseAddress = new Uri("http://localhost") }; + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeFalse(); + result.ErrorMessage.Should().Be("OCR API request timed out"); + } + + [Fact] + public async Task ExtractAsync_HttpRequestException_ReturnsConnectionError() + { + // Arrange + var handler = new ConnectionErrorHttpMessageHandler(); + var httpClient = new HttpClient(handler) { BaseAddress = new Uri("http://localhost") }; + var service = new OcrService(httpClient, _loggerMock.Object); + using var stream = new MemoryStream(); + + // Act + var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None); + + // Assert + result.Should().NotBeNull(); + result.Success.Should().BeFalse(); + result.ErrorMessage.Should().Contain("OCR API connection error"); + } + + [Theory] + [InlineData("1234.56", 1234.56)] + [InlineData("1 234,56", 1234.56)] + [InlineData("1,234.56", 1234.56)] + [InlineData("1234,56", 1234.56)] + [InlineData("12345", 12345)] + [InlineData("0.99", 0.99)] + public async Task ExtractAsync_DecimalParsing_HandlesVariousFormats(string input, decimal expected) + { + // Arrange + var apiResponse = new + { + status = "success", + message = "OK", + result = new + { + document_id = "test", + success = true, + fields = new Dictionary + { + ["Amount"] = input, + }, + confidence = new Dictionary(), + processing_time_ms = 100.0, + }, + }; + + var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse); + var 
service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream();

        // Act
        var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

        // Assert
        result.Data.Should().NotBeNull();
        result.Data!.AmountTotal.Should().Be(expected);
    }

    /// <summary>
    /// Verifies that an "InvoiceDate" field supplied in several textual layouts
    /// (ISO dashes, day-first slashes, day-first dots, year-first slashes) is mapped
    /// to the expected <see cref="DateTime"/> on the extracted invoice data.
    /// </summary>
    [Theory]
    [InlineData("2024-01-15", 2024, 1, 15)]
    [InlineData("15/01/2024", 2024, 1, 15)]
    [InlineData("15.01.2024", 2024, 1, 15)]
    [InlineData("2024/01/15", 2024, 1, 15)]
    public async Task ExtractAsync_DateParsing_HandlesVariousFormats(string input, int year, int month, int day)
    {
        // Arrange
        var apiResponse = new
        {
            status = "success",
            message = "OK",
            result = new
            {
                document_id = "test",
                success = true,
                fields = new Dictionary<string, string>
                {
                    ["InvoiceDate"] = input,
                },
                confidence = new Dictionary<string, double>(),
                processing_time_ms = 100.0,
            },
        };

        var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
        var service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream();

        // Act
        var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

        // Assert
        result.Data.Should().NotBeNull();
        result.Data!.InvoiceDate.Should().Be(new DateTime(year, month, day));
    }

    /// <summary>
    /// Verifies that extraction reports success for a range of file names, including
    /// upper-case extensions and an extension ("unknown.txt") that is neither PDF nor image.
    /// </summary>
    [Theory]
    [InlineData("test.pdf")]
    [InlineData("invoice.PDF")]
    [InlineData("scan.png")]
    [InlineData("photo.PNG")]
    [InlineData("image.jpg")]
    [InlineData("photo.jpeg")]
    [InlineData("document.JPG")]
    [InlineData("unknown.txt")]
    public async Task ExtractAsync_DifferentFileTypes_ProcessesSuccessfully(string fileName)
    {
        // Arrange
        var apiResponse = new
        {
            status = "success",
            message = "OK",
            result = new
            {
                document_id = "test",
                success = true,
                fields = new Dictionary<string, string>(),
                confidence = new Dictionary<string, double>(),
                processing_time_ms = 100.0,
            },
        };

        var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
        var service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream();

        // Act
        var result = await service.ExtractAsync(stream, fileName, CancellationToken.None);

        // Assert
        result.Should().NotBeNull();
        result.Success.Should().BeTrue();
    }

    /// <summary>
    /// Verifies that entries in the response's "errors" list are surfaced on
    /// <c>ErrorMessage</c>, joined with "; ".
    /// </summary>
    [Fact]
    public async Task ExtractAsync_ErrorsInResult_IncludedInErrorMessage()
    {
        // Arrange
        var apiResponse = new
        {
            status = "success",
            message = "OK",
            result = new
            {
                document_id = "test",
                success = true,
                fields = new Dictionary<string, string>(),
                confidence = new Dictionary<string, double>(),
                processing_time_ms = 100.0,
                errors = new List<string> { "Low quality image", "Missing page 2" },
            },
        };

        var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
        var service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream();

        // Act
        var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

        // Assert
        result.Should().NotBeNull();
        result.ErrorMessage.Should().Be("Low quality image; Missing page 2");
    }

    /// <summary>
    /// Verifies that the currency is "SEK" when the response carries no "Currency" field.
    /// </summary>
    [Fact]
    public async Task ExtractAsync_DefaultCurrency_SetToSEK()
    {
        // Arrange
        var apiResponse = new
        {
            status = "success",
            message = "OK",
            result = new
            {
                document_id = "test",
                success = true,
                fields = new Dictionary<string, string>(),
                confidence = new Dictionary<string, double>(),
                processing_time_ms = 100.0,
            },
        };

        var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
        var service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream();

        // Act
        var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

        // Assert
        result.Data.Should().NotBeNull();
        result.Data!.Currency.Should().Be("SEK");
    }

    /// <summary>
    /// Verifies that a "Currency" field in the response replaces the default currency.
    /// </summary>
    [Fact]
    public async Task ExtractAsync_CustomCurrency_OverridesDefault()
    {
        // Arrange
        var apiResponse = new
        {
            status = "success",
            message = "OK",
            result = new
            {
                document_id = "test",
                success = true,
                fields = new Dictionary<string, string>
                {
                    ["Currency"] = "EUR",
                },
                confidence = new Dictionary<string, double>(),
                processing_time_ms = 100.0,
            },
        };

        var httpClient = CreateHttpClient(HttpStatusCode.OK, apiResponse);
        var service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream();

        // Act
        var result = await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

        // Assert
        result.Data.Should().NotBeNull();
        result.Data!.Currency.Should().Be("EUR");
    }

    /// <summary>
    /// Verifies that the outgoing request targets a URI ending in "infer" and carries
    /// multipart form-data content.
    /// </summary>
    [Fact]
    public async Task ExtractAsync_MultipartFormData_IncludesFileAndExtractLineItemsParameter()
    {
        // Arrange
        var apiResponse = new
        {
            status = "success",
            message = "OK",
            result = new
            {
                document_id = "test",
                success = true,
                fields = new Dictionary<string, string>(),
                confidence = new Dictionary<string, double>(),
                processing_time_ms = 100.0,
            },
        };

        var captureHandler = new CaptureHttpMessageHandler(HttpStatusCode.OK, apiResponse);
        var httpClient = new HttpClient(captureHandler) { BaseAddress = new Uri("http://localhost") };
        var service = new OcrService(httpClient, _loggerMock.Object);
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes("test file content"));

        // Act
        await service.ExtractAsync(stream, "test.pdf", CancellationToken.None);

        // Assert
        // NOTE(review): the test name promises that the file part and the extract_line_items
        // parameter are present, but only the URI suffix and the content type are asserted.
        // Consider capturing the body inside the handler and asserting on its parts.
        captureHandler.CapturedRequest.Should().NotBeNull();
        captureHandler.CapturedRequest!.RequestUri.Should().NotBeNull();
        captureHandler.CapturedRequest.RequestUri!.ToString().Should().EndWith("infer");
        captureHandler.CapturedRequest.Content.Should().NotBeNull();
        captureHandler.CapturedRequest.Content.Should().BeOfType<MultipartFormDataContent>();
    }

    /// <summary>
    /// Builds an <see cref="HttpClient"/> whose handler answers every request with
    /// <paramref name="statusCode"/> and <paramref name="response"/> serialized as JSON.
    /// </summary>
    /// <param name="statusCode">Status code of the canned response.</param>
    /// <param name="response">Object serialized with <see cref="JsonSerializer"/> to form the body.</param>
    /// <returns>A client with base address "http://localhost".</returns>
    private static HttpClient CreateHttpClient(HttpStatusCode statusCode, object response)
    {
        var json = JsonSerializer.Serialize(response);
        var handler = new MockHttpMessageHandler(statusCode, json);
        return new HttpClient(handler) { BaseAddress = new Uri("http://localhost") };
    }

    /// <summary>
    /// Builds an <see cref="HttpClient"/> whose handler answers every request with
    /// <paramref name="statusCode"/> and the raw <paramref name="responseBody"/>.
    /// </summary>
    /// <param name="statusCode">Status code of the canned response.</param>
    /// <param name="responseBody">Body text returned verbatim (it need not be valid JSON).</param>
    /// <returns>A client with base address "http://localhost".</returns>
    private static HttpClient CreateHttpClient(HttpStatusCode statusCode, string responseBody)
    {
        var handler = new MockHttpMessageHandler(statusCode, responseBody);
        return new HttpClient(handler) { BaseAddress = new Uri("http://localhost") };
    }

    /// <summary>
    /// Handler that ignores the request and always returns the configured status code
    /// and body, labelled as UTF-8 "application/json".
    /// </summary>
    private sealed class MockHttpMessageHandler : HttpMessageHandler
    {
        private readonly HttpStatusCode _statusCode;
        private readonly string _responseBody;

        public MockHttpMessageHandler(HttpStatusCode statusCode, string responseBody)
        {
            _statusCode = statusCode;
            _responseBody = responseBody;
        }

        protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            var response = new HttpResponseMessage(_statusCode)
            {
                Content = new StringContent(_responseBody, Encoding.UTF8, "application/json"),
            };
            return Task.FromResult(response);
        }
    }

    /// <summary>
    /// Handler that fails every request with a <see cref="TaskCanceledException"/>.
    /// </summary>
    private sealed class TimeoutHttpMessageHandler : HttpMessageHandler
    {
        protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            throw new TaskCanceledException("Request timed out");
        }
    }

    /// <summary>
    /// Handler that fails every request with an <see cref="HttpRequestException"/>.
    /// </summary>
    private sealed class ConnectionErrorHttpMessageHandler : HttpMessageHandler
    {
        protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            throw new HttpRequestException("Unable to connect to server");
        }
    }

    /// <summary>
    /// Handler that stores the most recent request in <see cref="CapturedRequest"/> and
    /// replies with the configured status code and the JSON-serialized response object.
    /// </summary>
    private sealed class CaptureHttpMessageHandler : HttpMessageHandler
    {
        private readonly HttpStatusCode _statusCode;
        private readonly string _responseBody;

        /// <summary>The last request passed to <c>SendAsync</c>, or null if none was sent.</summary>
        public HttpRequestMessage? CapturedRequest { get; private set; }

        public CaptureHttpMessageHandler(HttpStatusCode statusCode, object response)
        {
            _statusCode = statusCode;
            _responseBody = JsonSerializer.Serialize(response);
        }

        protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            CapturedRequest = request;

            var response = new HttpResponseMessage(_statusCode)
            {
                Content = new StringContent(_responseBody, Encoding.UTF8, "application/json"),
            };
            return Task.FromResult(response);
        }
    }
}