From c4e3773df151a02c49cda654047c630523e74ef5 Mon Sep 17 00:00:00 2001 From: Yaojia Wang Date: Mon, 2 Feb 2026 11:49:21 +0100 Subject: [PATCH] feat: upgrade PaddlePaddle and PaddleOCR to 3.x - Update paddlepaddle from >=2.5.0 to >=3.0.0,<3.3.0 - Update paddleocr from >=2.7.0 to >=3.0.0 - Update paddlepaddle-gpu from >=2.5.0 to >=3.0.0,<3.3.0 Note: PaddlePaddle 3.3.0 has a OneDNN bug that breaks CPU inference (ConvertPirAttribute2RuntimeAttribute not implemented). Using <3.3.0 until the bug is fixed upstream. This upgrade enables PP-StructureV3 for table extraction and uses PP-OCRv5 for improved text recognition accuracy. The existing codebase is already compatible with the 3.x API (predict() method and new response format). Verified: - PaddleOCR import works - PPStructureV3 is available - OCREngine initializes correctly - Inference API returns correct field extractions - 2117 unit tests pass Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 6 +++--- requirements.txt | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 04613fd..36b3ce2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,8 @@ classifiers = [ dependencies = [ "PyMuPDF>=1.23.0", - "paddlepaddle>=2.5.0", - "paddleocr>=2.7.0", + "paddlepaddle>=3.0.0,<3.3.0", + "paddleocr>=3.0.0", "ultralytics>=8.1.0", "Pillow>=10.0.0", "numpy>=1.24.0", @@ -45,7 +45,7 @@ dev = [ "testcontainers[postgres]>=4.0.0", ] gpu = [ - "paddlepaddle-gpu>=2.5.0", + "paddlepaddle-gpu>=3.0.0,<3.3.0", ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 2cb7eca..cb64878 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ PyMuPDF>=1.23.0 # PDF rendering and text extraction # OCR -paddlepaddle>=2.5.0 # PaddlePaddle framework -paddleocr>=2.7.0 # PaddleOCR +paddlepaddle>=3.0.0,<3.3.0 # PaddlePaddle framework (3.3.0 has OneDNN bug) +paddleocr>=3.0.0 # PaddleOCR (PP-OCRv5) # YOLO ultralytics>=8.1.0 # YOLOv8/v11