From 025864121a0a743ce71493ecf19851e804d4c890 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Mon, 4 Aug 2025 23:17:28 -0700
Subject: [PATCH 01/22] added ocrmypdf

---
 argilla-server/pdm.lock                       | 294 +++++++++++++++++-
 argilla-server/pyproject.toml                 |   2 +
 .../argilla_server/api/schemas/v1/segments.py | 122 ++++++++
 .../contexts/document/figures.py              |  14 +
 .../contexts/document/preprocessing.py        | 158 ++++++++++
 .../contexts/document/tables.py               |  14 +
 .../src/argilla_server/jobs/document_jobs.py  |   8 +-
 7 files changed, 609 insertions(+), 3 deletions(-)
 create mode 100644 argilla-server/src/argilla_server/api/schemas/v1/segments.py
 create mode 100644 argilla-server/src/argilla_server/contexts/document/figures.py
 create mode 100644 argilla-server/src/argilla_server/contexts/document/preprocessing.py
 create mode 100644 argilla-server/src/argilla_server/contexts/document/tables.py

diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock
index 86777a985..c40a808db 100644
--- a/argilla-server/pdm.lock
+++ b/argilla-server/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "postgresql", "test"]
 strategy = []
 lock_version = "4.5.0"
-content_hash = "sha256:9ff6bc1261cec9c0bdce741c04a63bf63a0a8e194e380d560c5032a6aef76b11"
+content_hash = "sha256:21af2a3f2dff0688e08aed10184370c7c0aed450f814cf0d3509e964d9582654"
 
 [[metadata.targets]]
 requires_python = ">=3.9"
@@ -835,6 +835,18 @@ files = [
     {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"},
 ]
 
+[[package]]
+name = "deprecation"
+version = "2.1.0"
+summary = ""
+dependencies = [
+    "packaging",
+]
+files = [
+    {file = "deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a"},
+    {file = "deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff"},
+]
+
 [[package]]
 name = "dill"
 version = "0.3.8"
@@ -1228,12 +1240,24 @@ files = [
     {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"},
 ]
 
+[[package]]
+name = "img2pdf"
+version = "0.6.1"
+summary = ""
+dependencies = [
+    "pikepdf",
+    "pillow",
+]
+files = [
+    {file = "img2pdf-0.6.1.tar.gz", hash = "sha256:306e279eb832bc159d7d6294b697a9fbd11b4be1f799b14b3b2174fb506af289"},
+]
+
 [[package]]
 name = "importlib-metadata"
 version = "8.5.0"
 summary = ""
 dependencies = [
-    "zipp; python_full_version < \"3.13\"",
+    "zipp; python_full_version < \"3.10\"",
 ]
 files = [
     {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"},
@@ -1261,6 +1285,100 @@ files = [
     {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
 ]
 
+[[package]]
+name = "lxml"
+version = "6.0.0"
+summary = ""
+files = [
+    {file = "lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:35bc626eec405f745199200ccb5c6b36f202675d204aa29bb52e27ba2b71dea8"},
+    {file = "lxml-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:246b40f8a4aec341cbbf52617cad8ab7c888d944bfe12a6abd2b1f6cfb6f6082"},
+    {file = "lxml-6.0.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2793a627e95d119e9f1e19720730472f5543a6d84c50ea33313ce328d870f2dd"},
+    {file = "lxml-6.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:46b9ed911f36bfeb6338e0b482e7fe7c27d362c52fde29f221fddbc9ee2227e7"},
+    {file = "lxml-6.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b4790b558bee331a933e08883c423f65bbcd07e278f91b2272489e31ab1e2b4"},
+    {file = "lxml-6.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2030956cf4886b10be9a0285c6802e078ec2391e1dd7ff3eb509c2c95a69b76"},
+    {file = "lxml-6.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d23854ecf381ab1facc8f353dcd9adeddef3652268ee75297c1164c987c11dc"},
+    {file = "lxml-6.0.0-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:43fe5af2d590bf4691531b1d9a2495d7aab2090547eaacd224a3afec95706d76"},
+    {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74e748012f8c19b47f7d6321ac929a9a94ee92ef12bc4298c47e8b7219b26541"},
+    {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:43cfbb7db02b30ad3926e8fceaef260ba2fb7df787e38fa2df890c1ca7966c3b"},
+    {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34190a1ec4f1e84af256495436b2d196529c3f2094f0af80202947567fdbf2e7"},
+    {file = "lxml-6.0.0-cp310-cp310-win32.whl", hash = "sha256:5967fe415b1920a3877a4195e9a2b779249630ee49ece22021c690320ff07452"},
+    {file = "lxml-6.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:f3389924581d9a770c6caa4df4e74b606180869043b9073e2cec324bad6e306e"},
+    {file = "lxml-6.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:522fe7abb41309e9543b0d9b8b434f2b630c5fdaf6482bee642b34c8c70079c8"},
+    {file = "lxml-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4ee56288d0df919e4aac43b539dd0e34bb55d6a12a6562038e8d6f3ed07f9e36"},
+    {file = "lxml-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8dd6dd0e9c1992613ccda2bcb74fc9d49159dbe0f0ca4753f37527749885c25"},
+    {file = "lxml-6.0.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:d7ae472f74afcc47320238b5dbfd363aba111a525943c8a34a1b657c6be934c3"},
+    {file = "lxml-6.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5592401cdf3dc682194727c1ddaa8aa0f3ddc57ca64fd03226a430b955eab6f6"},
+    {file = "lxml-6.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:58ffd35bd5425c3c3b9692d078bf7ab851441434531a7e517c4984d5634cd65b"},
+    {file = "lxml-6.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f720a14aa102a38907c6d5030e3d66b3b680c3e6f6bc95473931ea3c00c59967"},
+    {file = "lxml-6.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2a5e8d207311a0170aca0eb6b160af91adc29ec121832e4ac151a57743a1e1e"},
+    {file = "lxml-6.0.0-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:2dd1cc3ea7e60bfb31ff32cafe07e24839df573a5e7c2d33304082a5019bcd58"},
+    {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cfcf84f1defed7e5798ef4f88aa25fcc52d279be731ce904789aa7ccfb7e8d2"},
+    {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a52a4704811e2623b0324a18d41ad4b9fabf43ce5ff99b14e40a520e2190c851"},
+    {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c16304bba98f48a28ae10e32a8e75c349dd742c45156f297e16eeb1ba9287a1f"},
+    {file = "lxml-6.0.0-cp311-cp311-win32.whl", hash = "sha256:f8d19565ae3eb956d84da3ef367aa7def14a2735d05bd275cd54c0301f0d0d6c"},
+    {file = "lxml-6.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b2d71cdefda9424adff9a3607ba5bbfc60ee972d73c21c7e3c19e71037574816"},
+    {file = "lxml-6.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:8a2e76efbf8772add72d002d67a4c3d0958638696f541734304c7f28217a9cab"},
+    {file = "lxml-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78718d8454a6e928470d511bf8ac93f469283a45c354995f7d19e77292f26108"},
+    {file = "lxml-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:84ef591495ffd3f9dcabffd6391db7bb70d7230b5c35ef5148354a134f56f2be"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2930aa001a3776c3e2601cb8e0a15d21b8270528d89cc308be4843ade546b9ab"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:219e0431ea8006e15005767f0351e3f7f9143e793e58519dc97fe9e07fae5563"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bd5913b4972681ffc9718bc2d4c53cde39ef81415e1671ff93e9aa30b46595e7"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:390240baeb9f415a82eefc2e13285016f9c8b5ad71ec80574ae8fa9605093cd7"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d6e200909a119626744dd81bae409fc44134389e03fbf1d68ed2a55a2fb10991"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ca50bd612438258a91b5b3788c6621c1f05c8c478e7951899f492be42defc0da"},
+    {file = "lxml-6.0.0-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:c24b8efd9c0f62bad0439283c2c795ef916c5a6b75f03c17799775c7ae3c0c9e"},
+    {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:afd27d8629ae94c5d863e32ab0e1d5590371d296b87dae0a751fb22bf3685741"},
+    {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:54c4855eabd9fc29707d30141be99e5cd1102e7d2258d2892314cf4c110726c3"},
+    {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c907516d49f77f6cd8ead1322198bdfd902003c3c330c77a1c5f3cc32a0e4d16"},
+    {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36531f81c8214e293097cd2b7873f178997dae33d3667caaae8bdfb9666b76c0"},
+    {file = "lxml-6.0.0-cp312-cp312-win32.whl", hash = "sha256:690b20e3388a7ec98e899fd54c924e50ba6693874aa65ef9cb53de7f7de9d64a"},
+    {file = "lxml-6.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:310b719b695b3dd442cdfbbe64936b2f2e231bb91d998e99e6f0daf991a3eba3"},
+    {file = "lxml-6.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:8cb26f51c82d77483cdcd2b4a53cda55bbee29b3c2f3ddeb47182a2a9064e4eb"},
+    {file = "lxml-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6da7cd4f405fd7db56e51e96bff0865b9853ae70df0e6720624049da76bde2da"},
+    {file = "lxml-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b34339898bb556a2351a1830f88f751679f343eabf9cf05841c95b165152c9e7"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:51a5e4c61a4541bd1cd3ba74766d0c9b6c12d6a1a4964ef60026832aac8e79b3"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d18a25b19ca7307045581b18b3ec9ead2b1db5ccd8719c291f0cd0a5cec6cb81"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d4f0c66df4386b75d2ab1e20a489f30dc7fd9a06a896d64980541506086be1f1"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f4b481b6cc3a897adb4279216695150bbe7a44c03daba3c894f49d2037e0a24"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a78d6c9168f5bcb20971bf3329c2b83078611fbe1f807baadc64afc70523b3a"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae06fbab4f1bb7db4f7c8ca9897dc8db4447d1a2b9bee78474ad403437bcc29"},
+    {file = "lxml-6.0.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:1fa377b827ca2023244a06554c6e7dc6828a10aaf74ca41965c5d8a4925aebb4"},
+    {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1676b56d48048a62ef77a250428d1f31f610763636e0784ba67a9740823988ca"},
+    {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:0e32698462aacc5c1cf6bdfebc9c781821b7e74c79f13e5ffc8bfe27c42b1abf"},
+    {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4d6036c3a296707357efb375cfc24bb64cd955b9ec731abf11ebb1e40063949f"},
+    {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7488a43033c958637b1a08cddc9188eb06d3ad36582cebc7d4815980b47e27ef"},
+    {file = "lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181"},
+    {file = "lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e"},
+    {file = "lxml-6.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03"},
+    {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261"},
+    {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316"},
+    {file = "lxml-6.0.0-cp39-cp39-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512"},
+    {file = "lxml-6.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f"},
+    {file = "lxml-6.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a"},
+    {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93"},
+    {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281"},
+    {file = "lxml-6.0.0-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922"},
+    {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0"},
+    {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a"},
+    {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5"},
+    {file = "lxml-6.0.0-cp39-cp39-win32.whl", hash = "sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256"},
+    {file = "lxml-6.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae"},
+    {file = "lxml-6.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402"},
+    {file = "lxml-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72"},
+    {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4"},
+    {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc"},
+    {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8"},
+    {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065"},
+    {file = "lxml-6.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141"},
+    {file = "lxml-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5"},
+    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06"},
+    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9"},
+    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca"},
+    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734"},
+    {file = "lxml-6.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349"},
+    {file = "lxml-6.0.0.tar.gz", hash = "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72"},
+]
+
 [[package]]
 name = "mako"
 version = "1.3.6"
@@ -1537,6 +1655,47 @@ files = [
     {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"},
 ]
 
+[[package]]
+name = "ocrmypdf"
+version = "15.4.4"
+summary = ""
+dependencies = [
+    "deprecation; python_full_version < \"3.10\"",
+    "img2pdf; python_full_version < \"3.10\"",
+    "packaging; python_full_version < \"3.10\"",
+    "pdfminer-six; python_full_version < \"3.10\"",
+    "pikepdf; python_full_version < \"3.10\"",
+    "pillow; python_full_version < \"3.10\"",
+    "pluggy; python_full_version < \"3.10\"",
+    "reportlab; python_full_version < \"3.10\"",
+    "rich; python_full_version < \"3.10\"",
+    "typing-extensions; python_full_version < \"3.10\"",
+]
+files = [
+    {file = "ocrmypdf-15.4.4-py39-none-any.whl", hash = "sha256:13fd388035b5f4bb673bff570cfc2cf72e51168646d5401de9e48ca355917c6d"},
+    {file = "ocrmypdf-15.4.4.tar.gz", hash = "sha256:4696c81cc5b5d64f31ccfe685d10baeb69b42bb0974acddf292d8cf9d97605c3"},
+]
+
+[[package]]
+name = "ocrmypdf"
+version = "16.10.4"
+summary = ""
+dependencies = [
+    "deprecation; python_full_version >= \"3.10\"",
+    "img2pdf; python_full_version >= \"3.10\"",
+    "packaging; python_full_version >= \"3.10\"",
+    "pdfminer-six; python_full_version >= \"3.10\"",
+    "pi-heif; python_full_version >= \"3.10\"",
+    "pikepdf; python_full_version >= \"3.10\"",
+    "pillow; python_full_version >= \"3.10\"",
+    "pluggy; python_full_version >= \"3.10\"",
+    "rich; python_full_version >= \"3.10\"",
+]
+files = [
+    {file = "ocrmypdf-16.10.4-py3-none-any.whl", hash = "sha256:061f3165d09ffafac975cea00803802b8a75551ada9965292ea86ea382673688"},
+    {file = "ocrmypdf-16.10.4.tar.gz", hash = "sha256:de749ef5f554b63d57e68d032e7cba5500cbd5030835bf24f658f7b7a04f3dc1"},
+]
+
 [[package]]
 name = "opensearch-py"
 version = "2.0.1"
@@ -1615,6 +1774,124 @@ files = [
     {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
 ]
 
+[[package]]
+name = "pdfminer-six"
+version = "20250506"
+summary = ""
+dependencies = [
+    "charset-normalizer",
+    "cryptography",
+]
+files = [
+    {file = "pdfminer_six-20250506-py3-none-any.whl", hash = "sha256:d81ad173f62e5f841b53a8ba63af1a4a355933cfc0ffabd608e568b9193909e3"},
+    {file = "pdfminer_six-20250506.tar.gz", hash = "sha256:b03cc8df09cf3c7aba8246deae52e0bca7ebb112a38895b5e1d4f5dd2b8ca2e7"},
+]
+
+[[package]]
+name = "pi-heif"
+version = "0.22.0"
+summary = ""
+dependencies = [
+    "pillow; python_full_version >= \"3.10\"",
+]
+files = [
+    {file = "pi_heif-0.22.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:fca84436339eee2c91ff09cd7e301cfa2a0f7a9d83d5bc6a9d1db8587221d239"},
+    {file = "pi_heif-0.22.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:46b0fcf876d85c8684d3bc1a0b7a4e4bc5673b72084807dc6bf85caa2da9173b"},
+    {file = "pi_heif-0.22.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85a8b09e28f3234a9a64796fc3ed71516b14a9ba08cad416ebd0db251e5f263"},
+    {file = "pi_heif-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21416131308fabaeadbd1eae4d4daf218443832409f91ea6571edb64a0dc8d1c"},
+    {file = "pi_heif-0.22.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d308f32ec557ec9f8cfee1225d83d391ffc72a1a8f03106a5805693c02359678"},
+    {file = "pi_heif-0.22.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:94359418200d7ed61f1910c5b3318fcaf0bb6e25c3e6361fbf986b320d4b7e80"},
+    {file = "pi_heif-0.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:0292a1c4b58a7bfeaad0e315ca713beee3051600cf2c100a0fa96fb32377c8fd"},
+    {file = "pi_heif-0.22.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:98dab5eb6bd70bdbe8ce021b4287c42ca779f6ee6d6f6fc91609d950e135d6dd"},
+    {file = "pi_heif-0.22.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ed1731ebece9dcaea50db251b891318ebfc6971161664cca1fd1367e75aa815f"},
+    {file = "pi_heif-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d92149bad299390a96f29dc584bc0020c88d36d3edf073f03a6ac6b595673f63"},
+    {file = "pi_heif-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd9f1688caa359ad9c6a66fc167fa41fa24dc0fa8ceed65be2c31563d42eb700"},
+    {file = "pi_heif-0.22.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6339784cd447664faa4705373b7f4d7bc9c4133bc0e0a1140516614cd047e9a8"},
+    {file = "pi_heif-0.22.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2c5cfa7b8610750751cd414f7e276093080b38e1728d721f5d315f03a9ebd25c"},
+    {file = "pi_heif-0.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:e739bfe4a1785e34b52eecf092d5c511b673f20f053c728472167fe3ddcbe202"},
+    {file = "pi_heif-0.22.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:fe7b539c1924973de96a58477dab29475ed8bfbc81cb4588db9655e3661710ba"},
+    {file = "pi_heif-0.22.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:322fd33c75ccf1208f08d07aea06c7582eed6e577a3400fe6efcbaab0c1677ff"},
+    {file = "pi_heif-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3965be305b4a5bbe4c7585f45feeab18ed18228e729a970e9b8a09b25434c885"},
+    {file = "pi_heif-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebd91145a1ab9229ce330e5a7cb8a95c875c16a1cb1f2b0b5ed86e61a9fb6bd4"},
+    {file = "pi_heif-0.22.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ed229d31a4e0037f0ba417a21f403fb8f965a40e3e5abaedafe717f6b710f544"},
+    {file = "pi_heif-0.22.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6d95b90d5b005c35839120e934bfa5746fdf88ba344d1e58a814a33e5e9f057c"},
+    {file = "pi_heif-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:943dee9b05c768acbc06662b327518b2a257dd08ced79dce7c11fab5ac2d5c4b"},
+    {file = "pi_heif-0.22.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:95dd7ec2cbcef6ef1110c6ba539fa7e1489a023589076ca8b3eebcb1e38d256c"},
+    {file = "pi_heif-0.22.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0e635dceb40424b5d88c7a2183d8dabb844c7776118df12f275ead2a10d275f6"},
+    {file = "pi_heif-0.22.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f668c27a564c7373a462c0484d49166084ec608b65f9d6763fef7a1c80eee8c0"},
+    {file = "pi_heif-0.22.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24ea5ba8cbd871ae09a856dbb9a7e6376ba70b5207085d0302f539574614b9e0"},
+    {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a89b57cd839b09ee749d12397d2027e20fe7a64a44883688ab44a873b16b507b"},
+    {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93acd60ef14e3ea835b7e3dafe284c07116349b0df05507520f10520c3ad09c1"},
+    {file = "pi_heif-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:6415b0005216ad08f86d0ef75ec24e13e60bf5f45273ab54a4a22f008b9f41ac"},
+    {file = "pi_heif-0.22.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:3f85ac3c0e2fb18af10e5b9789dcfd73f091b1d6ea2090d70d6e87f8744b8fe9"},
+    {file = "pi_heif-0.22.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2635cbcf35206dd3d7f6453df8a6a5cd6a83bcdc9818d999b7342837482d614e"},
+    {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:893a49c195563a9bbbef571daad995110b47e3e6b624b92269c281cf1b70b8da"},
+    {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b160a20dd6fa9d951a556006f02ec601a433ec4002953fdb67025f42e5fa89ea"},
+    {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e2508317837ad6da6b6e2ba154faab766a0cdc189a86dd45b4b7decd641bfa5"},
+    {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7a1666070cffce08027b4309fb7f270c0e3a4715a3e5a7a7202b05f65a849f2"},
+    {file = "pi_heif-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:c73e651cb17b7da3a740881c479e224084c95380df0d9d4f72d4858a422e80ae"},
+    {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:6b83ec2f6db2dd61e09940006ee0a854eb58d91a52023be057da13a08a9f0517"},
+    {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:f33211fa2afa756b13a63e21aeab577cdc7ddb18a929a012cbbcd3b7d8a772d0"},
+    {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a82bb03e5ab429b6aee5f1446c7c1925b1fb4fd58d74c960c7995734285db269"},
+    {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79d72744708949bd9028516d860bd2c341371bca13aa2196e4f2267263834608"},
+    {file = "pi_heif-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7bb583f93bb4c1dfaf3b6e689a9fa0de7c83182730c16ec8798c459cf8c3e8cf"},
+    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:052fffb0b65c51adf90993a696dd51dddc5f5707d5f40e7bd9f4ad958bb505d9"},
+    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:b326a48001a97906e5eb4110113d0cfe1203704f3572100dd177782568c9fc32"},
+    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8cc68012a870d5e39d8fd5468dfd1d452ca10388cab5fac30f90ddfa0772a3e"},
+    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:350c49ac597d1b8cdaa8a35f2c0901a3847067b9d0a9fdc07d2d6851e5d63382"},
+    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3de6fb5a58cf271897adc31e045db45003ae1e32116efc30fa20c72e1c90b2b"},
+    {file = "pi_heif-0.22.0.tar.gz", hash = "sha256:489ddda3c9fed948715a9c8642c6ee24c3b438a7fbf85b3a8f097d632d7082a8"},
+]
+
+[[package]]
+name = "pikepdf"
+version = "9.10.2"
+summary = ""
+dependencies = [
+    "deprecated",
+    "lxml",
+    "packaging",
+    "pillow",
+]
+files = [
+    {file = "pikepdf-9.10.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:c2b40697c8aa48316c1846195afb8f12a3adf242c31fb3e960f067b4e3f47256"},
+    {file = "pikepdf-9.10.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:975b2f2924617cae299f5cc219cd6a4d07576566fac4d28aa87a2c93024f9d74"},
+    {file = "pikepdf-9.10.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df5e66acc1f24c22cbf76089603045b9fab3e881e7bc3fd8d63630b395ee4865"},
+    {file = "pikepdf-9.10.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cb83e0296ea74b18bf5fec5860d16167e3cef0ce074a21bd93b73bdd60daf6e4"},
+    {file = "pikepdf-9.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5ea08e7df49e5e75b5f03d18ec901b77b202333393a01d88bfc73374cffd12a8"},
+    {file = "pikepdf-9.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ea12192f0cc3bc6fcfaedd0f98161a7f0ca8630cbf972d55d208fb56e7f57120"},
+    {file = "pikepdf-9.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:5aa2d4b8f28588cd4755211058ecb46941e0c73ec59ffd9744c59f1b924c6bd7"},
+    {file = "pikepdf-9.10.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:fa1cfcd725624910fc57c5b6305c5958cd28f1d40b1f9ad26723aba7caaae345"},
+    {file = "pikepdf-9.10.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1575cb082b4ea39913ed90b96ff55d12d40f21a322f06144ab531d097c03b58c"},
+    {file = "pikepdf-9.10.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a3e92458f2fc0a5e0a98a65a69534deac7a5fdf0791618afed6ca1a3623e972"},
+    {file = "pikepdf-9.10.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c245099f9187d3c636430b941d72fa9e639b1dbed2b8f291b95b561a315fca4"},
+    {file = "pikepdf-9.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c4bfe38e2dfa47f6c5e7e4ff166c6663b149c071e7b7c745595d3e3272cdc625"},
+    {file = "pikepdf-9.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f49f12fef155bf92174f57d21724507427ee20ec43b61460120b8f7870905028"},
+    {file = "pikepdf-9.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:58105543a2b671cc2ffb2d2da385e383d4731a19def86de656bd7da36755e444"},
+    {file = "pikepdf-9.10.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:3b14cacd1f0275654a7803af2611e933f5d57a98cba08aa9041792bb0f38c073"},
+    {file = "pikepdf-9.10.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:aaeee4676b99655c0f655404c1fca7ba483c5b4d96a790786dd4caa21e11ac18"},
+    {file = "pikepdf-9.10.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efee3a3cd8047e796508f56cefac4eb45d1173e81813dbeb3d8e9dd2e857de60"},
+    {file = "pikepdf-9.10.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83cb30d947fae647876d2dba3c0295c0e7aa75e915bf0ea2350c72a6b652b2fa"},
+    {file = "pikepdf-9.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cc498904eabec3f9d144f1c259080508b3c5809720ba8f142c3971b1525ebed8"},
+    {file = "pikepdf-9.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25fb3e0d15c2c3cd77735335d09ca968df693dd0f9c6f028e9c9ce7b0ac86b48"},
+    {file = "pikepdf-9.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:66819bd6edbca64fe2ec2020e85d339bee969aed051c2b7f256574da1a073ff6"},
+    {file = "pikepdf-9.10.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:a2ed7c8eabfe35b4ae2564b26cc6946b40c4efccfaa9acf91bac8e0cfc31a467"},
+    {file = "pikepdf-9.10.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:c7aec253420d69cbaf6228ade29ab1e2b501dd0d9561ea4c90f16c849ec5f9ea"},
+    {file = "pikepdf-9.10.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c6bc4851b2978198143908b9a0e845ecc6587904754436bf0ee488fd6ec4aba"},
+    {file = "pikepdf-9.10.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:860eec8cda5d7b6d168d6fd4a956d8101577d9ea4a585fafab3fc0b1bbaddea1"},
+    {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:048f3d5138c44f8c452d818e14130fa30d809f61d70063b6e615e91148342188"},
+    {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fff140da5a75b41b4cdf34354366620c206f31fc513356c70cf5da6b81d2483"},
+    {file = "pikepdf-9.10.2-cp313-cp313-win_amd64.whl", hash = "sha256:1b5af8e233ed232f02e31a281134eed94504c72e9de88326433e34641f04a113"},
+    {file = "pikepdf-9.10.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:ed7032dfe0f280e87908e025b22ecd49b230d2b753c4ef66d0f6ce2952f5e721"},
+    {file = "pikepdf-9.10.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:9d5f9fa9513e600752acdd81fd1b987b6bf85a36c25779bd9a7e0986626424d7"},
+    {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1899d0d9dd1ebdf13125159029a2c89afc66d87f0f3bcdbca9adbda6ad2bce15"},
+    {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77ec60c230f11797e94a0659523c579fd8d25969de9091b2d6c7799868cd60c3"},
+    {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ddc1cb0aba4f2fa0d95ed68460688e3efcd3a70973901faf5b8c85e81438bcf"},
+    {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a0ee549af6560be2c3f7b9c37b4c9c814bcd24249323b0525ba0b00a11988d90"},
+    {file = "pikepdf-9.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:f1d7417a1b49d77f13f9e9310e5d122a0e69d5e06afd21e06d12b0baa5cd9578"},
+    {file = "pikepdf-9.10.2.tar.gz", hash = "sha256:f62fc2183888f2ca1d271bf4faa440a2e2d0159221620a9c6a314f9c9a95680c"},
+]
+
 [[package]]
 name = "pillow"
 version = "11.0.0"
@@ -2289,6 +2566,19 @@ files = [
     {file = "redis-5.2.0.tar.gz", hash = "sha256:0b1087665a771b1ff2e003aa5bdd354f15a70c9e25d5a7dbf9c722c16528a7b0"},
 ]
 
+[[package]]
+name = "reportlab"
+version = "4.4.3"
+summary = ""
+dependencies = [
+    "charset-normalizer; python_full_version < \"3.10\"",
+    "pillow; python_full_version < \"3.10\"",
+]
+files = [
+    {file = "reportlab-4.4.3-py3-none-any.whl", hash = "sha256:df905dc5ec5ddaae91fc9cb3371af863311271d555236410954961c5ee6ee1b5"},
+    {file = "reportlab-4.4.3.tar.gz", hash = "sha256:073b0975dab69536acd3251858e6b0524ed3e087e71f1d0d1895acb50acf9c7b"},
+]
+
 [[package]]
 name = "requests"
 version = "2.32.3"
diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml
index 14c3e49c5..098bfdd57 100644
--- a/argilla-server/pyproject.toml
+++ b/argilla-server/pyproject.toml
@@ -66,6 +66,8 @@ dependencies = [
     "Jinja2>=3.1.4",           # Used by huggingface-hub to render dataset card templates
     # For file storage
     "minio>=7.2.7",
+    # For document processing
+    "ocrmypdf>=16.10.4"
 ]
 
 [project.optional-dependencies]
diff --git a/argilla-server/src/argilla_server/api/schemas/v1/segments.py b/argilla-server/src/argilla_server/api/schemas/v1/segments.py
new file mode 100644
index 000000000..f22da6284
--- /dev/null
+++ b/argilla-server/src/argilla_server/api/schemas/v1/segments.py
@@ -0,0 +1,122 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import uuid
+from typing import Optional, Any, List, Union
+
+from pydantic import BaseModel, Field, validator
+
+"""
+This is deprecated code that is outdated and should be used for reference only.
+We may want to switch to using LlamaIndexDocument or other document models in the future.
+"""
+
+
+class Segments(BaseModel):
+    items: List[Union["TextSegment", "TableSegment", "FigureSegment"]] = Field(
+        default_factory=list,
+        description="List of segments in the reading order of the document",
+    )
+
+    def get(self, id: str, header: Optional[str] = None, default=None):
+        """Return the first segment matching `id` or (when given) `header`; `default` if none matches."""
+        for item in self.items:
+            if item.id == id or (header and item.header == header):
+                return item
+        return default
+
+    def __repr_str__(self, join_str: str) -> str:
+        return "\n  " + f"{join_str}\n  ".join(f"{type(item).__name__}({item})" for item in self.items)
+
+    @validator("items", pre=True, each_item=True)
+    def parse_segments(cls, v):
+        if not isinstance(v, dict):
+            v = v.dict()
+        # "type" can be present but None (the segment models declare it Optional), so don't call .lower() on it.
+        segment_type = (v.get("type") or "").lower()
+        if segment_type in {"figure", "image"}:
+            return FigureSegment(**v)
+        elif segment_type == "table" or "html" in v:
+            return TableSegment(**v)
+        else:
+            return TextSegment(**v)
+
+    def __getitem__(self, index):
+        return self.items[index]
+
+    def __len__(self):
+        return len(self.items)
+
+
+class Coordinates(BaseModel):
+    points: List[List[float]] = Field(
+        ..., description="List of 4 points, e.g. [[x1, y1], [x2, y1], [x1, y2], [x2, y2]]"
+    )
+    layout_width: Optional[int] = Field(None, description="Width of the layout")
+    layout_height: Optional[int] = Field(None, description="Height of the layout")
+    system: Optional[str] = Field(None, description="System of coordinates")  # explicit None keeps it optional
+
+    def __repr_str__(self, join_str: str) -> str:
+        return ""
+
+
+class TextSegment(BaseModel):
+    id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()), description="Unique identifier of the segment", repr=False
+    )
+
+    header: Optional[str] = Field(
+        None,
+        description="Header of the element",
+    )
+    text: str = Field(..., description="Content as plain text", repr=False)
+    summary: Optional[str] = Field(None, description="Summary of the content")
+    page_number: Optional[int] = Field(None, description="Page number of the segment")
+    coordinates: Optional["Coordinates"] = Field(
+        None, description="Coordinates of the element in the document", repr=False
+    )
+    level: Optional[int] = Field(None, description="Level of the header")
+    source: Optional[str] = Field(None, description="Source of the element", repr=False)
+    type: Optional[str] = Field("text", description="Type of the element", repr=False)
+    original: Optional[Any] = Field(
+        None, exclude=True, description="Original object from which the segment was extracted", repr=False
+    )
+
+    def text_cleaned(self):
+        return self.text.replace(" | ", " ").replace("---", "").strip()
+
+    def __repr_str__(self, join_str: str) -> str:
+        return join_str.join(
+            repr(v)
+            if a is None
+            else (
+                f'{a}="{v[:100]}...{v[-100:]}"'.replace("\n", "")
+                if isinstance(v, str) and len(v) > 200
+                else f"{a}={v!r}"
+            )
+            for a, v in self.__repr_args__()
+            if v and a not in {"INCLUDE_METADATA_KEYS"}
+        )
+
+
+class TableSegment(TextSegment):
+    footer: Optional[str] = Field(None, description="Footer of the table or figure, to explain variable acronyms.")
+    html: Optional[str] = Field(None, description="Content as HTML structured", repr=False)
+    image: Optional[str] = Field(None, description="URL/filepath of the element's image", repr=False)
+    probability: Optional[float] = Field(None, description="Probability or confidence of the segment's extraction")
+    type: Optional[str] = Field("table", description="Type of the element", repr=False)
+
+
+class FigureSegment(TableSegment):
+    type: Optional[str] = Field("figure", description="Type of the element", repr=False)
diff --git a/argilla-server/src/argilla_server/contexts/document/figures.py b/argilla-server/src/argilla_server/contexts/document/figures.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/argilla-server/src/argilla_server/contexts/document/figures.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
new file mode 100644
index 000000000..6b28a8ebb
--- /dev/null
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -0,0 +1,158 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Document preprocessing utilities."""
+
+import logging
+import os
+import tempfile
+import time
+from io import BytesIO
+from uuid import uuid4
+
+try:
+    import ocrmypdf
+
+    OCRMYPDF_AVAILABLE = True
+except ImportError:
+    OCRMYPDF_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+
+def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes:
+    """
+    Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation.
+    Tries in-memory BytesIO buffers first and falls back to temporary files if that raises.
+
+    Args:
+        file_data: PDF file data as bytes
+        filename: Original filename; used for logging and for the .pdf extension check
+
+    Returns:
+        Processed PDF bytes; the original bytes if OCRmyPDF is missing, the name is not *.pdf, or processing fails
+    """
+    if not OCRMYPDF_AVAILABLE:
+        logger.warning("OCRmyPDF not available, skipping preprocessing")
+        return file_data
+
+    # Only process files whose name ends in ".pdf" (case-insensitive); the content itself is not inspected
+    if not filename.lower().endswith(".pdf"):
+        logger.debug(f"Skipping OCRmyPDF for non-PDF file: {filename}")
+        return file_data
+
+    try:
+        logger.info(f"Starting OCRmyPDF preprocessing for: {filename}")
+        start_time = time.time()
+
+        # Try using BytesIO objects first to minimize disk I/O
+        try:
+            input_buffer = BytesIO(file_data)
+            output_buffer = BytesIO()
+
+            # OCRmyPDF configuration for optimal processing
+            ocrmypdf.ocr(
+                input_buffer,
+                output_buffer,
+                language=["eng"],  # English only (hard-coded here)
+                rotate_pages=True,  # Auto-rotate pages based on detected text orientation
+                deskew=True,  # Fix skewed text
+                clean=True,  # Clean up artifacts
+                optimize=1,  # Optimize output file size
+                pdf_renderer="hocr",  # Use hOCR for better text positioning
+                force_ocr=False,  # Don't force OCR on pages that already have text
+                skip_text=False,  # NOTE(review): all 3 modes off => PDFs with existing text likely error out
+                redo_ocr=False,  # Don't redo existing OCR
+                progress_bar=False,
+                quiet=True,
+            )
+
+            # Get processed PDF data
+            processed_data = output_buffer.getvalue()
+            output_buffer.close()
+            input_buffer.close()
+
+        except Exception as buffer_error:
+            # Any exception above (not only buffer-related ones) triggers a second full run via temp files
+            logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
+            processed_data = _preprocess_pdf_with_temp_files(file_data, filename)
+
+        processing_time = time.time() - start_time
+        logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds")
+
+        return processed_data
+
+    except Exception as e:
+        logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}")
+        return file_data
+
+
+def _preprocess_pdf_with_temp_files(file_data: bytes, filename: str) -> bytes:
+    """
+    Run OCRmyPDF through on-disk temporary files and return the processed bytes; errors propagate to the caller.
+    """
+    input_temp_file = None
+    output_temp_file = None
+
+    try:
+        # Generate unique identifiers to avoid filename collisions in concurrent jobs
+        unique_id = str(uuid4())
+        temp_dir = tempfile.gettempdir()
+
+        # Input temp file; delete=False keeps it on disk after close() so it can be passed by path
+        input_temp_file = tempfile.NamedTemporaryFile(
+            suffix=".pdf", prefix=f"ocr_input_{unique_id}_", dir=temp_dir, delete=False
+        )
+        input_temp_file.write(file_data)
+        input_temp_file.flush()
+        input_temp_file.close()
+
+        # Empty output temp file, closed right away; only its path is handed to OCRmyPDF
+        output_temp_file = tempfile.NamedTemporaryFile(
+            suffix=".pdf", prefix=f"ocr_output_{unique_id}_", dir=temp_dir, delete=False
+        )
+        output_temp_file.close()
+
+        # OCRmyPDF configuration for optimal processing
+        ocrmypdf.ocr(
+            input_temp_file.name,
+            output_temp_file.name,
+            language=["eng"],  # English only (hard-coded here)
+            rotate_pages=True,  # Auto-rotate pages based on detected text orientation
+            deskew=True,  # Fix skewed text
+            clean=True,  # Clean up artifacts
+            optimize=1,  # Optimize output file size
+            pdf_renderer="hocr",  # Use hOCR for better text positioning
+            force_ocr=False,  # Don't force OCR on pages that already have text
+            skip_text=False,  # NOTE(review): all 3 modes off => PDFs with existing text likely error out
+            redo_ocr=False,  # Don't redo existing OCR
+            progress_bar=False,
+            quiet=True,
+        )
+
+        # Read processed PDF data
+        with open(output_temp_file.name, "rb") as f:
+            processed_data = f.read()
+
+        return processed_data
+
+    finally:
+        # Always remove both temp files (they were created with delete=False), on success and on error
+        for temp_file in [input_temp_file, output_temp_file]:
+            if temp_file is not None:
+                try:
+                    if hasattr(temp_file, "name"):
+                        os.unlink(temp_file.name)
+                except OSError as e:
+                    logger.warning(f"Failed to clean up temp file: {e}")
diff --git a/argilla-server/src/argilla_server/contexts/document/tables.py b/argilla-server/src/argilla_server/contexts/document/tables.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/argilla-server/src/argilla_server/contexts/document/tables.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/argilla-server/src/argilla_server/jobs/document_jobs.py b/argilla-server/src/argilla_server/jobs/document_jobs.py
index 3e2fbfa0d..8f3f99e90 100644
--- a/argilla-server/src/argilla_server/jobs/document_jobs.py
+++ b/argilla-server/src/argilla_server/jobs/document_jobs.py
@@ -26,6 +26,7 @@
 from argilla_server.jobs import DEFAULT_QUEUE, JOB_TIMEOUT_DISABLED
 from argilla_server.api.schemas.v1.documents import DocumentCreate
 from argilla_server.contexts import files, imports
+from argilla_server.contexts.document import preprocessing
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -129,11 +130,16 @@ async def upload_reference_documents_job(
                         continue
 
                     try:
+                        # Preprocess PDF files with OCRmyPDF for rotation and OCR
+                        processed_file_data = preprocessing.preprocess_pdf_with_ocrmypdf(
+                            file_data=file_data, filename=filename
+                        )
+
                         file_url = files.put_document_file(
                             client=client,
                             workspace_name=workspace.name,
                             document_id=file_document_create.id,  # type: ignore
-                            file_data=file_data,
+                            file_data=processed_file_data,
                             filename=filename,
                             # metadata=file_document_create.model_dump(
                             #     include={"file_name": True, "pmid": True, "doi": True}

From b4e15e617f7540a255bdbd1d944bdb6958533e0e Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Mon, 4 Aug 2025 23:36:30 -0700
Subject: [PATCH 02/22] refactor: enhance PDF preprocessing with configurable
 settings and integrate OCRmyPDF

---
 argilla-server/pdm.lock                       | 468 +-----------------
 argilla-server/pyproject.toml                 |   5 +-
 .../contexts/document/preprocessing.py        | 288 +++++++----
 .../src/argilla_server/jobs/document_jobs.py  |   8 +-
 4 files changed, 214 insertions(+), 555 deletions(-)

diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock
index c40a808db..8403ffc7e 100644
--- a/argilla-server/pdm.lock
+++ b/argilla-server/pdm.lock
@@ -5,10 +5,10 @@
 groups = ["default", "postgresql", "test"]
 strategy = []
 lock_version = "4.5.0"
-content_hash = "sha256:21af2a3f2dff0688e08aed10184370c7c0aed450f814cf0d3509e964d9582654"
+content_hash = "sha256:f20406357dc5b02a37c1da8689053074229d1204cb9bcb4fe8848b6d2835b1b4"
 
 [[metadata.targets]]
-requires_python = ">=3.9"
+requires_python = ">=3.10"
 
 [[package]]
 name = "aiofiles"
@@ -102,21 +102,6 @@ files = [
     {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"},
     {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"},
     {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"},
-    {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"},
-    {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"},
-    {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"},
-    {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"},
-    {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"},
-    {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"},
-    {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"},
-    {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"},
-    {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"},
-    {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"},
-    {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"},
-    {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"},
-    {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"},
-    {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"},
-    {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"},
     {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"},
 ]
 
@@ -264,14 +249,6 @@ files = [
     {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba"},
     {file = "asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590"},
     {file = "asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e"},
-    {file = "asyncpg-0.30.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad"},
-    {file = "asyncpg-0.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff"},
-    {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708"},
-    {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144"},
-    {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb"},
-    {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547"},
-    {file = "asyncpg-0.30.0-cp39-cp39-win32.whl", hash = "sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a"},
-    {file = "asyncpg-0.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773"},
     {file = "asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851"},
 ]
 
@@ -322,8 +299,6 @@ files = [
     {file = "bcrypt-4.2.0-cp39-abi3-win_amd64.whl", hash = "sha256:61ed14326ee023917ecd093ee6ef422a72f3aec6f07e21ea5f10622b735538a9"},
     {file = "bcrypt-4.2.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:39e1d30c7233cfc54f5c3f2c825156fe044efdd3e0b9d309512cc514a263ec2a"},
     {file = "bcrypt-4.2.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f4f4acf526fcd1c34e7ce851147deedd4e26e6402369304220250598b26448db"},
-    {file = "bcrypt-4.2.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:1ff39b78a52cf03fdf902635e4c81e544714861ba3f0efc56558979dd4f09170"},
-    {file = "bcrypt-4.2.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:373db9abe198e8e2c70d12b479464e0d5092cc122b20ec504097b5f2297ed184"},
     {file = "bcrypt-4.2.0.tar.gz", hash = "sha256:cf69eaf5185fd58f268f805b505ce31f9b9fc2d64b376642164e9244540c1221"},
 ]
 
@@ -394,22 +369,6 @@ files = [
     {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"},
     {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"},
     {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"},
-    {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"},
-    {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7905193081db9bfa73b1219140b3d315831cbff0d8941f22da695832f0dd188f"},
-    {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a77def80806c421b4b0af06f45d65a136e7ac0bdca3c09d9e2ea4e515367c7e9"},
-    {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dadd1314583ec0bf2d1379f7008ad627cd6336625d6679cf2f8e67081b83acf"},
-    {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:901032ff242d479a0efa956d853d16875d42157f98951c0230f69e69f9c09bac"},
-    {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:22fc2a8549ffe699bfba2256ab2ed0421a7b8fadff114a3d201794e45a9ff578"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ae15b066e5ad21366600ebec29a7ccbc86812ed267e4b28e860b8ca16a2bc474"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"},
-    {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"},
-    {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"},
-    {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"},
     {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"},
 ]
 
@@ -489,18 +448,6 @@ files = [
     {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"},
     {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"},
     {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"},
-    {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"},
-    {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"},
-    {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"},
-    {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"},
     {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
 ]
 
@@ -569,21 +516,6 @@ files = [
     {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"},
     {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"},
     {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"},
-    {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"},
     {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"},
     {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"},
 ]
@@ -664,16 +596,6 @@ files = [
     {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc8ff50b50ce532de2fa7a7daae9dd12f0a699bfcd47f20945364e5c31799fef"},
     {file = "coverage-7.6.4-cp313-cp313t-win32.whl", hash = "sha256:b8d3a03d9bfcaf5b0141d07a88456bb6a4c3ce55c080712fec8418ef3610230e"},
     {file = "coverage-7.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:f3ddf056d3ebcf6ce47bdaf56142af51bb7fad09e4af310241e9db7a3a8022e1"},
-    {file = "coverage-7.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cb7fa111d21a6b55cbf633039f7bc2749e74932e3aa7cb7333f675a58a58bf3"},
-    {file = "coverage-7.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11a223a14e91a4693d2d0755c7a043db43d96a7450b4f356d506c2562c48642c"},
-    {file = "coverage-7.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a413a096c4cbac202433c850ee43fa326d2e871b24554da8327b01632673a076"},
-    {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00a1d69c112ff5149cabe60d2e2ee948752c975d95f1e1096742e6077affd376"},
-    {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f76846299ba5c54d12c91d776d9605ae33f8ae2b9d1d3c3703cf2db1a67f2c0"},
-    {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe439416eb6380de434886b00c859304338f8b19f6f54811984f3420a2e03858"},
-    {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0294ca37f1ba500667b1aef631e48d875ced93ad5e06fa665a3295bdd1d95111"},
-    {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f01ba56b1c0e9d149f9ac85a2f999724895229eb36bd997b61e62999e9b0901"},
-    {file = "coverage-7.6.4-cp39-cp39-win32.whl", hash = "sha256:bc66f0bf1d7730a17430a50163bb264ba9ded56739112368ba985ddaa9c3bd09"},
-    {file = "coverage-7.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:c481b47f6b5845064c65a7bc78bc0860e635a9b055af0df46fdf1c58cebf8e8f"},
     {file = "coverage-7.6.4-pp39.pp310-none-any.whl", hash = "sha256:3c65d37f3a9ebb703e710befdc489a38683a5b152242664b973a7b7b22348a4e"},
     {file = "coverage-7.6.4.tar.gz", hash = "sha256:29fc0f17b1d3fea332f8001d4558f8214af7f1d87a345f3a133c901d60347c73"},
 ]
@@ -738,16 +660,6 @@ files = [
     {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc8ff50b50ce532de2fa7a7daae9dd12f0a699bfcd47f20945364e5c31799fef"},
     {file = "coverage-7.6.4-cp313-cp313t-win32.whl", hash = "sha256:b8d3a03d9bfcaf5b0141d07a88456bb6a4c3ce55c080712fec8418ef3610230e"},
     {file = "coverage-7.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:f3ddf056d3ebcf6ce47bdaf56142af51bb7fad09e4af310241e9db7a3a8022e1"},
-    {file = "coverage-7.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cb7fa111d21a6b55cbf633039f7bc2749e74932e3aa7cb7333f675a58a58bf3"},
-    {file = "coverage-7.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11a223a14e91a4693d2d0755c7a043db43d96a7450b4f356d506c2562c48642c"},
-    {file = "coverage-7.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a413a096c4cbac202433c850ee43fa326d2e871b24554da8327b01632673a076"},
-    {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00a1d69c112ff5149cabe60d2e2ee948752c975d95f1e1096742e6077affd376"},
-    {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f76846299ba5c54d12c91d776d9605ae33f8ae2b9d1d3c3703cf2db1a67f2c0"},
-    {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe439416eb6380de434886b00c859304338f8b19f6f54811984f3420a2e03858"},
-    {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0294ca37f1ba500667b1aef631e48d875ced93ad5e06fa665a3295bdd1d95111"},
-    {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f01ba56b1c0e9d149f9ac85a2f999724895229eb36bd997b61e62999e9b0901"},
-    {file = "coverage-7.6.4-cp39-cp39-win32.whl", hash = "sha256:bc66f0bf1d7730a17430a50163bb264ba9ded56739112368ba985ddaa9c3bd09"},
-    {file = "coverage-7.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:c481b47f6b5845064c65a7bc78bc0860e635a9b055af0df46fdf1c58cebf8e8f"},
     {file = "coverage-7.6.4-pp39.pp310-none-any.whl", hash = "sha256:3c65d37f3a9ebb703e710befdc489a38683a5b152242664b973a7b7b22348a4e"},
     {file = "coverage-7.6.4.tar.gz", hash = "sha256:29fc0f17b1d3fea332f8001d4558f8214af7f1d87a345f3a133c901d60347c73"},
 ]
@@ -782,10 +694,6 @@ files = [
     {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a2a431ee15799d6db9fe80c82b055bae5a752bef645bba795e8e52687c69efe3"},
     {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:281c945d0e28c92ca5e5930664c1cefd85efe80e5c0d2bc58dd63383fda29f83"},
     {file = "cryptography-43.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f18c716be16bc1fea8e95def49edf46b82fccaa88587a45f8dc0ff6ab5d8e0a7"},
-    {file = "cryptography-43.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:4a02ded6cd4f0a5562a8887df8b3bd14e822a90f97ac5e544c162899bc467664"},
-    {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:53a583b6637ab4c4e3591a15bc9db855b8d9dee9a669b550f311480acab6eb08"},
-    {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1ec0bcf7e17c0c5669d881b1cd38c4972fade441b27bda1051665faaa89bdcaa"},
-    {file = "cryptography-43.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ce6fae5bdad59577b44e4dfed356944fbf1d925269114c28be377692643b4ff"},
     {file = "cryptography-43.0.3.tar.gz", hash = "sha256:315b9001266a492a6ff443b61238f956b214dbec9910a081ba5b6646a055a805"},
 ]
 
@@ -1029,21 +937,6 @@ files = [
     {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"},
     {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"},
     {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"},
-    {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"},
-    {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"},
-    {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"},
-    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"},
-    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"},
-    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"},
-    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"},
-    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"},
-    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"},
-    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"},
-    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"},
-    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"},
-    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"},
-    {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"},
-    {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"},
     {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"},
     {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"},
 ]
@@ -1119,16 +1012,6 @@ files = [
     {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"},
     {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"},
     {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"},
-    {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"},
-    {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"},
-    {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"},
-    {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"},
-    {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"},
-    {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"},
-    {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"},
-    {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"},
-    {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"},
-    {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"},
     {file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"},
 ]
 
@@ -1187,13 +1070,6 @@ files = [
     {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5"},
     {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0"},
     {file = "httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8"},
-    {file = "httptools-0.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003"},
-    {file = "httptools-0.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab"},
-    {file = "httptools-0.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547"},
-    {file = "httptools-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9"},
-    {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076"},
-    {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd"},
-    {file = "httptools-0.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6"},
     {file = "httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c"},
 ]
 
@@ -1252,18 +1128,6 @@ files = [
     {file = "img2pdf-0.6.1.tar.gz", hash = "sha256:306e279eb832bc159d7d6294b697a9fbd11b4be1f799b14b3b2174fb506af289"},
 ]
 
-[[package]]
-name = "importlib-metadata"
-version = "8.5.0"
-summary = ""
-dependencies = [
-    "zipp; python_full_version < \"3.10\"",
-]
-files = [
-    {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"},
-    {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"},
-]
-
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -1350,32 +1214,12 @@ files = [
     {file = "lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181"},
     {file = "lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e"},
     {file = "lxml-6.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03"},
-    {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261"},
-    {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316"},
-    {file = "lxml-6.0.0-cp39-cp39-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512"},
-    {file = "lxml-6.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f"},
-    {file = "lxml-6.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a"},
-    {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93"},
-    {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281"},
-    {file = "lxml-6.0.0-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922"},
-    {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0"},
-    {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a"},
-    {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5"},
-    {file = "lxml-6.0.0-cp39-cp39-win32.whl", hash = "sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256"},
-    {file = "lxml-6.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae"},
-    {file = "lxml-6.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402"},
     {file = "lxml-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72"},
     {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4"},
     {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc"},
     {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8"},
     {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065"},
     {file = "lxml-6.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141"},
-    {file = "lxml-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5"},
-    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06"},
-    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9"},
-    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca"},
-    {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734"},
-    {file = "lxml-6.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349"},
     {file = "lxml-6.0.0.tar.gz", hash = "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72"},
 ]
 
@@ -1458,16 +1302,6 @@ files = [
     {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"},
     {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"},
     {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"},
-    {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"},
     {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"},
 ]
 
@@ -1564,21 +1398,6 @@ files = [
     {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"},
     {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"},
     {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"},
-    {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"},
-    {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"},
-    {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"},
-    {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"},
-    {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"},
-    {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"},
-    {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"},
-    {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"},
-    {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"},
-    {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"},
-    {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"},
-    {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"},
-    {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"},
-    {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"},
-    {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"},
     {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"},
     {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"},
 ]
@@ -1593,8 +1412,6 @@ dependencies = [
 files = [
     {file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"},
     {file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"},
-    {file = "multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41"},
-    {file = "multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a"},
     {file = "multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02"},
     {file = "multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a"},
     {file = "multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e"},
@@ -1632,17 +1449,6 @@ files = [
     {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"},
     {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"},
     {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"},
-    {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"},
-    {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"},
-    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"},
-    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"},
-    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"},
-    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"},
-    {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"},
-    {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"},
-    {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"},
-    {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"},
-    {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"},
     {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]
 
@@ -1655,41 +1461,20 @@ files = [
     {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"},
 ]
 
-[[package]]
-name = "ocrmypdf"
-version = "15.4.4"
-summary = ""
-dependencies = [
-    "deprecation; python_full_version < \"3.10\"",
-    "img2pdf; python_full_version < \"3.10\"",
-    "packaging; python_full_version < \"3.10\"",
-    "pdfminer-six; python_full_version < \"3.10\"",
-    "pikepdf; python_full_version < \"3.10\"",
-    "pillow; python_full_version < \"3.10\"",
-    "pluggy; python_full_version < \"3.10\"",
-    "reportlab; python_full_version < \"3.10\"",
-    "rich; python_full_version < \"3.10\"",
-    "typing-extensions; python_full_version < \"3.10\"",
-]
-files = [
-    {file = "ocrmypdf-15.4.4-py39-none-any.whl", hash = "sha256:13fd388035b5f4bb673bff570cfc2cf72e51168646d5401de9e48ca355917c6d"},
-    {file = "ocrmypdf-15.4.4.tar.gz", hash = "sha256:4696c81cc5b5d64f31ccfe685d10baeb69b42bb0974acddf292d8cf9d97605c3"},
-]
-
 [[package]]
 name = "ocrmypdf"
 version = "16.10.4"
 summary = ""
 dependencies = [
-    "deprecation; python_full_version >= \"3.10\"",
-    "img2pdf; python_full_version >= \"3.10\"",
-    "packaging; python_full_version >= \"3.10\"",
-    "pdfminer-six; python_full_version >= \"3.10\"",
-    "pi-heif; python_full_version >= \"3.10\"",
-    "pikepdf; python_full_version >= \"3.10\"",
-    "pillow; python_full_version >= \"3.10\"",
-    "pluggy; python_full_version >= \"3.10\"",
-    "rich; python_full_version >= \"3.10\"",
+    "deprecation",
+    "img2pdf",
+    "packaging",
+    "pdfminer-six",
+    "pi-heif",
+    "pikepdf",
+    "pillow",
+    "pluggy",
+    "rich",
 ]
 files = [
     {file = "ocrmypdf-16.10.4-py3-none-any.whl", hash = "sha256:061f3165d09ffafac975cea00803802b8a75551ada9965292ea86ea382673688"},
@@ -1764,16 +1549,21 @@ files = [
     {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"},
     {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"},
     {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"},
-    {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"},
-    {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"},
-    {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"},
-    {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"},
-    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"},
-    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"},
-    {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"},
     {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
 ]
 
+[[package]]
+name = "pdf2image"
+version = "1.17.0"
+summary = ""
+dependencies = [
+    "pillow",
+]
+files = [
+    {file = "pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2"},
+    {file = "pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57"},
+]
+
 [[package]]
 name = "pdfminer-six"
 version = "20250506"
@@ -1792,7 +1582,7 @@ name = "pi-heif"
 version = "0.22.0"
 summary = ""
 dependencies = [
-    "pillow; python_full_version >= \"3.10\"",
+    "pillow",
 ]
 files = [
     {file = "pi_heif-0.22.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:fca84436339eee2c91ff09cd7e301cfa2a0f7a9d83d5bc6a9d1db8587221d239"},
@@ -1823,23 +1613,11 @@ files = [
     {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a89b57cd839b09ee749d12397d2027e20fe7a64a44883688ab44a873b16b507b"},
     {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93acd60ef14e3ea835b7e3dafe284c07116349b0df05507520f10520c3ad09c1"},
     {file = "pi_heif-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:6415b0005216ad08f86d0ef75ec24e13e60bf5f45273ab54a4a22f008b9f41ac"},
-    {file = "pi_heif-0.22.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:3f85ac3c0e2fb18af10e5b9789dcfd73f091b1d6ea2090d70d6e87f8744b8fe9"},
-    {file = "pi_heif-0.22.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2635cbcf35206dd3d7f6453df8a6a5cd6a83bcdc9818d999b7342837482d614e"},
-    {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:893a49c195563a9bbbef571daad995110b47e3e6b624b92269c281cf1b70b8da"},
-    {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b160a20dd6fa9d951a556006f02ec601a433ec4002953fdb67025f42e5fa89ea"},
-    {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e2508317837ad6da6b6e2ba154faab766a0cdc189a86dd45b4b7decd641bfa5"},
-    {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7a1666070cffce08027b4309fb7f270c0e3a4715a3e5a7a7202b05f65a849f2"},
-    {file = "pi_heif-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:c73e651cb17b7da3a740881c479e224084c95380df0d9d4f72d4858a422e80ae"},
     {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:6b83ec2f6db2dd61e09940006ee0a854eb58d91a52023be057da13a08a9f0517"},
     {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:f33211fa2afa756b13a63e21aeab577cdc7ddb18a929a012cbbcd3b7d8a772d0"},
     {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a82bb03e5ab429b6aee5f1446c7c1925b1fb4fd58d74c960c7995734285db269"},
     {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79d72744708949bd9028516d860bd2c341371bca13aa2196e4f2267263834608"},
     {file = "pi_heif-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7bb583f93bb4c1dfaf3b6e689a9fa0de7c83182730c16ec8798c459cf8c3e8cf"},
-    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:052fffb0b65c51adf90993a696dd51dddc5f5707d5f40e7bd9f4ad958bb505d9"},
-    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:b326a48001a97906e5eb4110113d0cfe1203704f3572100dd177782568c9fc32"},
-    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8cc68012a870d5e39d8fd5468dfd1d452ca10388cab5fac30f90ddfa0772a3e"},
-    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:350c49ac597d1b8cdaa8a35f2c0901a3847067b9d0a9fdc07d2d6851e5d63382"},
-    {file = "pi_heif-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3de6fb5a58cf271897adc31e045db45003ae1e32116efc30fa20c72e1c90b2b"},
     {file = "pi_heif-0.22.0.tar.gz", hash = "sha256:489ddda3c9fed948715a9c8642c6ee24c3b438a7fbf85b3a8f097d632d7082a8"},
 ]
 
@@ -1882,13 +1660,6 @@ files = [
     {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:048f3d5138c44f8c452d818e14130fa30d809f61d70063b6e615e91148342188"},
     {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fff140da5a75b41b4cdf34354366620c206f31fc513356c70cf5da6b81d2483"},
     {file = "pikepdf-9.10.2-cp313-cp313-win_amd64.whl", hash = "sha256:1b5af8e233ed232f02e31a281134eed94504c72e9de88326433e34641f04a113"},
-    {file = "pikepdf-9.10.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:ed7032dfe0f280e87908e025b22ecd49b230d2b753c4ef66d0f6ce2952f5e721"},
-    {file = "pikepdf-9.10.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:9d5f9fa9513e600752acdd81fd1b987b6bf85a36c25779bd9a7e0986626424d7"},
-    {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1899d0d9dd1ebdf13125159029a2c89afc66d87f0f3bcdbca9adbda6ad2bce15"},
-    {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77ec60c230f11797e94a0659523c579fd8d25969de9091b2d6c7799868cd60c3"},
-    {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ddc1cb0aba4f2fa0d95ed68460688e3efcd3a70973901faf5b8c85e81438bcf"},
-    {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a0ee549af6560be2c3f7b9c37b4c9c814bcd24249323b0525ba0b00a11988d90"},
-    {file = "pikepdf-9.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:f1d7417a1b49d77f13f9e9310e5d122a0e69d5e06afd21e06d12b0baa5cd9578"},
     {file = "pikepdf-9.10.2.tar.gz", hash = "sha256:f62fc2183888f2ca1d271bf4faa440a2e2d0159221620a9c6a314f9c9a95680c"},
 ]
 
@@ -1949,17 +1720,6 @@ files = [
     {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"},
     {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"},
     {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"},
-    {file = "pillow-11.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2e46773dc9f35a1dd28bd6981332fd7f27bec001a918a72a79b4133cf5291dba"},
-    {file = "pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2679d2258b7f1192b378e2893a8a0a0ca472234d4c2c0e6bdd3380e8dfa21b6a"},
-    {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda2616eb2313cbb3eebbe51f19362eb434b18e3bb599466a1ffa76a033fb916"},
-    {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ec184af98a121fb2da42642dea8a29ec80fc3efbaefb86d8fdd2606619045d"},
-    {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:8594f42df584e5b4bb9281799698403f7af489fba84c34d53d1c4bfb71b7c4e7"},
-    {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:c12b5ae868897c7338519c03049a806af85b9b8c237b7d675b8c5e089e4a618e"},
-    {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:70fbbdacd1d271b77b7721fe3cdd2d537bbbd75d29e6300c672ec6bb38d9672f"},
-    {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5178952973e588b3f1360868847334e9e3bf49d19e169bbbdfaf8398002419ae"},
-    {file = "pillow-11.0.0-cp39-cp39-win32.whl", hash = "sha256:8c676b587da5673d3c75bd67dd2a8cdfeb282ca38a30f37950511766b26858c4"},
-    {file = "pillow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:94f3e1780abb45062287b4614a5bc0874519c86a777d4a7ad34978e86428b8dd"},
-    {file = "pillow-11.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:290f2cc809f9da7d6d622550bbf4c1e57518212da51b6a30fe8e0a270a5b78bd"},
     {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"},
     {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"},
     {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"},
@@ -1967,10 +1727,6 @@ files = [
     {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"},
     {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"},
     {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"},
-    {file = "pillow-11.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5bd2d3bdb846d757055910f0a59792d33b555800813c3b39ada1829c372ccb06"},
-    {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:375b8dd15a1f5d2feafff536d47e22f69625c1aa92f12b339ec0b2ca40263273"},
-    {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:daffdf51ee5db69a82dd127eabecce20729e21f7a3680cf7cbb23f0829189790"},
-    {file = "pillow-11.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7326a1787e3c7b0429659e0a944725e1b03eeaa10edd945a86dead1913383944"},
     {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"},
 ]
 
@@ -2052,22 +1808,6 @@ files = [
     {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"},
     {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"},
     {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"},
-    {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"},
-    {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"},
-    {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"},
-    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"},
-    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"},
-    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"},
-    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"},
-    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"},
-    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"},
-    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"},
-    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"},
-    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"},
-    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"},
-    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"},
-    {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"},
-    {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"},
     {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"},
     {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"},
 ]
@@ -2098,8 +1838,6 @@ files = [
     {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"},
     {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"},
     {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"},
-    {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"},
-    {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"},
     {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"},
 ]
 
@@ -2142,13 +1880,6 @@ files = [
     {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a"},
     {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79"},
     {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420"},
-    {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8"},
-    {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9"},
-    {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8"},
-    {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d"},
-    {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55"},
-    {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03"},
-    {file = "pyarrow-18.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2"},
     {file = "pyarrow-18.0.0.tar.gz", hash = "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5"},
 ]
 
@@ -2202,11 +1933,6 @@ files = [
     {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e15c081e912c4b0d75632acd8382dfce45b258667aa3c67caf7a4d4c13f630"},
     {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7fc76bf273353dc7e5207d172b83f569540fc9a28d63171061c42e361d22353"},
     {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:45c69ad715ca1a94f778215a11e66b7ff989d792a4d63b68dc586a1da1392ff5"},
-    {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:865d83c906b0fc6a59b510deceee656b6bc1c4fa0d82176e2b77e97a420a996a"},
-    {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89d4d56153efc4d81defe8b65fd0821ef8b2d5ddf8ed19df31ba2f00872b8002"},
-    {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3f2d0aaf8080bda0587d58fc9fe4766e012441e2eed4269a77de6aea981c8be"},
-    {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64093fc334c1eccfd3933c134c4457c34eaca235eeae49d69449dc4728079339"},
-    {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ce64e84a962b63a47a592690bdc16a7eaf709d2c2697ababf24a0def566899a6"},
     {file = "pycryptodome-3.23.0.tar.gz", hash = "sha256:447700a657182d60338bab09fdb27518f8856aecd80ae4c6bdddb67ff5da44ef"},
 ]
 
@@ -2280,18 +2006,6 @@ files = [
     {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"},
     {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"},
     {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"},
-    {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"},
-    {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"},
     {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"},
     {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"},
     {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"},
@@ -2300,14 +2014,6 @@ files = [
     {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"},
     {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"},
     {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"},
     {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"},
 ]
 
@@ -2414,7 +2120,6 @@ name = "pytest-randomly"
 version = "3.16.0"
 summary = ""
 dependencies = [
-    "importlib-metadata; python_full_version < \"3.10\"",
     "pytest",
 ]
 files = [
@@ -2542,15 +2247,6 @@ files = [
     {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
     {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
     {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
-    {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
-    {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
-    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
-    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
-    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
-    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
-    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
-    {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
-    {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
     {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
 ]
 
@@ -2566,19 +2262,6 @@ files = [
     {file = "redis-5.2.0.tar.gz", hash = "sha256:0b1087665a771b1ff2e003aa5bdd354f15a70c9e25d5a7dbf9c722c16528a7b0"},
 ]
 
-[[package]]
-name = "reportlab"
-version = "4.4.3"
-summary = ""
-dependencies = [
-    "charset-normalizer; python_full_version < \"3.10\"",
-    "pillow; python_full_version < \"3.10\"",
-]
-files = [
-    {file = "reportlab-4.4.3-py3-none-any.whl", hash = "sha256:df905dc5ec5ddaae91fc9cb3371af863311271d555236410954961c5ee6ee1b5"},
-    {file = "reportlab-4.4.3.tar.gz", hash = "sha256:073b0975dab69536acd3251858e6b0524ed3e087e71f1d0d1895acb50acf9c7b"},
-]
-
 [[package]]
 name = "requests"
 version = "2.32.3"
@@ -2735,14 +2418,6 @@ files = [
     {file = "SQLAlchemy-2.0.36-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a121d62ebe7d26fec9155f83f8be5189ef1405f5973ea4874a26fab9f1e262c"},
     {file = "SQLAlchemy-2.0.36-cp313-cp313-win32.whl", hash = "sha256:0572f4bd6f94752167adfd7c1bed84f4b240ee6203a95e05d1e208d488d0d436"},
     {file = "SQLAlchemy-2.0.36-cp313-cp313-win_amd64.whl", hash = "sha256:8c78ac40bde930c60e0f78b3cd184c580f89456dd87fc08f9e3ee3ce8765ce88"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dc022184d3e5cacc9579e41805a681187650e170eb2fd70e28b86192a479dcaa"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b817d41d692bf286abc181f8af476c4fbef3fd05e798777492618378448ee689"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e46a888b54be23d03a89be510f24a7652fe6ff660787b96cd0e57a4ebcb46d"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4ae3005ed83f5967f961fd091f2f8c5329161f69ce8480aa8168b2d7fe37f06"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03e08af7a5f9386a43919eda9de33ffda16b44eb11f3b313e6822243770e9763"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3dbb986bad3ed5ceaf090200eba750b5245150bd97d3e67343a3cfed06feecf7"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-win32.whl", hash = "sha256:9fe53b404f24789b5ea9003fc25b9a3988feddebd7e7b369c8fac27ad6f52f28"},
-    {file = "SQLAlchemy-2.0.36-cp39-cp39-win_amd64.whl", hash = "sha256:af148a33ff0349f53512a049c6406923e4e02bf2f26c5fb285f143faf4f0e46a"},
     {file = "SQLAlchemy-2.0.36-py3-none-any.whl", hash = "sha256:fddbe92b4760c6f5d48162aef14824add991aeda8ddadb3c31d56eb15ca69f8e"},
     {file = "sqlalchemy-2.0.36.tar.gz", hash = "sha256:7f2767680b6d2398aea7082e45a774b2b0767b5c8d8ffb9c8b683088ea9b29c5"},
 ]
@@ -2769,7 +2444,6 @@ version = "0.41.2"
 summary = ""
 dependencies = [
     "anyio",
-    "typing-extensions; python_full_version < \"3.10\"",
 ]
 files = [
     {file = "starlette-0.41.2-py3-none-any.whl", hash = "sha256:fbc189474b4731cf30fcef52f18a8d070e3f3b46c6a04c97579e85e6ffca942d"},
@@ -2918,12 +2592,6 @@ files = [
     {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816"},
     {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc"},
     {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553"},
-    {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b"},
-    {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2"},
-    {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0"},
-    {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75"},
-    {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd"},
-    {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff"},
     {file = "uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3"},
 ]
 
@@ -2985,26 +2653,10 @@ files = [
     {file = "watchfiles-0.24.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:edf71b01dec9f766fb285b73930f95f730bb0943500ba0566ae234b5c1618c18"},
     {file = "watchfiles-0.24.0-cp313-none-win32.whl", hash = "sha256:f4c96283fca3ee09fb044f02156d9570d156698bc3734252175a38f0e8975f07"},
     {file = "watchfiles-0.24.0-cp313-none-win_amd64.whl", hash = "sha256:a974231b4fdd1bb7f62064a0565a6b107d27d21d9acb50c484d2cdba515b9366"},
-    {file = "watchfiles-0.24.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b665caeeda58625c3946ad7308fbd88a086ee51ccb706307e5b1fa91556ac886"},
-    {file = "watchfiles-0.24.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c51749f3e4e269231510da426ce4a44beb98db2dce9097225c338f815b05d4f"},
-    {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82b2509f08761f29a0fdad35f7e1638b8ab1adfa2666d41b794090361fb8b855"},
-    {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a60e2bf9dc6afe7f743e7c9b149d1fdd6dbf35153c78fe3a14ae1a9aee3d98b"},
-    {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7d9b87c4c55e3ea8881dfcbf6d61ea6775fffed1fedffaa60bd047d3c08c430"},
-    {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:78470906a6be5199524641f538bd2c56bb809cd4bf29a566a75051610bc982c3"},
-    {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07cdef0c84c03375f4e24642ef8d8178e533596b229d32d2bbd69e5128ede02a"},
-    {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d337193bbf3e45171c8025e291530fb7548a93c45253897cd764a6a71c937ed9"},
-    {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ec39698c45b11d9694a1b635a70946a5bad066b593af863460a8e600f0dff1ca"},
-    {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2e28d91ef48eab0afb939fa446d8ebe77e2f7593f5f463fd2bb2b14132f95b6e"},
-    {file = "watchfiles-0.24.0-cp39-none-win32.whl", hash = "sha256:7138eff8baa883aeaa074359daabb8b6c1e73ffe69d5accdc907d62e50b1c0da"},
-    {file = "watchfiles-0.24.0-cp39-none-win_amd64.whl", hash = "sha256:b3ef2c69c655db63deb96b3c3e587084612f9b1fa983df5e0c3379d41307467f"},
     {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:632676574429bee8c26be8af52af20e0c718cc7f5f67f3fb658c71928ccd4f7f"},
     {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a2a9891723a735d3e2540651184be6fd5b96880c08ffe1a98bae5017e65b544b"},
     {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7fa2bc0efef3e209a8199fd111b8969fe9db9c711acc46636686331eda7dd4"},
     {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01550ccf1d0aed6ea375ef259706af76ad009ef5b0203a3a4cce0f6024f9b68a"},
-    {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:96619302d4374de5e2345b2b622dc481257a99431277662c30f606f3e22f42be"},
-    {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:85d5f0c7771dcc7a26c7a27145059b6bb0ce06e4e751ed76cdf123d7039b60b5"},
-    {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:951088d12d339690a92cef2ec5d3cfd957692834c72ffd570ea76a6790222777"},
-    {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49fb58bcaa343fedc6a9e91f90195b20ccb3135447dc9e4e2570c3a39565853e"},
     {file = "watchfiles-0.24.0.tar.gz", hash = "sha256:afb72325b74fa7a428c009c1b8be4b4d7c2afedafb2982827ef2156646df2fe1"},
 ]
 
@@ -3057,29 +2709,12 @@ files = [
     {file = "websockets-13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:70c5be9f416aa72aab7a2a76c90ae0a4fe2755c1816c153c1a2bcc3333ce4ce6"},
     {file = "websockets-13.1-cp313-cp313-win32.whl", hash = "sha256:624459daabeb310d3815b276c1adef475b3e6804abaf2d9d2c061c319f7f187d"},
     {file = "websockets-13.1-cp313-cp313-win_amd64.whl", hash = "sha256:c518e84bb59c2baae725accd355c8dc517b4a3ed8db88b4bc93c78dae2974bf2"},
-    {file = "websockets-13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9b37c184f8b976f0c0a231a5f3d6efe10807d41ccbe4488df8c74174805eea7d"},
-    {file = "websockets-13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:163e7277e1a0bd9fb3c8842a71661ad19c6aa7bb3d6678dc7f89b17fbcc4aeb7"},
-    {file = "websockets-13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4b889dbd1342820cc210ba44307cf75ae5f2f96226c0038094455a96e64fb07a"},
-    {file = "websockets-13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:586a356928692c1fed0eca68b4d1c2cbbd1ca2acf2ac7e7ebd3b9052582deefa"},
-    {file = "websockets-13.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7bd6abf1e070a6b72bfeb71049d6ad286852e285f146682bf30d0296f5fbadfa"},
-    {file = "websockets-13.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2aad13a200e5934f5a6767492fb07151e1de1d6079c003ab31e1823733ae79"},
-    {file = "websockets-13.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:df01aea34b6e9e33572c35cd16bae5a47785e7d5c8cb2b54b2acdb9678315a17"},
-    {file = "websockets-13.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e54affdeb21026329fb0744ad187cf812f7d3c2aa702a5edb562b325191fcab6"},
-    {file = "websockets-13.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ef8aa8bdbac47f4968a5d66462a2a0935d044bf35c0e5a8af152d58516dbeb5"},
-    {file = "websockets-13.1-cp39-cp39-win32.whl", hash = "sha256:deeb929efe52bed518f6eb2ddc00cc496366a14c726005726ad62c2dd9017a3c"},
-    {file = "websockets-13.1-cp39-cp39-win_amd64.whl", hash = "sha256:7c65ffa900e7cc958cd088b9a9157a8141c991f8c53d11087e6fb7277a03f81d"},
     {file = "websockets-13.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5dd6da9bec02735931fccec99d97c29f47cc61f644264eb995ad6c0c27667238"},
     {file = "websockets-13.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:2510c09d8e8df777177ee3d40cd35450dc169a81e747455cc4197e63f7e7bfe5"},
     {file = "websockets-13.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1c3cf67185543730888b20682fb186fc8d0fa6f07ccc3ef4390831ab4b388d9"},
     {file = "websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcc03c8b72267e97b49149e4863d57c2d77f13fae12066622dc78fe322490fe6"},
     {file = "websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004280a140f220c812e65f36944a9ca92d766b6cc4560be652a0a3883a79ed8a"},
     {file = "websockets-13.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2620453c075abeb0daa949a292e19f56de518988e079c36478bacf9546ced23"},
-    {file = "websockets-13.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:25c35bf84bf7c7369d247f0b8cfa157f989862c49104c5cf85cb5436a641d93e"},
-    {file = "websockets-13.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:83f91d8a9bb404b8c2c41a707ac7f7f75b9442a0a876df295de27251a856ad09"},
-    {file = "websockets-13.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a43cfdcddd07f4ca2b1afb459824dd3c6d53a51410636a2c7fc97b9a8cf4842"},
-    {file = "websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48a2ef1381632a2f0cb4efeff34efa97901c9fbc118e01951ad7cfc10601a9bb"},
-    {file = "websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:459bf774c754c35dbb487360b12c5727adab887f1622b8aed5755880a21c4a20"},
-    {file = "websockets-13.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:95858ca14a9f6fa8413d29e0a585b31b278388aa775b8a81fa24830123874678"},
     {file = "websockets-13.1-py3-none-any.whl", hash = "sha256:a9a396a6ad26130cdae92ae10c36af09d9bfe6cafe69670fd3b6da9b07b4044f"},
     {file = "websockets-13.1.tar.gz", hash = "sha256:a3b3366087c1bc0a2795111edcadddb8b3b59509d5db5d7ea3fdd69f954a8878"},
 ]
@@ -3119,16 +2754,6 @@ files = [
     {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"},
     {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"},
     {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"},
-    {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"},
-    {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"},
-    {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"},
-    {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"},
-    {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"},
-    {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"},
-    {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"},
-    {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"},
-    {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"},
-    {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"},
     {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"},
     {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"},
 ]
@@ -3198,31 +2823,11 @@ files = [
     {file = "xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637"},
     {file = "xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43"},
     {file = "xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b"},
-    {file = "xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301"},
-    {file = "xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab"},
-    {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f"},
-    {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd"},
-    {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc"},
-    {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754"},
-    {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6"},
-    {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898"},
-    {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833"},
-    {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6"},
-    {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af"},
-    {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606"},
-    {file = "xxhash-3.5.0-cp39-cp39-win32.whl", hash = "sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4"},
-    {file = "xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558"},
-    {file = "xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e"},
     {file = "xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c"},
     {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986"},
     {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6"},
     {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b"},
     {file = "xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da"},
-    {file = "xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9"},
-    {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1"},
-    {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f"},
-    {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0"},
-    {file = "xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240"},
     {file = "xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f"},
 ]
 
@@ -3300,31 +2905,6 @@ files = [
     {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5b937c216b6dee8b858c6afea958de03c5ff28406257d22b55c24962a2baf6fd"},
     {file = "yarl-1.17.0-cp313-cp313-win32.whl", hash = "sha256:d0131b14cb545c1a7bd98f4565a3e9bdf25a1bd65c83fc156ee5d8a8499ec4a3"},
     {file = "yarl-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:01c96efa4313c01329e88b7e9e9e1b2fc671580270ddefdd41129fa8d0db7696"},
-    {file = "yarl-1.17.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0d44f67e193f0a7acdf552ecb4d1956a3a276c68e7952471add9f93093d1c30d"},
-    {file = "yarl-1.17.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:16ea0aa5f890cdcb7ae700dffa0397ed6c280840f637cd07bffcbe4b8d68b985"},
-    {file = "yarl-1.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cf5469dc7dcfa65edf5cc3a6add9f84c5529c6b556729b098e81a09a92e60e51"},
-    {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e662bf2f6e90b73cf2095f844e2bc1fda39826472a2aa1959258c3f2a8500a2f"},
-    {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8260e88f1446904ba20b558fa8ce5d0ab9102747238e82343e46d056d7304d7e"},
-    {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dc16477a4a2c71e64c5d3d15d7ae3d3a6bb1e8b955288a9f73c60d2a391282f"},
-    {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46027e326cecd55e5950184ec9d86c803f4f6fe4ba6af9944a0e537d643cdbe0"},
-    {file = "yarl-1.17.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc95e46c92a2b6f22e70afe07e34dbc03a4acd07d820204a6938798b16f4014f"},
-    {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:16ca76c7ac9515320cd09d6cc083d8d13d1803f6ebe212b06ea2505fd66ecff8"},
-    {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:eb1a5b97388f2613f9305d78a3473cdf8d80c7034e554d8199d96dcf80c62ac4"},
-    {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:41fd5498975418cdc34944060b8fbeec0d48b2741068077222564bea68daf5a6"},
-    {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:146ca582ed04a5664ad04b0e0603934281eaab5c0115a5a46cce0b3c061a56a1"},
-    {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:6abb8c06107dbec97481b2392dafc41aac091a5d162edf6ed7d624fe7da0587a"},
-    {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4d14be4613dd4f96c25feb4bd8c0d8ce0f529ab0ae555a17df5789e69d8ec0c5"},
-    {file = "yarl-1.17.0-cp39-cp39-win32.whl", hash = "sha256:174d6a6cad1068f7850702aad0c7b1bca03bcac199ca6026f84531335dfc2646"},
-    {file = "yarl-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:6af417ca2c7349b101d3fd557ad96b4cd439fdb6ab0d288e3f64a068eea394d0"},
     {file = "yarl-1.17.0-py3-none-any.whl", hash = "sha256:62dd42bb0e49423f4dd58836a04fcf09c80237836796025211bbe913f1524993"},
     {file = "yarl-1.17.0.tar.gz", hash = "sha256:d3f13583f378930377e02002b4085a3d025b00402d5a80911726d43a67911cd9"},
 ]
-
-[[package]]
-name = "zipp"
-version = "3.20.2"
-summary = ""
-files = [
-    {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"},
-    {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"},
-]
diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml
index 098bfdd57..d9eae2ddc 100644
--- a/argilla-server/pyproject.toml
+++ b/argilla-server/pyproject.toml
@@ -3,7 +3,7 @@ name = "extralit-server"
 dynamic = ["version"]
 description = "Open-source tool for accurate & fast scientific literature data extraction with LLM and human-in-the-loop."
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = { text = "Apache-2.0" }
 keywords = [
     "literature-review",
@@ -67,7 +67,8 @@ dependencies = [
     # For file storage
     "minio>=7.2.7",
     # For document processing
-    "ocrmypdf>=16.10.4"
+    "ocrmypdf>=16.10.4",
+    "pdf2image>=1.17.0"
 ]
 
 [project.optional-dependencies]
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
index 6b28a8ebb..c3047be97 100644
--- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -19,8 +19,12 @@
 import tempfile
 import time
 from io import BytesIO
+from typing import List, Optional
 from uuid import uuid4
 
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
 try:
     import ocrmypdf
 
@@ -31,128 +35,202 @@
 logger = logging.getLogger(__name__)
 
 
-def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes:
+class PDFPreprocessingSettings(BaseSettings):
     """
-    Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation.
-    Works with bytes data and returns processed bytes, minimizing disk I/O.
+    PDF preprocessing settings that can be configured via environment variables.
 
-    Args:
-        file_data: PDF file data as bytes
-        filename: Original filename for logging purposes
+    All settings have the PREPROCESSING_ prefix.
+    """
 
-    Returns:
-        Processed PDF data as bytes (or original bytes if processing fails)
+    enabled: bool = Field(
+        default=True, description="Enable PDF preprocessing with OCRmyPDF. Set to False to disable all processing."
+    )
+
+    language: List[str] = Field(
+        default=["eng"], description="List of languages for OCR processing (e.g., ['eng', 'spa', 'fra'])"
+    )
+
+    rotate_pages: bool = Field(default=True, description="Auto-rotate pages with horizontal text")
+
+    deskew: bool = Field(default=True, description="Fix skewed text")
+
+    clean: bool = Field(default=True, description="Clean up artifacts")
+
+    optimize: int = Field(
+        default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)"
+    )
+
+    pdf_renderer: str = Field(default="hocr", description="PDF renderer: 'auto', 'hocr', 'sandwich'")
+
+    force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text")
+
+    skip_text: bool = Field(default=False, description="Skip text-based operations (OCR only for images)")
+
+    redo_ocr: bool = Field(default=False, description="Redo OCR on pages that already have OCR")
+
+    progress_bar: bool = Field(default=False, description="Show progress bar during processing")
+
+    quiet: bool = Field(default=True, description="Suppress OCRmyPDF output messages")
+
+    class Config:
+        env_prefix = "PREPROCESSING_"
+
+
+class PDFPreprocessor:
     """
-    if not OCRMYPDF_AVAILABLE:
-        logger.warning("OCRmyPDF not available, skipping preprocessing")
-        return file_data
+    PDF preprocessor that uses OCRmyPDF for rotation, OCR, and optimization.
 
-    # Only process PDF files
-    if not filename.lower().endswith(".pdf"):
-        logger.debug(f"Skipping OCRmyPDF for non-PDF file: {filename}")
-        return file_data
+    Can be configured with environment variables using the PDFPreprocessingSettings.
+    """
 
-    try:
-        logger.info(f"Starting OCRmyPDF preprocessing for: {filename}")
-        start_time = time.time()
+    def __init__(self, settings: Optional[PDFPreprocessingSettings] = None):
+        """
+        Initialize the PDF preprocessor.
+
+        Args:
+            settings: Optional PDFPreprocessingSettings instance. If None, loads from environment.
+        """
+        self.settings = settings or PDFPreprocessingSettings()
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+        if not self.settings.enabled:
+            self.logger.info("PDF preprocessing is disabled via configuration")
+        elif not OCRMYPDF_AVAILABLE:
+            self.logger.warning("OCRmyPDF not available, PDF preprocessing will be skipped")
+
+    def preprocess(self, file_data: bytes, filename: str) -> bytes:
+        """
+        Preprocess PDF with OCRmyPDF using configured settings.
+
+        Args:
+            file_data: PDF file data as bytes
+            filename: Original filename for logging purposes
+
+        Returns:
+            Processed PDF data as bytes (or original bytes if processing fails/disabled)
+        """
+        if not self.settings.enabled:
+            self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}")
+            return file_data
+
+        if not OCRMYPDF_AVAILABLE:
+            self.logger.warning("OCRmyPDF not available, skipping preprocessing")
+            return file_data
+
+        if not filename.lower().endswith(".pdf"):
+            return file_data
+
+        try:
+            start_time = time.time()
+            self.logger.info(f"Starting OCRmyPDF preprocessing for: {filename}")
+
+            try:
+                input_buffer = BytesIO(file_data)
+                output_buffer = BytesIO()
+
+                ocrmypdf.ocr(
+                    input_buffer,
+                    output_buffer,
+                    language=self.settings.language,
+                    rotate_pages=self.settings.rotate_pages,
+                    deskew=self.settings.deskew,
+                    clean=self.settings.clean,
+                    optimize=self.settings.optimize,
+                    pdf_renderer=self.settings.pdf_renderer,
+                    force_ocr=self.settings.force_ocr,
+                    skip_text=self.settings.skip_text,
+                    redo_ocr=self.settings.redo_ocr,
+                    progress_bar=self.settings.progress_bar,
+                    quiet=self.settings.quiet,
+                )
+
+                processed_data = output_buffer.getvalue()
+                output_buffer.close()
+                input_buffer.close()
+
+            except Exception as buffer_error:
+                self.logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
+                processed_data = self._preprocess_with_temp_files(file_data, filename)
+
+            processing_time = time.time() - start_time
+            self.logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds")
+
+            return processed_data
+
+        except Exception as e:
+            self.logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}")
+            return file_data
+
+    def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
+        """
+        Fallback implementation using unique temporary files to avoid concurrency issues.
+        """
+        input_temp_file = None
+        output_temp_file = None
 
-        # Try using BytesIO objects first to minimize disk I/O
         try:
-            input_buffer = BytesIO(file_data)
-            output_buffer = BytesIO()
+            unique_id = str(uuid4())
+            temp_dir = tempfile.gettempdir()
+
+            input_temp_file = tempfile.NamedTemporaryFile(
+                suffix=".pdf", prefix=f"ocr_input_{unique_id}_", dir=temp_dir, delete=False
+            )
+            input_temp_file.write(file_data)
+            input_temp_file.flush()
+            input_temp_file.close()
+
+            output_temp_file = tempfile.NamedTemporaryFile(
+                suffix=".pdf", prefix=f"ocr_output_{unique_id}_", dir=temp_dir, delete=False
+            )
+            output_temp_file.close()
 
-            # OCRmyPDF configuration for optimal processing
             ocrmypdf.ocr(
-                input_buffer,
-                output_buffer,
-                language=["eng"],  # Can be configured for other languages
-                rotate_pages=True,  # Auto-rotate pages with horizontal text
-                deskew=True,  # Fix skewed text
-                clean=True,  # Clean up artifacts
-                optimize=1,  # Optimize output file size
-                pdf_renderer="hocr",  # Use hOCR for better text positioning
-                force_ocr=False,  # Only OCR pages that need it
-                skip_text=False,  # Don't skip existing text
-                redo_ocr=False,  # Don't redo existing OCR
-                progress_bar=False,
-                quiet=True,
+                input_temp_file.name,
+                output_temp_file.name,
+                language=self.settings.language,
+                rotate_pages=self.settings.rotate_pages,
+                deskew=self.settings.deskew,
+                clean=self.settings.clean,
+                optimize=self.settings.optimize,
+                pdf_renderer=self.settings.pdf_renderer,
+                force_ocr=self.settings.force_ocr,
+                skip_text=self.settings.skip_text,
+                redo_ocr=self.settings.redo_ocr,
+                progress_bar=self.settings.progress_bar,
+                quiet=self.settings.quiet,
             )
 
-            # Get processed PDF data
-            processed_data = output_buffer.getvalue()
-            output_buffer.close()
-            input_buffer.close()
+            with open(output_temp_file.name, "rb") as f:
+                processed_data = f.read()
 
-        except Exception as buffer_error:
-            # Fallback to temporary files if BytesIO approach fails
-            logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
-            processed_data = _preprocess_pdf_with_temp_files(file_data, filename)
+            return processed_data
 
-        processing_time = time.time() - start_time
-        logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds")
+        finally:
+            for temp_file in [input_temp_file, output_temp_file]:
+                if temp_file is not None:
+                    try:
+                        if hasattr(temp_file, "name"):
+                            os.unlink(temp_file.name)
+                    except OSError as e:
+                        self.logger.warning(f"Failed to clean up temp file: {e}")
 
-        return processed_data
 
-    except Exception as e:
-        logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}")
-        return file_data
+# Global preprocessor instance (can be configured via environment variables)
+pdf_preprocessor = PDFPreprocessor()
 
 
-def _preprocess_pdf_with_temp_files(file_data: bytes, filename: str) -> bytes:
+def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes:
     """
-    Fallback implementation using unique temporary files to avoid concurrency issues.
+    Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation.
+
+    This function provides backward compatibility by using the global pdf_preprocessor instance.
+    For new code, consider using PDFPreprocessor directly for better configuration control.
+
+    Args:
+        file_data: PDF file data as bytes
+        filename: Original filename for logging purposes
+
+    Returns:
+        Processed PDF data as bytes (or original bytes if processing fails)
     """
-    input_temp_file = None
-    output_temp_file = None
-
-    try:
-        # Generate unique identifiers to avoid filename collisions in concurrent jobs
-        unique_id = str(uuid4())
-        temp_dir = tempfile.gettempdir()
-
-        # Create input temp file with unique identifier
-        input_temp_file = tempfile.NamedTemporaryFile(
-            suffix=".pdf", prefix=f"ocr_input_{unique_id}_", dir=temp_dir, delete=False
-        )
-        input_temp_file.write(file_data)
-        input_temp_file.flush()
-        input_temp_file.close()
-
-        # Create output temp file with unique identifier
-        output_temp_file = tempfile.NamedTemporaryFile(
-            suffix=".pdf", prefix=f"ocr_output_{unique_id}_", dir=temp_dir, delete=False
-        )
-        output_temp_file.close()
-
-        # OCRmyPDF configuration for optimal processing
-        ocrmypdf.ocr(
-            input_temp_file.name,
-            output_temp_file.name,
-            language=["eng"],  # Can be configured for other languages
-            rotate_pages=True,  # Auto-rotate pages with horizontal text
-            deskew=True,  # Fix skewed text
-            clean=True,  # Clean up artifacts
-            optimize=1,  # Optimize output file size
-            pdf_renderer="hocr",  # Use hOCR for better text positioning
-            force_ocr=False,  # Only OCR pages that need it
-            skip_text=False,  # Don't skip existing text
-            redo_ocr=False,  # Don't redo existing OCR
-            progress_bar=False,
-            quiet=True,
-        )
-
-        # Read processed PDF data
-        with open(output_temp_file.name, "rb") as f:
-            processed_data = f.read()
-
-        return processed_data
-
-    finally:
-        # Clean up temporary files
-        for temp_file in [input_temp_file, output_temp_file]:
-            if temp_file is not None:
-                try:
-                    if hasattr(temp_file, "name"):
-                        os.unlink(temp_file.name)
-                except OSError as e:
-                    logger.warning(f"Failed to clean up temp file: {e}")
+    return pdf_preprocessor.preprocess(file_data, filename)
diff --git a/argilla-server/src/argilla_server/jobs/document_jobs.py b/argilla-server/src/argilla_server/jobs/document_jobs.py
index 8f3f99e90..42f7004db 100644
--- a/argilla-server/src/argilla_server/jobs/document_jobs.py
+++ b/argilla-server/src/argilla_server/jobs/document_jobs.py
@@ -131,7 +131,7 @@ async def upload_reference_documents_job(
 
                     try:
                         # Preprocess PDF files with OCRmyPDF for rotation and OCR
-                        processed_file_data = preprocessing.preprocess_pdf_with_ocrmypdf(
+                        processed_file_data = preprocessing.pdf_preprocessor.preprocess(
                             file_data=file_data, filename=filename
                         )
 
@@ -141,9 +141,9 @@ async def upload_reference_documents_job(
                             document_id=file_document_create.id,  # type: ignore
                             file_data=processed_file_data,
                             filename=filename,
-                            # metadata=file_document_create.model_dump(
-                            #     include={"file_name": True, "pmid": True, "doi": True}
-                            # ),
+                            metadata=file_document_create.model_dump(
+                                include={"file_name": True, "pmid": True, "doi": True}
+                            ),
                         )
 
                         if file_url:

From b7b8a1b72929a308834fe44c04621c0dde10babf Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Mon, 4 Aug 2025 23:53:52 -0700
Subject: [PATCH 03/22] feat: add margin analysis to PDF preprocessing with
 opencv-python

---
 argilla-server/pdm.lock                       |  19 +-
 argilla-server/pyproject.toml                 |   3 +-
 .../contexts/document/preprocessing.py        | 351 +++++++++++++++++-
 .../src/argilla_server/jobs/document_jobs.py  |  13 +-
 4 files changed, 364 insertions(+), 22 deletions(-)

diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock
index 8403ffc7e..731284e16 100644
--- a/argilla-server/pdm.lock
+++ b/argilla-server/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "postgresql", "test"]
 strategy = []
 lock_version = "4.5.0"
-content_hash = "sha256:f20406357dc5b02a37c1da8689053074229d1204cb9bcb4fe8848b6d2835b1b4"
+content_hash = "sha256:037bf9850aef2d48dd2d032bdac1c64e906123f5aab6cea46dff7d66d2035d37"
 
 [[metadata.targets]]
 requires_python = ">=3.10"
@@ -1481,6 +1481,23 @@ files = [
     {file = "ocrmypdf-16.10.4.tar.gz", hash = "sha256:de749ef5f554b63d57e68d032e7cba5500cbd5030835bf24f658f7b7a04f3dc1"},
 ]
 
+[[package]]
+name = "opencv-python"
+version = "4.11.0.86"
+summary = ""
+dependencies = [
+    "numpy",
+]
+files = [
+    {file = "opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4"},
+    {file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a"},
+    {file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66"},
+    {file = "opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202"},
+    {file = "opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d"},
+    {file = "opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b"},
+    {file = "opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec"},
+]
+
 [[package]]
 name = "opensearch-py"
 version = "2.0.1"
diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml
index d9eae2ddc..abd8a1969 100644
--- a/argilla-server/pyproject.toml
+++ b/argilla-server/pyproject.toml
@@ -68,7 +68,8 @@ dependencies = [
     "minio>=7.2.7",
     # For document processing
     "ocrmypdf>=16.10.4",
-    "pdf2image>=1.17.0"
+    "pdf2image>=1.17.0",
+    "opencv-python>=4.11.0.86"
 ]
 
 [project.optional-dependencies]
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
index c3047be97..a0ff0fd19 100644
--- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -18,13 +18,31 @@
 import os
 import tempfile
 import time
+from dataclasses import dataclass
 from io import BytesIO
-from typing import List, Optional
+from typing import Dict, List, Optional, Tuple
 from uuid import uuid4
 
+import numpy as np
 from pydantic import Field
 from pydantic_settings import BaseSettings
 
+try:
+    pass
+
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+
+try:
+    from pdf2image import convert_from_bytes
+    from PIL import ImageChops
+    from PIL.Image import Image as PILImage
+
+    PDF2IMAGE_AVAILABLE = True
+except ImportError:
+    PDF2IMAGE_AVAILABLE = False
+
 try:
     import ocrmypdf
 
@@ -35,6 +53,272 @@
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class PDFProcessingResult:
+    """
+    Result of PDF preprocessing containing both processed data and analysis metadata.
+    """
+
+    processed_data: bytes
+    metadata: Dict
+
+
+class PDFAnalyzer:
+    """
+    Analyzes PDF layout structure to detect margins, headers, footers, and other regions.
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+    def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
+        """
+        Analyze PDF layout to extract margin and region information.
+
+        Args:
+            pdf_data: PDF file data as bytes
+            filename: Filename for logging
+
+        Returns:
+            Dictionary containing layout analysis metadata
+        """
+        if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE):
+            self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis")
+            return {"analysis_available": False, "error": "Missing dependencies"}
+
+        try:
+            # Convert PDF to images
+            images = convert_from_bytes(pdf_data, dpi=150)  # Lower DPI for analysis
+            if not images:
+                return {"analysis_available": False, "error": "No pages found"}
+
+            self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages")
+
+            # Analyze layout
+            layout_data = self._analyze_page_layout(images)
+
+            return {
+                "analysis_available": True,
+                "total_pages": len(images),
+                "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {},
+                **layout_data,
+            }
+
+        except Exception as e:
+            self.logger.error(f"PDF layout analysis failed for {filename}: {e}")
+            return {"analysis_available": False, "error": str(e)}
+
+    def _analyze_page_layout(self, images: List[PILImage]) -> Dict:
+        """
+        Analyze page layout by comparing pages to find common regions.
+        """
+        if len(images) < 2:
+            return self._analyze_single_page(images[0]) if images else {}
+
+        # Use first page as reference, compare with others
+        reference_img = images[0].convert("RGB")
+        margin_data = []
+
+        for i in range(1, min(len(images), 5)):  # Analyze up to 5 pages for efficiency
+            compare_img = images[i].convert("RGB")
+            page_margins = self._compare_pages_for_margins(reference_img, compare_img)
+            if page_margins:
+                margin_data.append(page_margins)
+
+        # Aggregate margin data
+        if margin_data:
+            return self._aggregate_margin_data(margin_data, reference_img.size)
+        else:
+            return self._analyze_single_page(reference_img)
+
+    def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]:
+        """
+        Compare two pages to identify common regions (headers, footers, margins).
+        """
+        try:
+            # Ensure same size
+            if reference.size != compare.size:
+                compare = compare.resize(reference.size)
+
+            # Compute difference and create sameness mask
+            diff = ImageChops.difference(reference, compare)
+            sameness_mask = ImageChops.invert(diff.convert("L"))
+
+            # Find horizontal bands (potential headers/footers)
+            horizontal_bands = self._find_horizontal_bands(sameness_mask)
+
+            # Classify regions
+            regions = self._classify_regions(horizontal_bands, reference.size)
+
+            return regions
+
+        except Exception as e:
+            self.logger.debug(f"Page comparison failed: {e}")
+            return None
+
+    def _find_horizontal_bands(
+        self, mask: PILImage, min_height: int = 15, min_ratio: float = 0.95
+    ) -> List[Tuple[int, int]]:
+        """
+        Find horizontal bands of similar content across pages.
+        """
+        mask_np = np.array(mask.convert("L"))
+        h, w = mask_np.shape
+
+        # Calculate row-wise similarity
+        row_sums = np.sum(mask_np == 255, axis=1) / w
+        same_rows = row_sums >= min_ratio
+
+        # Find contiguous bands
+        bands = []
+        start = None
+
+        for i, is_same in enumerate(same_rows):
+            if is_same and start is None:
+                start = i
+            elif not is_same and start is not None:
+                if i - start >= min_height:
+                    bands.append((start, i))
+                start = None
+
+        # Handle band that extends to end
+        if start is not None and h - start >= min_height:
+            bands.append((start, h))
+
+        return bands
+
+    def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict:
+        """
+        Classify horizontal bands into headers, footers, and margins.
+        """
+        width, height = page_size
+        regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}}
+
+        for start_y, end_y in bands:
+            band_center = (start_y + end_y) / 2
+            band_height = end_y - start_y
+
+            # Classify based on position
+            if band_center < height * 0.25:  # Top 25%
+                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+            elif band_center > height * 0.75:  # Bottom 25%
+                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+
+        # Estimate margins based on bands
+        regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size)
+
+        return regions
+
+    def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict:
+        """
+        Estimate page margins based on detected bands.
+        """
+        width, height = page_size
+        margins = {
+            "top": 0,
+            "bottom": 0,
+            "left": 50,  # Default estimates
+            "right": 50,
+        }
+
+        # Calculate top margin from header bands
+        if regions["header_bands"]:
+            max_header_end = max(band["end_y"] for band in regions["header_bands"])
+            margins["top"] = max_header_end
+
+        # Calculate bottom margin from footer bands
+        if regions["footer_bands"]:
+            min_footer_start = min(band["start_y"] for band in regions["footer_bands"])
+            margins["bottom"] = height - min_footer_start
+
+        # Convert to relative percentages for consistency
+        return {
+            "top_px": margins["top"],
+            "bottom_px": margins["bottom"],
+            "left_px": margins["left"],
+            "right_px": margins["right"],
+            "top_percent": (margins["top"] / height) * 100,
+            "bottom_percent": (margins["bottom"] / height) * 100,
+            "left_percent": (margins["left"] / width) * 100,
+            "right_percent": (margins["right"] / width) * 100,
+        }
+
+    def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict:
+        """
+        Aggregate margin data from multiple page comparisons.
+        """
+        # Average the margin estimates
+        all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")]
+
+        if not all_margins:
+            return self._analyze_single_page_size(page_size)
+
+        # Calculate average margins
+        avg_margins = {}
+        for key in [
+            "top_px",
+            "bottom_px",
+            "left_px",
+            "right_px",
+            "top_percent",
+            "bottom_percent",
+            "left_percent",
+            "right_percent",
+        ]:
+            values = [m.get(key, 0) for m in all_margins if key in m]
+            avg_margins[key] = sum(values) / len(values) if values else 0
+
+        # Collect all bands
+        all_header_bands = []
+        all_footer_bands = []
+
+        for data in margin_data:
+            all_header_bands.extend(data.get("header_bands", []))
+            all_footer_bands.extend(data.get("footer_bands", []))
+
+        return {
+            "layout_analysis": {
+                "header_bands": all_header_bands,
+                "footer_bands": all_footer_bands,
+                "estimated_margins": avg_margins,
+                "analysis_method": "multi_page_comparison",
+            }
+        }
+
+    def _analyze_single_page(self, image: PILImage) -> Dict:
+        """
+        Analyze a single page when comparison isn't possible.
+        """
+        return self._analyze_single_page_size(image.size)
+
+    def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict:
+        """
+        Provide default margin estimates for single page analysis.
+        """
+        width, height = page_size
+
+        # Use common academic paper margins as defaults
+        default_margins = {
+            "top_px": int(height * 0.1),  # 10% top margin
+            "bottom_px": int(height * 0.1),  # 10% bottom margin
+            "left_px": int(width * 0.1),  # 10% left margin
+            "right_px": int(width * 0.1),  # 10% right margin
+            "top_percent": 10.0,
+            "bottom_percent": 10.0,
+            "left_percent": 10.0,
+            "right_percent": 10.0,
+        }
+
+        return {
+            "layout_analysis": {
+                "header_bands": [],
+                "footer_bands": [],
+                "estimated_margins": default_margins,
+                "analysis_method": "default_estimates",
+            }
+        }
+
+
 class PDFPreprocessingSettings(BaseSettings):
     """
     PDF preprocessing settings that can be configured via environment variables.
@@ -79,6 +363,7 @@ class Config:
 class PDFPreprocessor:
     """
     PDF preprocessor that uses OCRmyPDF for rotation, OCR, and optimization.
+    Also performs layout analysis to extract margin and structure information.
 
     Can be configured with environment variables using the PDFPreprocessingSettings.
     """
@@ -92,38 +377,64 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None):
         """
         self.settings = settings or PDFPreprocessingSettings()
         self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+        self.analyzer = PDFAnalyzer()
 
         if not self.settings.enabled:
             self.logger.info("PDF preprocessing is disabled via configuration")
         elif not OCRMYPDF_AVAILABLE:
             self.logger.warning("OCRmyPDF not available, PDF preprocessing will be skipped")
 
-    def preprocess(self, file_data: bytes, filename: str) -> bytes:
+    def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
         """
-        Preprocess PDF with OCRmyPDF using configured settings.
+        Preprocess PDF with OCRmyPDF and analyze layout structure.
 
         Args:
             file_data: PDF file data as bytes
             filename: Original filename for logging purposes
 
         Returns:
-            Processed PDF data as bytes (or original bytes if processing fails/disabled)
+            PDFProcessingResult containing processed data and layout analysis metadata
         """
+        # Initialize metadata
+        metadata = {
+            "preprocessing_enabled": self.settings.enabled,
+            "ocrmypdf_available": OCRMYPDF_AVAILABLE,
+            "original_filename": filename,
+            "processing_timestamp": time.time(),
+        }
+
+        # Handle non-PDF files or disabled preprocessing
+        if not filename.lower().endswith(".pdf"):
+            metadata["skipped_reason"] = "not_pdf"
+            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
+
         if not self.settings.enabled:
             self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}")
-            return file_data
+            metadata["skipped_reason"] = "preprocessing_disabled"
+            # Still run analysis on original data
+            layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
+            metadata.update(layout_analysis)
+            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
         if not OCRMYPDF_AVAILABLE:
             self.logger.warning("OCRmyPDF not available, skipping preprocessing")
-            return file_data
-
-        if not filename.lower().endswith(".pdf"):
-            return file_data
+            metadata["skipped_reason"] = "ocrmypdf_unavailable"
+            # Still run analysis on original data
+            layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
+            metadata.update(layout_analysis)
+            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
         try:
             start_time = time.time()
-            self.logger.info(f"Starting OCRmyPDF preprocessing for: {filename}")
+            self.logger.info(f"Starting PDF preprocessing and analysis for: {filename}")
 
+            # Step 1: Analyze original PDF layout
+            self.logger.debug("Analyzing PDF layout structure...")
+            layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
+            metadata.update(layout_analysis)
+
+            # Step 2: OCR preprocessing
+            self.logger.debug("Starting OCRmyPDF processing...")
             try:
                 input_buffer = BytesIO(file_data)
                 output_buffer = BytesIO()
@@ -148,18 +459,27 @@ def preprocess(self, file_data: bytes, filename: str) -> bytes:
                 output_buffer.close()
                 input_buffer.close()
 
+                metadata["ocr_method"] = "bytesio"
+
             except Exception as buffer_error:
                 self.logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
                 processed_data = self._preprocess_with_temp_files(file_data, filename)
+                metadata["ocr_method"] = "temp_files"
+                metadata["ocr_fallback_reason"] = str(buffer_error)
 
             processing_time = time.time() - start_time
-            self.logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds")
+            metadata["processing_time_seconds"] = processing_time
+            metadata["processing_successful"] = True
 
-            return processed_data
+            self.logger.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds")
+
+            return PDFProcessingResult(processed_data=processed_data, metadata=metadata)
 
         except Exception as e:
-            self.logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}")
-            return file_data
+            self.logger.error(f"PDF preprocessing failed for {filename}: {e}")
+            metadata["processing_successful"] = False
+            metadata["processing_error"] = str(e)
+            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
     def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
         """
@@ -233,4 +553,5 @@ def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes:
     Returns:
         Processed PDF data as bytes (or original bytes if processing fails)
     """
-    return pdf_preprocessor.preprocess(file_data, filename)
+    result = pdf_preprocessor.preprocess(file_data, filename)
+    return result.processed_data
diff --git a/argilla-server/src/argilla_server/jobs/document_jobs.py b/argilla-server/src/argilla_server/jobs/document_jobs.py
index 42f7004db..2c4a61cf1 100644
--- a/argilla-server/src/argilla_server/jobs/document_jobs.py
+++ b/argilla-server/src/argilla_server/jobs/document_jobs.py
@@ -130,10 +130,14 @@ async def upload_reference_documents_job(
                         continue
 
                     try:
-                        # Preprocess PDF files with OCRmyPDF for rotation and OCR
-                        processed_file_data = preprocessing.pdf_preprocessor.preprocess(
+                        # Preprocess PDF files with OCRmyPDF for rotation and OCR, plus layout analysis
+                        preprocessing_result = preprocessing.pdf_preprocessor.preprocess(
                             file_data=file_data, filename=filename
                         )
+                        processed_file_data = preprocessing_result.processed_data
+
+                        # Store preprocessing metadata in file metadata
+                        file_metadata.update({"preprocessing": preprocessing_result.metadata})
 
                         file_url = files.put_document_file(
                             client=client,
@@ -141,9 +145,7 @@ async def upload_reference_documents_job(
                             document_id=file_document_create.id,  # type: ignore
                             file_data=processed_file_data,
                             filename=filename,
-                            metadata=file_document_create.model_dump(
-                                include={"file_name": True, "pmid": True, "doi": True}
-                            ),
+                            metadata=file_metadata,
                         )
 
                         if file_url:
@@ -158,6 +160,7 @@ async def upload_reference_documents_job(
 
                     # Create document in database
                     try:
+                        file_document_create.metadata = file_metadata
                         document = await imports.create_document(db, file_document_create)
                         _LOGGER.info(f"Document created successfully for file {filename} with ID {document.id}")
                         file_result.update({"success": True, "document_id": str(document.id), "status": "created"})

From 25a90981d8e9b61f0c2239306775b9b7fcd0a0ea Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Tue, 5 Aug 2025 00:05:12 -0700
Subject: [PATCH 04/22] feat: enable PDF preprocessing analysis with new
 configuration options

---
 argilla-server/.env.dev                       |   8 +
 .../contexts/document/analysis.py             | 495 ++++++++++++++++++
 .../contexts/document/preprocessing.py        | 328 ++----------
 3 files changed, 535 insertions(+), 296 deletions(-)
 create mode 100644 argilla-server/src/argilla_server/contexts/document/analysis.py

diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev
index a1399f4f2..354fff256 100644
--- a/argilla-server/.env.dev
+++ b/argilla-server/.env.dev
@@ -20,3 +20,11 @@ ARGILLA_ELASTICSEARCH=http://localhost:9200
 
 # Redis configuration
 ARGILLA_REDIS_URL=redis://localhost:6379/0
+
+# PDF Preprocessing
+PREPROCESSING_ENABLED=true
+PREPROCESSING_ENABLE_ANALYSIS=true
+PREPROCESSING_LANGUAGE='["eng"]'
+PREPROCESSING_ROTATE_PAGES=true
+PREPROCESSING_OPTIMIZE=1
+PREPROCESSING_QUIET=true
\ No newline at end of file
diff --git a/argilla-server/src/argilla_server/contexts/document/analysis.py b/argilla-server/src/argilla_server/contexts/document/analysis.py
new file mode 100644
index 000000000..1f097cfe6
--- /dev/null
+++ b/argilla-server/src/argilla_server/contexts/document/analysis.py
@@ -0,0 +1,495 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+
+try:
+    import cv2
+
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+
+try:
+    from pdf2image import convert_from_bytes
+    from PIL import ImageChops, ImageDraw
+    from PIL.Image import Image as PILImage
+
+    PDF2IMAGE_AVAILABLE = True
+except ImportError:
+    PDF2IMAGE_AVAILABLE = False
+
+try:
+    import ocrmypdf  # noqa: F401 - imported only to probe whether OCRmyPDF is installed
+
+    OCRMYPDF_AVAILABLE = True
+except ImportError:
+    OCRMYPDF_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PDFProcessingResult:
+    """
+    Result of PDF preprocessing containing both processed data and analysis metadata.
+    """
+
+    processed_data: bytes
+    metadata: Dict
+
+
+def pil_to_cv(image: PILImage) -> np.ndarray:
+    """Convert PIL Image to OpenCV format."""
+    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+
+def classify_and_draw_layout_regions(
+    reference: PILImage, mask: PILImage, min_area: int = 5000, label: bool = True
+) -> Tuple[PILImage, List[Dict]]:
+    """
+    Classify and optionally draw layout regions using contour detection.
+
+    Returns:
+        Tuple of (annotated image, list of detected regions)
+    """
+    if not CV2_AVAILABLE:
+        return reference, []
+
+    mask_np = np.array(mask.convert("L"))
+    h, w = mask_np.shape
+
+    # Clean up the mask using morphological operations
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
+    cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel)
+
+    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    img = reference.copy() if label else reference
+    regions = []
+
+    if label:
+        draw = ImageDraw.Draw(img)
+
+    for cnt in contours:
+        x, y, rw, rh = cv2.boundingRect(cnt)
+        area = rw * rh
+
+        if area < min_area:
+            continue
+
+        cx, cy = x + rw // 2, y + rh // 2
+
+        # Classify region based on position
+        if cy < h * 0.25:
+            region = "header"
+        elif cy > h * 0.75:
+            region = "footer"
+        elif cx < w * 0.15:
+            region = "left_margin"
+        elif cx > w * 0.85:
+            region = "right_margin"
+        else:
+            region = "body"
+
+        region_data = {
+            "type": region,
+            "x": x,
+            "y": y,
+            "width": rw,
+            "height": rh,
+            "area": area,
+            "center_x": cx,
+            "center_y": cy,
+        }
+        regions.append(region_data)
+
+        if label:
+            draw.rectangle([x, y, x + rw, y + rh], outline="green", width=2)
+            draw.text((x, y - 10), region, fill="green")
+
+    return img, regions
+
+
+def find_horizontal_bands(mask: PILImage, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
+    """Find horizontal bands of similar content across pages."""
+    mask_np = np.array(mask.convert("L"))
+    h, w = mask_np.shape
+
+    row_sums = np.sum(mask_np == 255, axis=1) / w  # white = same
+    same_rows = row_sums >= min_ratio
+
+    bands = []
+    start = None
+    for i, val in enumerate(same_rows):
+        if val and start is None:
+            start = i
+        elif not val and start is not None:
+            if i - start >= min_height:
+                bands.append((start, i))
+            start = None
+    if start is not None and h - start >= min_height:
+        bands.append((start, h))
+
+    return bands
+
+
+class PDFAnalyzer:
+    """
+    Analyzes PDF layout structure to detect margins, headers, footers, and other regions.
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+    def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
+        """
+        Analyze PDF layout to extract margin and region information.
+
+        Args:
+            pdf_data: PDF file data as bytes
+            filename: Filename, used only in log messages
+
+        Returns:
+            Dict with "analysis_available"; plus "error" on failure, or page count, size and layout keys on success
+        """
+        if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE):
+            self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis")
+            return {"analysis_available": False, "error": "Missing dependencies"}
+
+        try:
+            # Render the PDF to PIL images at a reduced DPI (analysis does not need print resolution)
+            images = convert_from_bytes(pdf_data, dpi=150)
+            if not images:
+                return {"analysis_available": False, "error": "No pages found"}
+
+            self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages")
+
+            # Analyze layout
+            layout_data = self._analyze_page_layout(images)
+
+            return {
+                "analysis_available": True,
+                "total_pages": len(images),
+                "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]},
+                **layout_data,
+            }
+
+        except Exception as e:
+            self.logger.error(f"PDF layout analysis failed for {filename}: {e}")
+            return {"analysis_available": False, "error": str(e)}
+
+    def _analyze_page_layout(self, images: List[PILImage]) -> Dict:
+        """
+        Analyze page layout by comparing pages to find common regions.
+        """
+        if len(images) < 2:
+            return self._analyze_single_page(images[0]) if images else {}
+
+        # Use first page as reference, compare with others
+        reference_img = images[0].convert("RGB")
+        margin_data = []
+
+        for i in range(1, min(len(images), 5)):  # Analyze up to 5 pages for efficiency
+            compare_img = images[i].convert("RGB")
+            page_margins = self._compare_pages_for_margins(reference_img, compare_img)
+            if page_margins:
+                margin_data.append(page_margins)
+
+        # Aggregate margin data
+        if margin_data:
+            return self._aggregate_margin_data(margin_data, reference_img.size)
+        else:
+            return self._analyze_single_page(reference_img)
+
+    def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]:
+        """
+        Diff two rendered pages and classify the regions they share; returns None if any step raises.
+        """
+        try:
+            # Ensure same size
+            if reference.size != compare.size:
+                self.logger.debug("Resizing page to match reference size")
+                compare = compare.resize(reference.size)
+
+            # Step 1: Compute difference and invert so white = same
+            diff = ImageChops.difference(reference, compare)
+            sameness_mask = ImageChops.invert(diff.convert("L"))
+
+            # Step 2: Binarize the mask (values above `threshold` become 255, the rest 0).
+            # point()/convert() return new images, so re-bind or the threshold is discarded.
+            threshold = 30
+            lut = [255 if i > threshold else 0 for i in range(256)]
+            sameness_mask = sameness_mask.point(lut).convert("1")
+
+            # Step 3: Find horizontal bands (potential headers/footers)
+            horizontal_bands = find_horizontal_bands(sameness_mask)
+
+            # Step 4: Contour-based region classification (label=False: no drawing, image unused)
+            _annotated, detected_regions = classify_and_draw_layout_regions(
+                reference, sameness_mask, min_area=5000, label=False
+            )
+
+            # Step 5: Classify and aggregate results
+            regions = self._classify_regions_advanced(horizontal_bands, detected_regions, reference.size)
+
+            return regions
+
+        except Exception as e:
+            self.logger.debug(f"Page comparison failed: {e}")
+            return None
+
+    def _classify_regions_advanced(
+        self, bands: List[Tuple[int, int]], detected_regions: List[Dict], page_size: Tuple[int, int]
+    ) -> Dict:
+        """
+        Advanced region classification combining horizontal bands and contour detection.
+        """
+        width, height = page_size
+        regions = {
+            "header_bands": [],
+            "footer_bands": [],
+            "detected_regions": detected_regions,
+            "estimated_margins": {},
+        }
+
+        # Process horizontal bands
+        for start_y, end_y in bands:
+            band_center = (start_y + end_y) / 2
+            band_height = end_y - start_y
+
+            # Classify based on position
+            if band_center < height * 0.25:  # Top 25%
+                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+            elif band_center > height * 0.75:  # Bottom 25%
+                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+
+        # Estimate margins using both techniques
+        regions["estimated_margins"] = self._estimate_margins_advanced(regions, detected_regions, page_size)
+
+        return regions
+
+    def _estimate_margins_advanced(
+        self, regions: Dict, detected_regions: List[Dict], page_size: Tuple[int, int]
+    ) -> Dict:
+        """
+        Advanced margin estimation using both band and contour information.
+        """
+        width, height = page_size
+        margins = {
+            "top": 0,
+            "bottom": 0,
+            "left": 50,  # Default estimates
+            "right": 50,
+        }
+
+        # Calculate top margin from header regions
+        header_sources = []
+        if regions["header_bands"]:
+            header_sources.append(max(band["end_y"] for band in regions["header_bands"]))
+
+        # Add header regions from contour detection
+        header_regions = [r for r in detected_regions if r["type"] == "header"]
+        if header_regions:
+            header_sources.append(max(r["y"] + r["height"] for r in header_regions))
+
+        if header_sources:
+            margins["top"] = max(header_sources)
+
+        # Calculate bottom margin from footer regions
+        footer_sources = []
+        if regions["footer_bands"]:
+            footer_sources.append(min(band["start_y"] for band in regions["footer_bands"]))
+
+        # Add footer regions from contour detection
+        footer_regions = [r for r in detected_regions if r["type"] == "footer"]
+        if footer_regions:
+            footer_sources.append(min(r["y"] for r in footer_regions))
+
+        if footer_sources:
+            margins["bottom"] = height - min(footer_sources)
+
+        # Calculate left/right margins from contour detection
+        left_regions = [r for r in detected_regions if r["type"] == "left_margin"]
+        if left_regions:
+            margins["left"] = max(r["x"] + r["width"] for r in left_regions)
+
+        right_regions = [r for r in detected_regions if r["type"] == "right_margin"]
+        if right_regions:
+            margins["right"] = width - min(r["x"] for r in right_regions)
+
+        # Convert to relative percentages for consistency
+        return {
+            "top_px": margins["top"],
+            "bottom_px": margins["bottom"],
+            "left_px": margins["left"],
+            "right_px": margins["right"],
+            "top_percent": (margins["top"] / height) * 100 if height > 0 else 0,
+            "bottom_percent": (margins["bottom"] / height) * 100 if height > 0 else 0,
+            "left_percent": (margins["left"] / width) * 100 if width > 0 else 0,
+            "right_percent": (margins["right"] / width) * 100 if width > 0 else 0,
+        }
+
+    def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict:
+        """
+        Classify horizontal bands into headers, footers, and margins.
+        """
+        width, height = page_size
+        regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}}
+
+        for start_y, end_y in bands:
+            band_center = (start_y + end_y) / 2
+            band_height = end_y - start_y
+
+            # Classify based on position
+            if band_center < height * 0.25:  # Top 25%
+                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+            elif band_center > height * 0.75:  # Bottom 25%
+                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+
+        # Estimate margins based on bands
+        regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size)
+
+        return regions
+
+    def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict:
+        """
+        Estimate page margins based on detected bands.
+        """
+        width, height = page_size
+        margins = {
+            "top": 0,
+            "bottom": 0,
+            "left": 50,  # Default estimates
+            "right": 50,
+        }
+
+        # Calculate top margin from header bands
+        if regions["header_bands"]:
+            max_header_end = max(band["end_y"] for band in regions["header_bands"])
+            margins["top"] = max_header_end
+
+        # Calculate bottom margin from footer bands
+        if regions["footer_bands"]:
+            min_footer_start = min(band["start_y"] for band in regions["footer_bands"])
+            margins["bottom"] = height - min_footer_start
+
+        # Convert to relative percentages for consistency
+        return {
+            "top_px": margins["top"],
+            "bottom_px": margins["bottom"],
+            "left_px": margins["left"],
+            "right_px": margins["right"],
+            "top_percent": (margins["top"] / height) * 100,
+            "bottom_percent": (margins["bottom"] / height) * 100,
+            "left_percent": (margins["left"] / width) * 100,
+            "right_percent": (margins["right"] / width) * 100,
+        }
+
+    def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict:
+        """
+        Aggregate margin data from multiple page comparisons.
+        """
+        # Average the margin estimates
+        all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")]
+
+        if not all_margins:
+            return self._analyze_single_page_size(page_size)
+
+        # Calculate average margins
+        avg_margins = {}
+        for key in [
+            "top_px",
+            "bottom_px",
+            "left_px",
+            "right_px",
+            "top_percent",
+            "bottom_percent",
+            "left_percent",
+            "right_percent",
+        ]:
+            values = [m.get(key, 0) for m in all_margins if key in m]
+            avg_margins[key] = sum(values) / len(values) if values else 0
+
+        # Collect all bands and regions
+        all_header_bands = []
+        all_footer_bands = []
+        all_detected_regions = []
+
+        for data in margin_data:
+            all_header_bands.extend(data.get("header_bands", []))
+            all_footer_bands.extend(data.get("footer_bands", []))
+            all_detected_regions.extend(data.get("detected_regions", []))
+
+        # Aggregate detected regions by type
+        region_stats = {}
+        for region in all_detected_regions:
+            region_type = region["type"]
+            if region_type not in region_stats:
+                region_stats[region_type] = []
+            region_stats[region_type].append(region)
+
+        return {
+            "layout_analysis": {
+                "header_bands": all_header_bands,
+                "footer_bands": all_footer_bands,
+                "detected_regions": all_detected_regions,
+                "region_statistics": {
+                    region_type: {
+                        "count": len(regions),
+                        "avg_area": sum(r["area"] for r in regions) / len(regions) if regions else 0,
+                        "total_area": sum(r["area"] for r in regions),
+                    }
+                    for region_type, regions in region_stats.items()
+                },
+                "estimated_margins": avg_margins,
+                "analysis_method": "multi_page_comparison_advanced",
+            }
+        }
+
+    def _analyze_single_page(self, image: PILImage) -> Dict:
+        """
+        Analyze a single page when comparison isn't possible.
+        """
+        return self._analyze_single_page_size(image.size)
+
+    def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict:
+        """
+        Provide default margin estimates for single page analysis.
+        """
+        width, height = page_size
+
+        # Use common academic paper margins as defaults
+        default_margins = {
+            "top_px": int(height * 0.1),  # 10% top margin
+            "bottom_px": int(height * 0.1),  # 10% bottom margin
+            "left_px": int(width * 0.1),  # 10% left margin
+            "right_px": int(width * 0.1),  # 10% right margin
+            "top_percent": 10.0,
+            "bottom_percent": 10.0,
+            "left_percent": 10.0,
+            "right_percent": 10.0,
+        }
+
+        return {
+            "layout_analysis": {
+                "header_bands": [],
+                "footer_bands": [],
+                "estimated_margins": default_margins,
+                "analysis_method": "default_estimates",
+            }
+        }
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
index a0ff0fd19..5395a7fc3 100644
--- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -18,31 +18,13 @@
 import os
 import tempfile
 import time
-from dataclasses import dataclass
 from io import BytesIO
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional
 from uuid import uuid4
 
-import numpy as np
 from pydantic import Field
 from pydantic_settings import BaseSettings
 
-try:
-    pass
-
-    CV2_AVAILABLE = True
-except ImportError:
-    CV2_AVAILABLE = False
-
-try:
-    from pdf2image import convert_from_bytes
-    from PIL import ImageChops
-    from PIL.Image import Image as PILImage
-
-    PDF2IMAGE_AVAILABLE = True
-except ImportError:
-    PDF2IMAGE_AVAILABLE = False
-
 try:
     import ocrmypdf
 
@@ -50,273 +32,12 @@
 except ImportError:
     OCRMYPDF_AVAILABLE = False
 
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PDFProcessingResult:
-    """
-    Result of PDF preprocessing containing both processed data and analysis metadata.
-    """
-
-    processed_data: bytes
-    metadata: Dict
-
-
-class PDFAnalyzer:
-    """
-    Analyzes PDF layout structure to detect margins, headers, footers, and other regions.
-    """
-
-    def __init__(self):
-        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
-
-    def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
-        """
-        Analyze PDF layout to extract margin and region information.
-
-        Args:
-            pdf_data: PDF file data as bytes
-            filename: Filename for logging
-
-        Returns:
-            Dictionary containing layout analysis metadata
-        """
-        if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE):
-            self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis")
-            return {"analysis_available": False, "error": "Missing dependencies"}
-
-        try:
-            # Convert PDF to images
-            images = convert_from_bytes(pdf_data, dpi=150)  # Lower DPI for analysis
-            if not images:
-                return {"analysis_available": False, "error": "No pages found"}
-
-            self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages")
-
-            # Analyze layout
-            layout_data = self._analyze_page_layout(images)
-
-            return {
-                "analysis_available": True,
-                "total_pages": len(images),
-                "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {},
-                **layout_data,
-            }
-
-        except Exception as e:
-            self.logger.error(f"PDF layout analysis failed for {filename}: {e}")
-            return {"analysis_available": False, "error": str(e)}
-
-    def _analyze_page_layout(self, images: List[PILImage]) -> Dict:
-        """
-        Analyze page layout by comparing pages to find common regions.
-        """
-        if len(images) < 2:
-            return self._analyze_single_page(images[0]) if images else {}
-
-        # Use first page as reference, compare with others
-        reference_img = images[0].convert("RGB")
-        margin_data = []
-
-        for i in range(1, min(len(images), 5)):  # Analyze up to 5 pages for efficiency
-            compare_img = images[i].convert("RGB")
-            page_margins = self._compare_pages_for_margins(reference_img, compare_img)
-            if page_margins:
-                margin_data.append(page_margins)
-
-        # Aggregate margin data
-        if margin_data:
-            return self._aggregate_margin_data(margin_data, reference_img.size)
-        else:
-            return self._analyze_single_page(reference_img)
-
-    def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]:
-        """
-        Compare two pages to identify common regions (headers, footers, margins).
-        """
-        try:
-            # Ensure same size
-            if reference.size != compare.size:
-                compare = compare.resize(reference.size)
-
-            # Compute difference and create sameness mask
-            diff = ImageChops.difference(reference, compare)
-            sameness_mask = ImageChops.invert(diff.convert("L"))
-
-            # Find horizontal bands (potential headers/footers)
-            horizontal_bands = self._find_horizontal_bands(sameness_mask)
-
-            # Classify regions
-            regions = self._classify_regions(horizontal_bands, reference.size)
-
-            return regions
-
-        except Exception as e:
-            self.logger.debug(f"Page comparison failed: {e}")
-            return None
-
-    def _find_horizontal_bands(
-        self, mask: PILImage, min_height: int = 15, min_ratio: float = 0.95
-    ) -> List[Tuple[int, int]]:
-        """
-        Find horizontal bands of similar content across pages.
-        """
-        mask_np = np.array(mask.convert("L"))
-        h, w = mask_np.shape
-
-        # Calculate row-wise similarity
-        row_sums = np.sum(mask_np == 255, axis=1) / w
-        same_rows = row_sums >= min_ratio
-
-        # Find contiguous bands
-        bands = []
-        start = None
-
-        for i, is_same in enumerate(same_rows):
-            if is_same and start is None:
-                start = i
-            elif not is_same and start is not None:
-                if i - start >= min_height:
-                    bands.append((start, i))
-                start = None
-
-        # Handle band that extends to end
-        if start is not None and h - start >= min_height:
-            bands.append((start, h))
-
-        return bands
-
-    def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict:
-        """
-        Classify horizontal bands into headers, footers, and margins.
-        """
-        width, height = page_size
-        regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}}
-
-        for start_y, end_y in bands:
-            band_center = (start_y + end_y) / 2
-            band_height = end_y - start_y
-
-            # Classify based on position
-            if band_center < height * 0.25:  # Top 25%
-                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
-            elif band_center > height * 0.75:  # Bottom 25%
-                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
-
-        # Estimate margins based on bands
-        regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size)
-
-        return regions
-
-    def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict:
-        """
-        Estimate page margins based on detected bands.
-        """
-        width, height = page_size
-        margins = {
-            "top": 0,
-            "bottom": 0,
-            "left": 50,  # Default estimates
-            "right": 50,
-        }
-
-        # Calculate top margin from header bands
-        if regions["header_bands"]:
-            max_header_end = max(band["end_y"] for band in regions["header_bands"])
-            margins["top"] = max_header_end
-
-        # Calculate bottom margin from footer bands
-        if regions["footer_bands"]:
-            min_footer_start = min(band["start_y"] for band in regions["footer_bands"])
-            margins["bottom"] = height - min_footer_start
-
-        # Convert to relative percentages for consistency
-        return {
-            "top_px": margins["top"],
-            "bottom_px": margins["bottom"],
-            "left_px": margins["left"],
-            "right_px": margins["right"],
-            "top_percent": (margins["top"] / height) * 100,
-            "bottom_percent": (margins["bottom"] / height) * 100,
-            "left_percent": (margins["left"] / width) * 100,
-            "right_percent": (margins["right"] / width) * 100,
-        }
-
-    def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict:
-        """
-        Aggregate margin data from multiple page comparisons.
-        """
-        # Average the margin estimates
-        all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")]
-
-        if not all_margins:
-            return self._analyze_single_page_size(page_size)
-
-        # Calculate average margins
-        avg_margins = {}
-        for key in [
-            "top_px",
-            "bottom_px",
-            "left_px",
-            "right_px",
-            "top_percent",
-            "bottom_percent",
-            "left_percent",
-            "right_percent",
-        ]:
-            values = [m.get(key, 0) for m in all_margins if key in m]
-            avg_margins[key] = sum(values) / len(values) if values else 0
-
-        # Collect all bands
-        all_header_bands = []
-        all_footer_bands = []
-
-        for data in margin_data:
-            all_header_bands.extend(data.get("header_bands", []))
-            all_footer_bands.extend(data.get("footer_bands", []))
-
-        return {
-            "layout_analysis": {
-                "header_bands": all_header_bands,
-                "footer_bands": all_footer_bands,
-                "estimated_margins": avg_margins,
-                "analysis_method": "multi_page_comparison",
-            }
-        }
-
-    def _analyze_single_page(self, image: PILImage) -> Dict:
-        """
-        Analyze a single page when comparison isn't possible.
-        """
-        return self._analyze_single_page_size(image.size)
-
-    def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict:
-        """
-        Provide default margin estimates for single page analysis.
-        """
-        width, height = page_size
-
-        # Use common academic paper margins as defaults
-        default_margins = {
-            "top_px": int(height * 0.1),  # 10% top margin
-            "bottom_px": int(height * 0.1),  # 10% bottom margin
-            "left_px": int(width * 0.1),  # 10% left margin
-            "right_px": int(width * 0.1),  # 10% right margin
-            "top_percent": 10.0,
-            "bottom_percent": 10.0,
-            "left_percent": 10.0,
-            "right_percent": 10.0,
-        }
+try:
+    from argilla_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult
 
-        return {
-            "layout_analysis": {
-                "header_bands": [],
-                "footer_bands": [],
-                "estimated_margins": default_margins,
-                "analysis_method": "default_estimates",
-            }
-        }
+    ANALYSIS_AVAILABLE = True
+except ImportError:
+    ANALYSIS_AVAILABLE = False
 
 
 class PDFPreprocessingSettings(BaseSettings):
@@ -356,6 +77,9 @@ class PDFPreprocessingSettings(BaseSettings):
 
     quiet: bool = Field(default=True, description="Suppress OCRmyPDF output messages")
 
+    # Analysis settings
+    enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection")
+
     class Config:
         env_prefix = "PREPROCESSING_"
 
@@ -377,7 +101,14 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None):
         """
         self.settings = settings or PDFPreprocessingSettings()
         self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
-        self.analyzer = PDFAnalyzer()
+
+        # Initialize analyzer if available and enabled
+        if self.settings.enable_analysis and ANALYSIS_AVAILABLE:
+            self.analyzer = PDFAnalyzer()
+        else:
+            self.analyzer = None
+            if self.settings.enable_analysis and not ANALYSIS_AVAILABLE:
+                self.logger.warning("PDF analysis is enabled but dependencies are not available")
 
         if not self.settings.enabled:
             self.logger.info("PDF preprocessing is disabled via configuration")
@@ -411,27 +142,32 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
         if not self.settings.enabled:
             self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}")
             metadata["skipped_reason"] = "preprocessing_disabled"
-            # Still run analysis on original data
-            layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
-            metadata.update(layout_analysis)
+            # Still run analysis on original data if enabled and available
+            if self.analyzer:
+                layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
+                metadata.update(layout_analysis)
             return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
         if not OCRMYPDF_AVAILABLE:
             self.logger.warning("OCRmyPDF not available, skipping preprocessing")
             metadata["skipped_reason"] = "ocrmypdf_unavailable"
-            # Still run analysis on original data
-            layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
-            metadata.update(layout_analysis)
+            # Still run analysis on original data if enabled and available
+            if self.analyzer:
+                layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
+                metadata.update(layout_analysis)
             return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
         try:
             start_time = time.time()
             self.logger.info(f"Starting PDF preprocessing and analysis for: {filename}")
 
-            # Step 1: Analyze original PDF layout
-            self.logger.debug("Analyzing PDF layout structure...")
-            layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
-            metadata.update(layout_analysis)
+            # Step 1: Analyze original PDF layout (if enabled and available)
+            if self.analyzer:
+                self.logger.debug("Analyzing PDF layout structure...")
+                layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
+                metadata.update(layout_analysis)
+            else:
+                metadata.update({"analysis_available": False, "analysis_skipped": "disabled_or_unavailable"})
 
             # Step 2: OCR preprocessing
             self.logger.debug("Starting OCRmyPDF processing...")

From ab7f39b06590e13a0f2dd75b005d995fc0c1ac51 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Tue, 5 Aug 2025 10:02:00 -0700
Subject: [PATCH 05/22] feat: update PDF preprocessing settings and add new
 document analysis schemas

---
 argilla-server/.env.dev                            |  3 ++-
 .../api/schemas/v1/document/analysis.py            | 14 ++++++++++++++
 .../api/schemas/v1/{ => document}/segments.py      |  0
 .../contexts/document/preprocessing.py             |  2 +-
 4 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py
 rename argilla-server/src/argilla_server/api/schemas/v1/{ => document}/segments.py (100%)

diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev
index 354fff256..024d40e71 100644
--- a/argilla-server/.env.dev
+++ b/argilla-server/.env.dev
@@ -23,8 +23,9 @@ ARGILLA_REDIS_URL=redis://localhost:6379/0
 
 # PDF Preprocessing
 PREPROCESSING_ENABLED=true
-PREPROCESSING_ENABLE_ANALYSIS=true
+PREPROCESSING_ENABLE_ANALYSIS=false
 PREPROCESSING_LANGUAGE='["eng"]'
 PREPROCESSING_ROTATE_PAGES=true
 PREPROCESSING_OPTIMIZE=1
+PREPROCESSING_CLEAN=false
 PREPROCESSING_QUIET=true
\ No newline at end of file
diff --git a/argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py b/argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/argilla-server/src/argilla_server/api/schemas/v1/segments.py b/argilla-server/src/argilla_server/api/schemas/v1/document/segments.py
similarity index 100%
rename from argilla-server/src/argilla_server/api/schemas/v1/segments.py
rename to argilla-server/src/argilla_server/api/schemas/v1/document/segments.py
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
index 5395a7fc3..9b72183f4 100644
--- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -59,7 +59,7 @@ class PDFPreprocessingSettings(BaseSettings):
 
     deskew: bool = Field(default=True, description="Fix skewed text")
 
-    clean: bool = Field(default=True, description="Clean up artifacts")
+    clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts")
 
     optimize: int = Field(
         default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)"

From 1a67fbc521689f78e4de9c91412e81279ca02100 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Tue, 5 Aug 2025 10:40:06 -0700
Subject: [PATCH 06/22] feat: add new preprocessing options for Tesseract
 timeout and text skipping in PDF settings

---
 argilla-server/.env.dev                       |   2 +
 .../contexts/document/preprocessing.py        | 120 +++++++-----------
 2 files changed, 51 insertions(+), 71 deletions(-)

diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev
index 024d40e71..6c9ad1694 100644
--- a/argilla-server/.env.dev
+++ b/argilla-server/.env.dev
@@ -28,4 +28,6 @@ PREPROCESSING_LANGUAGE='["eng"]'
 PREPROCESSING_ROTATE_PAGES=true
 PREPROCESSING_OPTIMIZE=1
 PREPROCESSING_CLEAN=false
+PREPROCESSING_SKIP_TEXT=true
+PREPROCESSING_TESSERACT_TIMEOUT=0
 PREPROCESSING_QUIET=true
\ No newline at end of file
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
index 9b72183f4..085daf6e5 100644
--- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -39,6 +39,8 @@
 except ImportError:
     ANALYSIS_AVAILABLE = False
 
+_LOGGER = logging.getLogger(__name__)
+
 
 class PDFPreprocessingSettings(BaseSettings):
     """
@@ -47,6 +49,9 @@ class PDFPreprocessingSettings(BaseSettings):
     All settings have the PREPROCESSING_ prefix.
     """
 
+    class Config:
+        env_prefix = "PREPROCESSING_"
+
     enabled: bool = Field(
         default=True, description="Enable PDF preprocessing with OCRmyPDF. Set to False to disable all processing."
     )
@@ -57,7 +62,7 @@ class PDFPreprocessingSettings(BaseSettings):
 
     rotate_pages: bool = Field(default=True, description="Auto-rotate pages with horizontal text")
 
-    deskew: bool = Field(default=True, description="Fix skewed text")
+    deskew: bool = Field(default=False, description="Fix skewed text")
 
     clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts")
 
@@ -69,19 +74,44 @@ class PDFPreprocessingSettings(BaseSettings):
 
     force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text")
 
-    skip_text: bool = Field(default=False, description="Skip text-based operations (OCR only for images)")
+    tesseract_timeout: int = Field(
+        default=0, description="Timeout for Tesseract OCR in seconds (0 makes OCRmyPDF skip OCR on every page)"
+    )
+
+    skip_text: bool = Field(default=True, description="Skip text-based operations (OCR only for images)")
 
     redo_ocr: bool = Field(default=False, description="Redo OCR on pages that already have OCR")
 
     progress_bar: bool = Field(default=False, description="Show progress bar during processing")
 
-    quiet: bool = Field(default=True, description="Suppress OCRmyPDF output messages")
-
-    # Analysis settings
     enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection")
 
-    class Config:
-        env_prefix = "PREPROCESSING_"
+    output_type: str = Field(
+        default="pdf",
+        description="Output type for OCRmyPDF. Set to 'pdf' to skip PDF/A conversion.",
+    )
+
+    def get_ocrmypdf_args(self) -> dict:
+        """
+        Get OCRmyPDF arguments as a dictionary for use with **kwargs.
+
+        Returns:
+            Dictionary of OCRmyPDF arguments excluding input/output parameters.
+        """
+        return {
+            "language": self.language,
+            "rotate_pages": self.rotate_pages,
+            "deskew": self.deskew,
+            "clean": self.clean,
+            "optimize": self.optimize,
+            "pdf_renderer": self.pdf_renderer,
+            "force_ocr": self.force_ocr,
+            "skip_text": self.skip_text,
+            "tesseract_timeout": self.tesseract_timeout,
+            "redo_ocr": self.redo_ocr,
+            "progress_bar": self.progress_bar,
+            "output_type": self.output_type,  # skip PDF/A conversion
+        }
 
 
 class PDFPreprocessor:
@@ -100,7 +130,6 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None):
             settings: Optional PDFPreprocessingSettings instance. If None, loads from environment.
         """
         self.settings = settings or PDFPreprocessingSettings()
-        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
 
         # Initialize analyzer if available and enabled
         if self.settings.enable_analysis and ANALYSIS_AVAILABLE:
@@ -108,12 +137,12 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None):
         else:
             self.analyzer = None
             if self.settings.enable_analysis and not ANALYSIS_AVAILABLE:
-                self.logger.warning("PDF analysis is enabled but dependencies are not available")
+                _LOGGER.warning("PDF analysis is enabled but dependencies are not available")
 
         if not self.settings.enabled:
-            self.logger.info("PDF preprocessing is disabled via configuration")
+            _LOGGER.info("PDF preprocessing is disabled via configuration")
         elif not OCRMYPDF_AVAILABLE:
-            self.logger.warning("OCRmyPDF not available, PDF preprocessing will be skipped")
+            _LOGGER.warning("OCRmyPDF not available, PDF preprocessing will be skipped")
 
     def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
         """
@@ -126,31 +155,20 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
         Returns:
             PDFProcessingResult containing processed data and layout analysis metadata
         """
-        # Initialize metadata
-        metadata = {
-            "preprocessing_enabled": self.settings.enabled,
-            "ocrmypdf_available": OCRMYPDF_AVAILABLE,
-            "original_filename": filename,
-            "processing_timestamp": time.time(),
-        }
+        metadata = {}
 
         # Handle non-PDF files or disabled preprocessing
         if not filename.lower().endswith(".pdf"):
-            metadata["skipped_reason"] = "not_pdf"
             return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
         if not self.settings.enabled:
-            self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}")
-            metadata["skipped_reason"] = "preprocessing_disabled"
-            # Still run analysis on original data if enabled and available
             if self.analyzer:
                 layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
                 metadata.update(layout_analysis)
             return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
         if not OCRMYPDF_AVAILABLE:
-            self.logger.warning("OCRmyPDF not available, skipping preprocessing")
-            metadata["skipped_reason"] = "ocrmypdf_unavailable"
+            _LOGGER.warning("OCRmyPDF not available, skipping preprocessing")
             # Still run analysis on original data if enabled and available
             if self.analyzer:
                 layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
@@ -159,62 +177,36 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
 
         try:
             start_time = time.time()
-            self.logger.info(f"Starting PDF preprocessing and analysis for: {filename}")
 
             # Step 1: Analyze original PDF layout (if enabled and available)
             if self.analyzer:
-                self.logger.debug("Analyzing PDF layout structure...")
                 layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
                 metadata.update(layout_analysis)
-            else:
-                metadata.update({"analysis_available": False, "analysis_skipped": "disabled_or_unavailable"})
 
             # Step 2: OCR preprocessing
-            self.logger.debug("Starting OCRmyPDF processing...")
             try:
                 input_buffer = BytesIO(file_data)
                 output_buffer = BytesIO()
 
-                ocrmypdf.ocr(
-                    input_buffer,
-                    output_buffer,
-                    language=self.settings.language,
-                    rotate_pages=self.settings.rotate_pages,
-                    deskew=self.settings.deskew,
-                    clean=self.settings.clean,
-                    optimize=self.settings.optimize,
-                    pdf_renderer=self.settings.pdf_renderer,
-                    force_ocr=self.settings.force_ocr,
-                    skip_text=self.settings.skip_text,
-                    redo_ocr=self.settings.redo_ocr,
-                    progress_bar=self.settings.progress_bar,
-                    quiet=self.settings.quiet,
-                )
+                ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args())
 
                 processed_data = output_buffer.getvalue()
                 output_buffer.close()
                 input_buffer.close()
 
-                metadata["ocr_method"] = "bytesio"
-
             except Exception as buffer_error:
-                self.logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
+                _LOGGER.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
                 processed_data = self._preprocess_with_temp_files(file_data, filename)
-                metadata["ocr_method"] = "temp_files"
-                metadata["ocr_fallback_reason"] = str(buffer_error)
 
             processing_time = time.time() - start_time
             metadata["processing_time_seconds"] = processing_time
-            metadata["processing_successful"] = True
-
-            self.logger.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds")
+            print(metadata)
+            _LOGGER.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds")
 
             return PDFProcessingResult(processed_data=processed_data, metadata=metadata)
 
         except Exception as e:
-            self.logger.error(f"PDF preprocessing failed for {filename}: {e}")
-            metadata["processing_successful"] = False
-            metadata["processing_error"] = str(e)
+            _LOGGER.error(f"PDF preprocessing failed for {filename}: {e}")
             return PDFProcessingResult(processed_data=file_data, metadata=metadata)
 
     def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
@@ -240,21 +232,7 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
             )
             output_temp_file.close()
 
-            ocrmypdf.ocr(
-                input_temp_file.name,
-                output_temp_file.name,
-                language=self.settings.language,
-                rotate_pages=self.settings.rotate_pages,
-                deskew=self.settings.deskew,
-                clean=self.settings.clean,
-                optimize=self.settings.optimize,
-                pdf_renderer=self.settings.pdf_renderer,
-                force_ocr=self.settings.force_ocr,
-                skip_text=self.settings.skip_text,
-                redo_ocr=self.settings.redo_ocr,
-                progress_bar=self.settings.progress_bar,
-                quiet=self.settings.quiet,
-            )
+            ocrmypdf.ocr(input_temp_file.name, output_temp_file.name, **self.settings.get_ocrmypdf_args())
 
             with open(output_temp_file.name, "rb") as f:
                 processed_data = f.read()
@@ -268,7 +246,7 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
                         if hasattr(temp_file, "name"):
                             os.unlink(temp_file.name)
                     except OSError as e:
-                        self.logger.warning(f"Failed to clean up temp file: {e}")
+                        _LOGGER.warning(f"Failed to clean up temp file: {e}")
 
 
 # Global preprocessor instance (can be configured via environment variables)

From 3f84066f7870430a4a3a85e16d70a0077ad54da6 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Tue, 5 Aug 2025 10:50:01 -0700
Subject: [PATCH 07/22] feat: introduce rotate pages threshold in PDF
 preprocessing settings and update Tesseract timeout description

---
 argilla-server/.env.dev                                |  3 ++-
 .../argilla_server/contexts/document/preprocessing.py  | 10 ++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev
index 6c9ad1694..2122a42cf 100644
--- a/argilla-server/.env.dev
+++ b/argilla-server/.env.dev
@@ -26,8 +26,9 @@ PREPROCESSING_ENABLED=true
 PREPROCESSING_ENABLE_ANALYSIS=false
 PREPROCESSING_LANGUAGE='["eng"]'
 PREPROCESSING_ROTATE_PAGES=true
+PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
 PREPROCESSING_OPTIMIZE=1
 PREPROCESSING_CLEAN=false
 PREPROCESSING_SKIP_TEXT=true
-PREPROCESSING_TESSERACT_TIMEOUT=0
+# PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR
 PREPROCESSING_QUIET=true
\ No newline at end of file
diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
index 085daf6e5..88601eb24 100644
--- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py
+++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py
@@ -62,6 +62,11 @@ class Config:
 
     rotate_pages: bool = Field(default=True, description="Auto-rotate pages with horizontal text")
 
+    rotate_pages_threshold: float = Field(
+        default=2.0,
+        description="Threshold for auto-rotation",
+    )
+
     deskew: bool = Field(default=False, description="Fix skewed text")
 
     clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts")
@@ -75,7 +80,7 @@ class Config:
     force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text")
 
     tesseract_timeout: int = Field(
-        default=0, description="Timeout for Tesseract OCR processing in seconds (0 for no timeout)"
+        default=0, description="Timeout for Tesseract OCR processing in seconds (0 to skip Tesseract OCR)"
     )
 
     skip_text: bool = Field(default=True, description="Skip text-based operations (OCR only for images)")
@@ -101,6 +106,7 @@ def get_ocrmypdf_args(self) -> dict:
         return {
             "language": self.language,
             "rotate_pages": self.rotate_pages,
+            "rotate_pages_threshold": self.rotate_pages_threshold,
             "deskew": self.deskew,
             "clean": self.clean,
             "optimize": self.optimize,
@@ -110,7 +116,7 @@ def get_ocrmypdf_args(self) -> dict:
             "tesseract_timeout": self.tesseract_timeout,
             "redo_ocr": self.redo_ocr,
             "progress_bar": self.progress_bar,
-            "output_type": self.output_type,  # skip PDF/A conversion
+            "output_type": self.output_type,
         }
 
 

From b69e49e4dd3963db4e8a3ed267251d787d7bccc8 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Wed, 6 Aug 2025 16:16:50 -0700
Subject: [PATCH 08/22] resolve merge conflicts after extralit_server rename

---
 extralit-server/.env.dev                             | 12 ++++++++++++
 .../contexts/document/preprocessing.py               |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev
index 9617b610b..51ec548b0 100644
--- a/extralit-server/.env.dev
+++ b/extralit-server/.env.dev
@@ -20,3 +20,15 @@ EXTRALIT_ELASTICSEARCH=http://localhost:9200
 
 # Redis configuration
 EXTRALIT_REDIS_URL=redis://localhost:6379/0
+
+# PDF Preprocessing
+PREPROCESSING_ENABLED=true
+PREPROCESSING_ENABLE_ANALYSIS=false
+PREPROCESSING_LANGUAGE='["eng"]'
+PREPROCESSING_ROTATE_PAGES=true
+PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
+PREPROCESSING_OPTIMIZE=1
+PREPROCESSING_CLEAN=false
+PREPROCESSING_SKIP_TEXT=true
+# PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR
+PREPROCESSING_QUIET=true
diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
index 88601eb24..b23f03fe9 100644
--- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py
+++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
@@ -33,7 +33,7 @@
     OCRMYPDF_AVAILABLE = False
 
 try:
-    from argilla_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult
+    from extralit_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult
 
     ANALYSIS_AVAILABLE = True
 except ImportError:

From 713c10749a61c7e462bc595ec49bc6ee6457d538 Mon Sep 17 00:00:00 2001
From: Priyankesh <priyankeshom@gmail.com>
Date: Fri, 8 Aug 2025 21:07:39 +0530
Subject: [PATCH 09/22] local dev setup: USERPROFILE db path, pillow 11.3.0

---
 extralit-server/.env.dev       |   2 +-
 extralit-server/pdm.lock       | 168 ++++++++++++++++++++-------------
 extralit-server/pyproject.toml |   2 +-
 3 files changed, 104 insertions(+), 68 deletions(-)

diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev
index 51ec548b0..234e8f829 100644
--- a/extralit-server/.env.dev
+++ b/extralit-server/.env.dev
@@ -1,7 +1,7 @@
 OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS
 ALEMBIC_CONFIG=src/extralit_server/alembic.ini
 EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded
-EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False
+EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE}/.extralit/extralit-dev.db?check_same_thread=False
 HF_HUB_DISABLE_TELEMETRY=1
 
 # S3 Configuration (skipped to use LocalFileStorage)
diff --git a/extralit-server/pdm.lock b/extralit-server/pdm.lock
index 731284e16..a52eb2764 100644
--- a/extralit-server/pdm.lock
+++ b/extralit-server/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "postgresql", "test"]
 strategy = []
 lock_version = "4.5.0"
-content_hash = "sha256:037bf9850aef2d48dd2d032bdac1c64e906123f5aab6cea46dff7d66d2035d37"
+content_hash = "sha256:b81b48f68a21fcdb9fe67c3d94a30419667da306d05bde66d807bcf4bb51858e"
 
 [[metadata.targets]]
 requires_python = ">=3.10"
@@ -1682,69 +1682,105 @@ files = [
 
 [[package]]
 name = "pillow"
-version = "11.0.0"
-summary = ""
-files = [
-    {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"},
-    {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"},
-    {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a65149d8ada1055029fcb665452b2814fe7d7082fcb0c5bed6db851cb69b2086"},
-    {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88a58d8ac0cc0e7f3a014509f0455248a76629ca9b604eca7dc5927cc593c5e9"},
-    {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c26845094b1af3c91852745ae78e3ea47abf3dbcd1cf962f16b9a5fbe3ee8488"},
-    {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1a61b54f87ab5786b8479f81c4b11f4d61702830354520837f8cc791ebba0f5f"},
-    {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:674629ff60030d144b7bca2b8330225a9b11c482ed408813924619c6f302fdbb"},
-    {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:598b4e238f13276e0008299bd2482003f48158e2b11826862b1eb2ad7c768b97"},
-    {file = "pillow-11.0.0-cp310-cp310-win32.whl", hash = "sha256:9a0f748eaa434a41fccf8e1ee7a3eed68af1b690e75328fd7a60af123c193b50"},
-    {file = "pillow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a5629742881bcbc1f42e840af185fd4d83a5edeb96475a575f4da50d6ede337c"},
-    {file = "pillow-11.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:ee217c198f2e41f184f3869f3e485557296d505b5195c513b2bfe0062dc537f1"},
-    {file = "pillow-11.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1c1d72714f429a521d8d2d018badc42414c3077eb187a59579f28e4270b4b0fc"},
-    {file = "pillow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:499c3a1b0d6fc8213519e193796eb1a86a1be4b1877d678b30f83fd979811d1a"},
-    {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8b2351c85d855293a299038e1f89db92a2f35e8d2f783489c6f0b2b5f3fe8a3"},
-    {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f4dba50cfa56f910241eb7f883c20f1e7b1d8f7d91c750cd0b318bad443f4d5"},
-    {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:5ddbfd761ee00c12ee1be86c9c0683ecf5bb14c9772ddbd782085779a63dd55b"},
-    {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:45c566eb10b8967d71bf1ab8e4a525e5a93519e29ea071459ce517f6b903d7fa"},
-    {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b4fd7bd29610a83a8c9b564d457cf5bd92b4e11e79a4ee4716a63c959699b306"},
-    {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cb929ca942d0ec4fac404cbf520ee6cac37bf35be479b970c4ffadf2b6a1cad9"},
-    {file = "pillow-11.0.0-cp311-cp311-win32.whl", hash = "sha256:006bcdd307cc47ba43e924099a038cbf9591062e6c50e570819743f5607404f5"},
-    {file = "pillow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:52a2d8323a465f84faaba5236567d212c3668f2ab53e1c74c15583cf507a0291"},
-    {file = "pillow-11.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:16095692a253047fe3ec028e951fa4221a1f3ed3d80c397e83541a3037ff67c9"},
-    {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"},
-    {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"},
-    {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"},
-    {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"},
-    {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"},
-    {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"},
-    {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"},
-    {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"},
-    {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"},
-    {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"},
-    {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"},
-    {file = "pillow-11.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:bcd1fb5bb7b07f64c15618c89efcc2cfa3e95f0e3bcdbaf4642509de1942a699"},
-    {file = "pillow-11.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0e038b0745997c7dcaae350d35859c9715c71e92ffb7e0f4a8e8a16732150f38"},
-    {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ae08bd8ffc41aebf578c2af2f9d8749d91f448b3bfd41d7d9ff573d74f2a6b2"},
-    {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d69bfd8ec3219ae71bcde1f942b728903cad25fafe3100ba2258b973bd2bc1b2"},
-    {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:61b887f9ddba63ddf62fd02a3ba7add935d053b6dd7d58998c630e6dbade8527"},
-    {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:c6a660307ca9d4867caa8d9ca2c2658ab685de83792d1876274991adec7b93fa"},
-    {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:73e3a0200cdda995c7e43dd47436c1548f87a30bb27fb871f352a22ab8dcf45f"},
-    {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fba162b8872d30fea8c52b258a542c5dfd7b235fb5cb352240c8d63b414013eb"},
-    {file = "pillow-11.0.0-cp313-cp313-win32.whl", hash = "sha256:f1b82c27e89fffc6da125d5eb0ca6e68017faf5efc078128cfaa42cf5cb38798"},
-    {file = "pillow-11.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ba470552b48e5835f1d23ecb936bb7f71d206f9dfeee64245f30c3270b994de"},
-    {file = "pillow-11.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:846e193e103b41e984ac921b335df59195356ce3f71dcfd155aa79c603873b84"},
-    {file = "pillow-11.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4ad70c4214f67d7466bea6a08061eba35c01b1b89eaa098040a35272a8efb22b"},
-    {file = "pillow-11.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ec0d5af64f2e3d64a165f490d96368bb5dea8b8f9ad04487f9ab60dc4bb6003"},
-    {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c809a70e43c7977c4a42aefd62f0131823ebf7dd73556fa5d5950f5b354087e2"},
-    {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4b60c9520f7207aaf2e1d94de026682fc227806c6e1f55bba7606d1c94dd623a"},
-    {file = "pillow-11.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1e2688958a840c822279fda0086fec1fdab2f95bf2b717b66871c4ad9859d7e8"},
-    {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"},
-    {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"},
-    {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a0d3b115009ebb8ac3d2ebec5c2982cc693da935f4ab7bb5c8ebe2f47d36f2"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"},
-    {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"},
-    {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"},
+version = "11.3.0"
+requires_python = ">=3.9"
+summary = "Python Imaging Library (Fork)"
+files = [
+    {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"},
+    {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"},
+    {file = "pillow-11.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7107195ddc914f656c7fc8e4a5e1c25f32e9236ea3ea860f257b0436011fddd0"},
+    {file = "pillow-11.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc3e831b563b3114baac7ec2ee86819eb03caa1a2cef0b481a5675b59c4fe23b"},
+    {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f182ebd2303acf8c380a54f615ec883322593320a9b00438eb842c1f37ae50"},
+    {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4445fa62e15936a028672fd48c4c11a66d641d2c05726c7ec1f8ba6a572036ae"},
+    {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:71f511f6b3b91dd543282477be45a033e4845a40278fa8dcdbfdb07109bf18f9"},
+    {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040a5b691b0713e1f6cbe222e0f4f74cd233421e105850ae3b3c0ceda520f42e"},
+    {file = "pillow-11.3.0-cp310-cp310-win32.whl", hash = "sha256:89bd777bc6624fe4115e9fac3352c79ed60f3bb18651420635f26e643e3dd1f6"},
+    {file = "pillow-11.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:19d2ff547c75b8e3ff46f4d9ef969a06c30ab2d4263a9e287733aa8b2429ce8f"},
+    {file = "pillow-11.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:819931d25e57b513242859ce1876c58c59dc31587847bf74cfe06b2e0cb22d2f"},
+    {file = "pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722"},
+    {file = "pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288"},
+    {file = "pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d"},
+    {file = "pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494"},
+    {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58"},
+    {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f"},
+    {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e"},
+    {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94"},
+    {file = "pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0"},
+    {file = "pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac"},
+    {file = "pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd"},
+    {file = "pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4"},
+    {file = "pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69"},
+    {file = "pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d"},
+    {file = "pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6"},
+    {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7"},
+    {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024"},
+    {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809"},
+    {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d"},
+    {file = "pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149"},
+    {file = "pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d"},
+    {file = "pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542"},
+    {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd"},
+    {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8"},
+    {file = "pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f"},
+    {file = "pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c"},
+    {file = "pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd"},
+    {file = "pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e"},
+    {file = "pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1"},
+    {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805"},
+    {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8"},
+    {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2"},
+    {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b"},
+    {file = "pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3"},
+    {file = "pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51"},
+    {file = "pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580"},
+    {file = "pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e"},
+    {file = "pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d"},
+    {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced"},
+    {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c"},
+    {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8"},
+    {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59"},
+    {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe"},
+    {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c"},
+    {file = "pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788"},
+    {file = "pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31"},
+    {file = "pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e"},
+    {file = "pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12"},
+    {file = "pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a"},
+    {file = "pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632"},
+    {file = "pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673"},
+    {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027"},
+    {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77"},
+    {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874"},
+    {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a"},
+    {file = "pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214"},
+    {file = "pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635"},
+    {file = "pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6"},
+    {file = "pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae"},
+    {file = "pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653"},
+    {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6"},
+    {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36"},
+    {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b"},
+    {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477"},
+    {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50"},
+    {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b"},
+    {file = "pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12"},
+    {file = "pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db"},
+    {file = "pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d000f46e2917c705e9fb93a3606ee4a819d1e3aa7a9b442f6444f07e77cf5e25"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:527b37216b6ac3a12d7838dc3bd75208ec57c1c6d11ef01902266a5a0c14fc27"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be5463ac478b623b9dd3937afd7fb7ab3d79dd290a28e2b6df292dc75063eb8a"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8dc70ca24c110503e16918a658b869019126ecfe03109b754c402daff12b3d9f"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8"},
+    {file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"},
 ]
 
 [[package]]
@@ -2399,7 +2435,7 @@ name = "sqlalchemy"
 version = "2.0.36"
 summary = ""
 dependencies = [
-    "greenlet; python_full_version < \"3.13\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")",
+    "greenlet; (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_full_version < \"3.13\"",
     "typing-extensions",
 ]
 files = [
@@ -2571,7 +2607,7 @@ dependencies = [
     "python-dotenv",
     "pyyaml",
     "uvicorn==0.32.0",
-    "uvloop; platform_python_implementation != \"PyPy\" and (sys_platform != \"cygwin\" and sys_platform != \"win32\")",
+    "uvloop; (sys_platform != \"cygwin\" and sys_platform != \"win32\") and platform_python_implementation != \"PyPy\"",
     "watchfiles",
     "websockets",
 ]
diff --git a/extralit-server/pyproject.toml b/extralit-server/pyproject.toml
index 5e175aa62..1883a509e 100644
--- a/extralit-server/pyproject.toml
+++ b/extralit-server/pyproject.toml
@@ -60,7 +60,7 @@ dependencies = [
     "standardwebhooks>=1.0.0",
     # For HF dataset import
     "datasets >= 3.0.1",
-    "pillow >= 10.4.0",
+    "pillow>=11.3.0",
     # For Telemetry
     "huggingface-hub>=0.26.2",
     "Jinja2>=3.1.4",           # Used by huggingface-hub to render dataset card templates

From 4940b2ee3394c40043e29ac90a2638809187196f Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 11:54:45 -0700
Subject: [PATCH 10/22] Modified database URL in .env.dev to fall back to
 HOME when USERPROFILE is unset

---
 .gitignore               | 3 +++
 extralit-server/.env.dev | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 996925b96..7a49b1481 100644
--- a/.gitignore
+++ b/.gitignore
@@ -155,3 +155,6 @@ src/**/server/static/
 # App generated files
 extralit-server/src/extralit_server/static
 extralit/site
+
+# Development files
+*.db
\ No newline at end of file
diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev
index 234e8f829..56815c0bf 100644
--- a/extralit-server/.env.dev
+++ b/extralit-server/.env.dev
@@ -1,7 +1,7 @@
 OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS
 ALEMBIC_CONFIG=src/extralit_server/alembic.ini
 EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded
-EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE}/.extralit/extralit-dev.db?check_same_thread=False
+EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE:-${HOME}}/.extralit/extralit-dev.db?check_same_thread=False
 HF_HUB_DISABLE_TELEMETRY=1
 
 # S3 Configuration (skipped to use LocalFileStorage)

From 0e5937744eb5d8d71535fd9f2803fc3b154310bf Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 12:19:12 -0700
Subject: [PATCH 11/22] optimize ocrmypdf params by updating `optimize`
 level to 0, disabling fast web view optimization, enabling large image
 skipping, and setting number of `jobs` to 1

---
 .../contexts/document/preprocessing.py        | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
index b23f03fe9..9445c9b32 100644
--- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py
+++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
@@ -72,7 +72,7 @@ class Config:
     clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts")
 
     optimize: int = Field(
-        default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)"
+        default=0, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)"
     )
 
     pdf_renderer: str = Field(default="hocr", description="PDF renderer: 'auto', 'hocr', 'sandwich'")
@@ -96,6 +96,21 @@ class Config:
         description="Output type for OCRmyPDF. Set to 'pdf' to skip PDF/A conversion.",
     )
 
+    fast_web_view: int = Field(
+        default=999999,
+        description="Fast web view optimization. Set to 999999 to disable fast web view optimization.",
+    )
+
+    skip_big: bool = Field(
+        default=True,
+        description="Skip large images if some pages have large images.",
+    )
+
+    jobs: int = Field(
+        default=1,
+        description="Number of worker processes to use for OCR. Set to 1 for Docker containers with limited CPU to avoid oversubscription.",
+    )
+
     def get_ocrmypdf_args(self) -> dict:
         """
         Get OCRmyPDF arguments as a dictionary for use with **kwargs.
@@ -117,6 +132,9 @@ def get_ocrmypdf_args(self) -> dict:
             "redo_ocr": self.redo_ocr,
             "progress_bar": self.progress_bar,
             "output_type": self.output_type,
+            "fast_web_view": self.fast_web_view,
+            "skip_big": self.skip_big,
+            "jobs": self.jobs,
         }
 
 

From 804ed861967af9b8a2d48aa06b8a7dfc39e19c52 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 13:45:36 -0700
Subject: [PATCH 12/22] Update dependencies and optimize imports for lazy
 loading

- Added `lazy-loader` package to manage heavy dependencies like `cv2`, `pdf2image`, and `ocrmypdf` for improved performance.
- Updated `pdm.lock` to reflect the new package addition and modified existing dependencies.
- Cleaned up import statements in `analysis.py` and `preprocessing.py` to utilize lazy loading, ensuring these libraries are only loaded when needed.
- Removed unnecessary try-except blocks for dependency availability checks, as all required packages are now included in the application.
---
 extralit-server/.env.dev                      |  3 +-
 extralit-server/pdm.lock                      | 21 ++++++--
 extralit-server/pyproject.toml                |  6 ++-
 .../contexts/document/analysis.py             | 51 ++++++++-----------
 .../contexts/document/preprocessing.py        | 15 +++---
 .../src/extralit_server/contexts/imports.py   |  1 -
 6 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev
index 56815c0bf..8a3093239 100644
--- a/extralit-server/.env.dev
+++ b/extralit-server/.env.dev
@@ -1,7 +1,7 @@
 OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS
 ALEMBIC_CONFIG=src/extralit_server/alembic.ini
 EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded
-EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE:-${HOME}}/.extralit/extralit-dev.db?check_same_thread=False
+EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False
 HF_HUB_DISABLE_TELEMETRY=1
 
 # S3 Configuration (skipped to use LocalFileStorage)
@@ -27,7 +27,6 @@ PREPROCESSING_ENABLE_ANALYSIS=false
 PREPROCESSING_LANGUAGE='["eng"]'
 PREPROCESSING_ROTATE_PAGES=true
 PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
-PREPROCESSING_OPTIMIZE=1
 PREPROCESSING_CLEAN=false
 PREPROCESSING_SKIP_TEXT=true
 # PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR
diff --git a/extralit-server/pdm.lock b/extralit-server/pdm.lock
index a52eb2764..0de71f798 100644
--- a/extralit-server/pdm.lock
+++ b/extralit-server/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "postgresql", "test"]
 strategy = []
 lock_version = "4.5.0"
-content_hash = "sha256:b81b48f68a21fcdb9fe67c3d94a30419667da306d05bde66d807bcf4bb51858e"
+content_hash = "sha256:02264c865d9bc17964c0abbd121810947822b6ff067ac8fcd0cbcb1ef46de191"
 
 [[metadata.targets]]
 requires_python = ">=3.10"
@@ -1149,6 +1149,18 @@ files = [
     {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
 ]
 
+[[package]]
+name = "lazy-loader"
+version = "0.4"
+summary = ""
+dependencies = [
+    "packaging",
+]
+files = [
+    {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"},
+    {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"},
+]
+
 [[package]]
 name = "lxml"
 version = "6.0.0"
@@ -1683,8 +1695,7 @@ files = [
 [[package]]
 name = "pillow"
 version = "11.3.0"
-requires_python = ">=3.9"
-summary = "Python Imaging Library (Fork)"
+summary = ""
 files = [
     {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"},
     {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"},
@@ -2435,7 +2446,7 @@ name = "sqlalchemy"
 version = "2.0.36"
 summary = ""
 dependencies = [
-    "greenlet; (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_full_version < \"3.13\"",
+    "greenlet; python_full_version < \"3.13\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")",
     "typing-extensions",
 ]
 files = [
@@ -2607,7 +2618,7 @@ dependencies = [
     "python-dotenv",
     "pyyaml",
     "uvicorn==0.32.0",
-    "uvloop; (sys_platform != \"cygwin\" and sys_platform != \"win32\") and platform_python_implementation != \"PyPy\"",
+    "uvloop; platform_python_implementation != \"PyPy\" and (sys_platform != \"cygwin\" and sys_platform != \"win32\")",
     "watchfiles",
     "websockets",
 ]
diff --git a/extralit-server/pyproject.toml b/extralit-server/pyproject.toml
index 1883a509e..58ba847d7 100644
--- a/extralit-server/pyproject.toml
+++ b/extralit-server/pyproject.toml
@@ -49,6 +49,7 @@ dependencies = [
     "social-auth-core ~= 4.5.0",
     # Background processing
     "rq ~= 1.16.2",
+    "lazy-loader>=0.4",
     # Info status
     "psutil ~= 5.8, <5.10",
     # For logging, tracebacks, printing, progressbars
@@ -69,7 +70,7 @@ dependencies = [
     # For document processing
     "ocrmypdf>=16.10.4",
     "pdf2image>=1.17.0",
-    "opencv-python>=4.11.0.86"
+    "opencv-python>=4.11.0.86",
 ]
 
 [project.optional-dependencies]
@@ -186,8 +187,9 @@ worker = { cmd = "python -m extralit_server worker" }
 server-dev.composite = [
     "migrate",
     "cli database users create_default",
-    "server",
+    "server-and-worker",
 ]
+server-and-worker = { shell = "pdm run server & pdm run worker & wait" }
 test = { cmd = "pytest --verbosity=1 --disable-warnings", env_file = ".env.test" }
 test-cov = { cmd = "pytest tests --cov=extralit_server --cov-report=term --cov-report=xml --verbosity=0 --disable-warnings", env_file = ".env.test" }
 
diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index 1f097cfe6..9d97b4fd5 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -14,31 +14,24 @@
 
 import logging
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 
-try:
-    import cv2
+import lazy_loader as lazy
 
-    CV2_AVAILABLE = True
-except ImportError:
-    CV2_AVAILABLE = False
+# These are only loaded when actually used in Redis workers
+cv2 = lazy.load("cv2")
+pdf2image = lazy.load("pdf2image")
+PIL_ImageChops = lazy.load("PIL.ImageChops")
+PIL_ImageDraw = lazy.load("PIL.ImageDraw")
+PIL_Image = lazy.load("PIL.Image")
 
-try:
-    from pdf2image import convert_from_bytes
-    from PIL import ImageChops, ImageDraw
-    from PIL.Image import Image as PILImage
+# Since dependencies are packaged together, they're always available
+CV2_AVAILABLE = True
+PDF2IMAGE_AVAILABLE = True
 
-    PDF2IMAGE_AVAILABLE = True
-except ImportError:
-    PDF2IMAGE_AVAILABLE = False
-
-try:
-    pass
-
-    OCRMYPDF_AVAILABLE = True
-except ImportError:
-    OCRMYPDF_AVAILABLE = False
+# For type hints - use Any to avoid import issues at module load time
+PILImage = Any  # Will be PIL.Image.Image when loaded
 
 logger = logging.getLogger(__name__)
 
@@ -55,7 +48,7 @@ class PDFProcessingResult:
 
 def pil_to_cv(image: PILImage) -> np.ndarray:
     """Convert PIL Image to OpenCV format."""
-    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # type: ignore
 
 
 def classify_and_draw_layout_regions(
@@ -74,19 +67,19 @@ def classify_and_draw_layout_regions(
     h, w = mask_np.shape
 
     # Clean up the mask using morphological operations
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
-    cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel)
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))  # type: ignore
+    cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel)  # type: ignore
 
-    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)  # type: ignore
 
     img = reference.copy() if label else reference
     regions = []
 
     if label:
-        draw = ImageDraw.Draw(img)
+        draw = PIL_ImageDraw.Draw(img)  # type: ignore
 
     for cnt in contours:
-        x, y, rw, rh = cv2.boundingRect(cnt)
+        x, y, rw, rh = cv2.boundingRect(cnt)  # type: ignore
         area = rw * rh
 
         if area < min_area:
@@ -173,7 +166,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
 
         try:
             # Convert PDF to images
-            images = convert_from_bytes(pdf_data, dpi=150)  # Lower DPI for analysis
+            images = pdf2image.convert_from_bytes(pdf_data, dpi=150)  # type: ignore  # Lower DPI for analysis
             if not images:
                 return {"analysis_available": False, "error": "No pages found"}
 
@@ -227,8 +220,8 @@ def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) ->
                 compare = compare.resize(reference.size)
 
             # Step 1: Compute difference and invert so white = same
-            diff = ImageChops.difference(reference, compare)
-            sameness_mask = ImageChops.invert(diff.convert("L"))
+            diff = PIL_ImageChops.difference(reference, compare)  # type: ignore
+            sameness_mask = PIL_ImageChops.invert(diff.convert("L"))  # type: ignore
 
             # Step 2: Threshold the mask (keep high-sameness pixels)
             # Create a lookup table for thresholding
diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
index 9445c9b32..7e3dae3c4 100644
--- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py
+++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
@@ -22,15 +22,16 @@
 from typing import List, Optional
 from uuid import uuid4
 
+import lazy_loader as lazy
 from pydantic import Field
 from pydantic_settings import BaseSettings
 
-try:
-    import ocrmypdf
+# Lazy load OCRmyPDF to avoid loading it in the main FastAPI process
+# Only loaded when actually used in Redis workers
+ocrmypdf = lazy.load("ocrmypdf")
 
-    OCRMYPDF_AVAILABLE = True
-except ImportError:
-    OCRMYPDF_AVAILABLE = False
+# Since OCRmyPDF is packaged with the application, it's always available
+OCRMYPDF_AVAILABLE = True
 
 try:
     from extralit_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult
@@ -212,7 +213,7 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
                 input_buffer = BytesIO(file_data)
                 output_buffer = BytesIO()
 
-                ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args())
+                ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args())  # type: ignore
 
                 processed_data = output_buffer.getvalue()
                 output_buffer.close()
@@ -256,7 +257,7 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
             )
             output_temp_file.close()
 
-            ocrmypdf.ocr(input_temp_file.name, output_temp_file.name, **self.settings.get_ocrmypdf_args())
+            ocrmypdf.ocr(input_temp_file.name, output_temp_file.name, **self.settings.get_ocrmypdf_args())  # type: ignore
 
             with open(output_temp_file.name, "rb") as f:
                 processed_data = f.read()
diff --git a/extralit-server/src/extralit_server/contexts/imports.py b/extralit-server/src/extralit_server/contexts/imports.py
index 6520d8acc..a5d892c1b 100644
--- a/extralit-server/src/extralit_server/contexts/imports.py
+++ b/extralit-server/src/extralit_server/contexts/imports.py
@@ -390,7 +390,6 @@ async def process_bulk_upload(
                     reference_data=doc.document_create.model_dump(),
                     file_data_list=[],
                     user_id=user_id,
-                    job_timeout=None,  # No timeout for large uploads
                 )
 
                 # Store job ID mapped to reference key for frontend tracking

From 30c970a0ac7d764f8e9218bde4b6fe64bd25364a Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 15:33:38 -0700
Subject: [PATCH 13/22] Enable PDF analysis and update preprocessing settings

- Updated `.env.dev` to enable PDF analysis and set quiet mode to false.
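- Introduced `PDFMetadata` (in `api/schemas/v1/document/preprocessing.py`) and `PDFProcessingResponse` (in `contexts/document/preprocessing.py`) for structured metadata handling, replacing the `PDFProcessingResult` dataclass from `analysis.py`.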
- Refactored `PDFPreprocessor` to utilize the new models and improve error handling during preprocessing.
- Adjusted type hints in `analysis.py` and `preprocessing.py` for better clarity and consistency.
- Updated job processing to reflect changes in the preprocessing method.
---
 extralit-server/.env.dev                      |   4 +-
 .../api/schemas/v1/document/preprocessing.py  |  29 ++++
 .../document/__init__.py}                     |   0
 .../contexts/document/analysis.py             |  55 ++----
 .../contexts/document/preprocessing.py        | 158 ++++++++----------
 .../src/extralit_server/jobs/document_jobs.py |   2 +-
 6 files changed, 114 insertions(+), 134 deletions(-)
 create mode 100644 extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py
 rename extralit-server/src/extralit_server/{api/schemas/v1/document/analysis.py => contexts/document/__init__.py} (100%)

diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev
index 8a3093239..afbd12a3e 100644
--- a/extralit-server/.env.dev
+++ b/extralit-server/.env.dev
@@ -23,11 +23,11 @@ EXTRALIT_REDIS_URL=redis://localhost:6379/0
 
 # PDF Preprocessing
 PREPROCESSING_ENABLED=true
-PREPROCESSING_ENABLE_ANALYSIS=false
+PREPROCESSING_ENABLE_ANALYSIS=true
 PREPROCESSING_LANGUAGE='["eng"]'
 PREPROCESSING_ROTATE_PAGES=true
 PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
 PREPROCESSING_CLEAN=false
 PREPROCESSING_SKIP_TEXT=true
 # PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR
-PREPROCESSING_QUIET=true
+PREPROCESSING_QUIET=false
diff --git a/extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py b/extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py
new file mode 100644
index 000000000..5448b90ca
--- /dev/null
+++ b/extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py
@@ -0,0 +1,29 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional
+from pydantic import BaseModel
+
+
+class PDFMetadata(BaseModel):
+    """
+    Metadata for PDF processing results.
+    """
+
+    filename: str
+    processing_time: float
+    page_count: Optional[int] = None
+    language_detected: Optional[List[str]] = None
+    processing_settings: Optional[Dict] = None
+    analysis_results: Optional[Dict] = None
diff --git a/extralit-server/src/extralit_server/api/schemas/v1/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/__init__.py
similarity index 100%
rename from extralit-server/src/extralit_server/api/schemas/v1/document/analysis.py
rename to extralit-server/src/extralit_server/contexts/document/__init__.py
diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index 9d97b4fd5..209ac517a 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -13,55 +13,36 @@
 # limitations under the License.
 
 import logging
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
 import numpy as np
 
 import lazy_loader as lazy
 
-# These are only loaded when actually used in Redis workers
 cv2 = lazy.load("cv2")
 pdf2image = lazy.load("pdf2image")
-PIL_ImageChops = lazy.load("PIL.ImageChops")
-PIL_ImageDraw = lazy.load("PIL.ImageDraw")
-PIL_Image = lazy.load("PIL.Image")
-
-# Since dependencies are packaged together, they're always available
-CV2_AVAILABLE = True
-PDF2IMAGE_AVAILABLE = True
+PIL = lazy.load("PIL")
 
-# For type hints - use Any to avoid import issues at module load time
-PILImage = Any  # Will be PIL.Image.Image when loaded
+if TYPE_CHECKING:
+    from PIL.Image import Image
 
 logger = logging.getLogger(__name__)
 
 
-@dataclass
-class PDFProcessingResult:
-    """
-    Result of PDF preprocessing containing both processed data and analysis metadata.
-    """
-
-    processed_data: bytes
-    metadata: Dict
-
-
-def pil_to_cv(image: PILImage) -> np.ndarray:
+def pil_to_cv(image: Image) -> np.ndarray:
     """Convert PIL Image to OpenCV format."""
     return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # type: ignore
 
 
 def classify_and_draw_layout_regions(
-    reference: PILImage, mask: PILImage, min_area: int = 5000, label: bool = True
-) -> Tuple[PILImage, List[Dict]]:
+    reference: Image, mask: Image, min_area: int = 5000, label: bool = True
+) -> Tuple[Image, List[Dict]]:
     """
     Classify and optionally draw layout regions using contour detection.
 
     Returns:
         Tuple of (annotated image, list of detected regions)
     """
-    if not CV2_AVAILABLE:
-        return reference, []
 
     mask_np = np.array(mask.convert("L"))
     h, w = mask_np.shape
@@ -76,7 +57,7 @@ def classify_and_draw_layout_regions(
     regions = []
 
     if label:
-        draw = PIL_ImageDraw.Draw(img)  # type: ignore
+        draw = PIL.ImageDraw.Draw(img)  # type: ignore
 
     for cnt in contours:
         x, y, rw, rh = cv2.boundingRect(cnt)  # type: ignore
@@ -118,7 +99,7 @@ def classify_and_draw_layout_regions(
     return img, regions
 
 
-def find_horizontal_bands(mask: PILImage, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
+def find_horizontal_bands(mask: Image, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
     """Find horizontal bands of similar content across pages."""
     mask_np = np.array(mask.convert("L"))
     h, w = mask_np.shape
@@ -160,13 +141,9 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
         Returns:
             Dictionary containing layout analysis metadata
         """
-        if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE):
-            self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis")
-            return {"analysis_available": False, "error": "Missing dependencies"}
 
         try:
-            # Convert PDF to images
-            images = pdf2image.convert_from_bytes(pdf_data, dpi=150)  # type: ignore  # Lower DPI for analysis
+            images = pdf2image.convert_from_bytes(pdf_data, dpi=150)  # type: ignore
             if not images:
                 return {"analysis_available": False, "error": "No pages found"}
 
@@ -186,7 +163,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
             self.logger.error(f"PDF layout analysis failed for {filename}: {e}")
             return {"analysis_available": False, "error": str(e)}
 
-    def _analyze_page_layout(self, images: List[PILImage]) -> Dict:
+    def _analyze_page_layout(self, images: List[Image]) -> Dict:
         """
         Analyze page layout by comparing pages to find common regions.
         """
@@ -209,7 +186,7 @@ def _analyze_page_layout(self, images: List[PILImage]) -> Dict:
         else:
             return self._analyze_single_page(reference_img)
 
-    def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]:
+    def _compare_pages_for_margins(self, reference: Image, compare: Image) -> Optional[Dict]:
         """
         Compare two pages to identify common regions using advanced CV2 techniques.
         """
@@ -220,8 +197,8 @@ def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) ->
                 compare = compare.resize(reference.size)
 
             # Step 1: Compute difference and invert so white = same
-            diff = PIL_ImageChops.difference(reference, compare)  # type: ignore
-            sameness_mask = PIL_ImageChops.invert(diff.convert("L"))  # type: ignore
+            diff = PIL.ImageChops.difference(reference, compare)  # type: ignore
+            sameness_mask = PIL.ImageChops.invert(diff.convert("L"))  # type: ignore
 
             # Step 2: Threshold the mask (keep high-sameness pixels)
             # Create a lookup table for thresholding
@@ -454,7 +431,7 @@ def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int,
             }
         }
 
-    def _analyze_single_page(self, image: PILImage) -> Dict:
+    def _analyze_single_page(self, image: Image) -> Dict:
         """
         Analyze a single page when comparison isn't possible.
         """
diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
index 7e3dae3c4..b2a22f02b 100644
--- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py
+++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
@@ -19,28 +19,30 @@
 import tempfile
 import time
 from io import BytesIO
-from typing import List, Optional
+from typing import List
+from dataclasses import dataclass
 from uuid import uuid4
 
 import lazy_loader as lazy
 from pydantic import Field
 from pydantic_settings import BaseSettings
+from extralit_server.api.schemas.v1.document.preprocessing import PDFMetadata
+from extralit_server.contexts.document.analysis import PDFAnalyzer
+
 
-# Lazy load OCRmyPDF to avoid loading it in the main FastAPI process
-# Only loaded when actually used in Redis workers
 ocrmypdf = lazy.load("ocrmypdf")
 
-# Since OCRmyPDF is packaged with the application, it's always available
-OCRMYPDF_AVAILABLE = True
+_LOGGER = logging.getLogger(__name__)
 
-try:
-    from extralit_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult
 
-    ANALYSIS_AVAILABLE = True
-except ImportError:
-    ANALYSIS_AVAILABLE = False
+@dataclass
+class PDFProcessingResponse:
+    """
+    Result of PDF preprocessing containing both processed data and analysis metadata.
+    """
 
-_LOGGER = logging.getLogger(__name__)
+    processed_data: bytes
+    metadata: PDFMetadata
 
 
 class PDFPreprocessingSettings(BaseSettings):
@@ -57,6 +59,8 @@ class Config:
         default=True, description="Enable PDF preprocessing with OCRmyPDF. Set to False to disable all processing."
     )
 
+    enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection")
+
     language: List[str] = Field(
         default=["eng"], description="List of languages for OCR processing (e.g., ['eng', 'spa', 'fra'])"
     )
@@ -73,24 +77,22 @@ class Config:
     clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts")
 
     optimize: int = Field(
-        default=0, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)"
+        default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)"
     )
 
     pdf_renderer: str = Field(default="hocr", description="PDF renderer: 'auto', 'hocr', 'sandwich'")
 
     force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text")
 
-    tesseract_timeout: int = Field(
-        default=0, description="Timeout for Tesseract OCR processing in seconds (0 to skip Tesseract OCR)"
-    )
-
     skip_text: bool = Field(default=True, description="Skip text-based operations (OCR only for images)")
 
     redo_ocr: bool = Field(default=False, description="Redo OCR on pages that already have OCR")
 
-    progress_bar: bool = Field(default=False, description="Show progress bar during processing")
+    tesseract_timeout: int = Field(
+        default=0, description="Timeout for Tesseract OCR processing in seconds (0 to skip Tesseract OCR)"
+    )
 
-    enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection")
+    progress_bar: bool = Field(default=False, description="Show progress bar during processing")
 
     output_type: str = Field(
         default="pdf",
@@ -102,9 +104,9 @@ class Config:
         description="Fast web view optimization. Set to 999999 to disable fast web view optimization.",
     )
 
-    skip_big: bool = Field(
-        default=True,
-        description="Skip large images if some pages have large images.",
+    skip_big: float = Field(
+        default=100.0,
+        description="Image size threshold in MB to skip OCR processing.",
     )
 
     jobs: int = Field(
@@ -139,6 +141,9 @@ def get_ocrmypdf_args(self) -> dict:
         }
 
 
+settings = PDFPreprocessingSettings()
+
+
 class PDFPreprocessor:
     """
     PDF preprocessor that uses OCRmyPDF for rotation, OCR, and optimization.
@@ -147,29 +152,21 @@ class PDFPreprocessor:
     Can be configured with environment variables using the PDFPreprocessingSettings.
     """
 
-    def __init__(self, settings: Optional[PDFPreprocessingSettings] = None):
+    def __init__(self, settings: PDFPreprocessingSettings = settings):
         """
         Initialize the PDF preprocessor.
 
         Args:
             settings: Optional PDFPreprocessingSettings instance. If None, loads from environment.
         """
-        self.settings = settings or PDFPreprocessingSettings()
+        self.settings = settings
 
-        # Initialize analyzer if available and enabled
-        if self.settings.enable_analysis and ANALYSIS_AVAILABLE:
+        if self.settings.enable_analysis:
             self.analyzer = PDFAnalyzer()
         else:
             self.analyzer = None
-            if self.settings.enable_analysis and not ANALYSIS_AVAILABLE:
-                _LOGGER.warning("PDF analysis is enabled but dependencies are not available")
 
-        if not self.settings.enabled:
-            _LOGGER.info("PDF preprocessing is disabled via configuration")
-        elif not OCRMYPDF_AVAILABLE:
-            _LOGGER.warning("OCRmyPDF not available, PDF preprocessing will be skipped")
-
-    def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
+    def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResponse:
         """
         Preprocess PDF with OCRmyPDF and analyze layout structure.
 
@@ -180,59 +177,55 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult:
         Returns:
             PDFProcessingResult containing processed data and layout analysis metadata
         """
-        metadata = {}
+        # Initialize metadata variables
+        analysis_results = None
+        processing_time = 0.0
+        processed_data = file_data
 
-        # Handle non-PDF files or disabled preprocessing
+        # Handle non-PDF files
         if not filename.lower().endswith(".pdf"):
-            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
+            pass  # Use default values
 
-        if not self.settings.enabled:
+        # Handle disabled preprocessing
+        elif not self.settings.enabled:
             if self.analyzer:
-                layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
-                metadata.update(layout_analysis)
-            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
+                analysis_results = self.analyzer.analyze_pdf_layout(file_data, filename)
 
-        if not OCRMYPDF_AVAILABLE:
-            _LOGGER.warning("OCRmyPDF not available, skipping preprocessing")
-            # Still run analysis on original data if enabled and available
-            if self.analyzer:
-                layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
-                metadata.update(layout_analysis)
-            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
+        # Handle PDF processing
+        else:
+            try:
+                start_time = time.time()
 
-        try:
-            start_time = time.time()
+                # Step 1: Analyze original PDF layout (if enabled)
+                if self.analyzer:
+                    analysis_results = self.analyzer.analyze_pdf_layout(file_data, filename)
 
-            # Step 1: Analyze original PDF layout (if enabled and available)
-            if self.analyzer:
-                layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename)
-                metadata.update(layout_analysis)
+                # Step 2: OCR preprocessing
+                try:
+                    input_buffer = BytesIO(file_data)
+                    output_buffer = BytesIO()
 
-            # Step 2: OCR preprocessing
-            try:
-                input_buffer = BytesIO(file_data)
-                output_buffer = BytesIO()
+                    ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args())  # type: ignore
 
-                ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args())  # type: ignore
+                    processed_data = output_buffer.getvalue()
+                    output_buffer.close()
+                    input_buffer.close()
 
-                processed_data = output_buffer.getvalue()
-                output_buffer.close()
-                input_buffer.close()
+                except Exception as buffer_error:
+                    _LOGGER.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
+                    processed_data = self._preprocess_with_temp_files(file_data, filename)
 
-            except Exception as buffer_error:
-                _LOGGER.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}")
-                processed_data = self._preprocess_with_temp_files(file_data, filename)
+                processing_time = time.time() - start_time
+                print(filename, analysis_results)
 
-            processing_time = time.time() - start_time
-            metadata["processing_time_seconds"] = processing_time
-            print(metadata)
-            _LOGGER.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds")
+            except Exception:
+                # Use default values on error
+                pass
 
-            return PDFProcessingResult(processed_data=processed_data, metadata=metadata)
+        # Single PDFMetadata initialization for all code paths
+        metadata = PDFMetadata(filename=filename, processing_time=processing_time, analysis_results=analysis_results)
 
-        except Exception as e:
-            _LOGGER.error(f"PDF preprocessing failed for {filename}: {e}")
-            return PDFProcessingResult(processed_data=file_data, metadata=metadata)
+        return PDFProcessingResponse(processed_data=processed_data, metadata=metadata)
 
     def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
         """
@@ -274,23 +267,4 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes:
                         _LOGGER.warning(f"Failed to clean up temp file: {e}")
 
 
-# Global preprocessor instance (can be configured via environment variables)
-pdf_preprocessor = PDFPreprocessor()
-
-
-def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes:
-    """
-    Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation.
-
-    This function provides backward compatibility by using the global pdf_preprocessor instance.
-    For new code, consider using PDFPreprocessor directly for better configuration control.
-
-    Args:
-        file_data: PDF file data as bytes
-        filename: Original filename for logging purposes
-
-    Returns:
-        Processed PDF data as bytes (or original bytes if processing fails)
-    """
-    result = pdf_preprocessor.preprocess(file_data, filename)
-    return result.processed_data
+preprocessor = PDFPreprocessor()
diff --git a/extralit-server/src/extralit_server/jobs/document_jobs.py b/extralit-server/src/extralit_server/jobs/document_jobs.py
index db8c433ce..d6c2788c3 100644
--- a/extralit-server/src/extralit_server/jobs/document_jobs.py
+++ b/extralit-server/src/extralit_server/jobs/document_jobs.py
@@ -132,7 +132,7 @@ async def upload_reference_documents_job(
 
                     try:
                         # Preprocess PDF files with OCRmyPDF for rotation and OCR, plus layout analysis
-                        preprocessing_result = preprocessing.pdf_preprocessor.preprocess(
+                        preprocessing_result = preprocessing.preprocessor.preprocess(
                             file_data=file_data, filename=filename
                         )
                         processed_file_data = preprocessing_result.processed_data

From 1eacf585a4baee2402414b0a5d599c07dcdf14f3 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 15:36:30 -0700
Subject: [PATCH 14/22] fix typechecking

---
 .../extralit_server/contexts/document/analysis.py  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index 209ac517a..c488ae91e 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -29,14 +29,14 @@
 logger = logging.getLogger(__name__)
 
 
-def pil_to_cv(image: Image) -> np.ndarray:
+def pil_to_cv(image: "Image") -> np.ndarray:
     """Convert PIL Image to OpenCV format."""
     return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # type: ignore
 
 
 def classify_and_draw_layout_regions(
-    reference: Image, mask: Image, min_area: int = 5000, label: bool = True
-) -> Tuple[Image, List[Dict]]:
+    reference: "Image", mask: "Image", min_area: int = 5000, label: bool = True
+) -> Tuple["Image", List[Dict]]:
     """
     Classify and optionally draw layout regions using contour detection.
 
@@ -99,7 +99,7 @@ def classify_and_draw_layout_regions(
     return img, regions
 
 
-def find_horizontal_bands(mask: Image, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
+def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
     """Find horizontal bands of similar content across pages."""
     mask_np = np.array(mask.convert("L"))
     h, w = mask_np.shape
@@ -163,7 +163,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
             self.logger.error(f"PDF layout analysis failed for {filename}: {e}")
             return {"analysis_available": False, "error": str(e)}
 
-    def _analyze_page_layout(self, images: List[Image]) -> Dict:
+    def _analyze_page_layout(self, images: List["Image"]) -> Dict:
         """
         Analyze page layout by comparing pages to find common regions.
         """
@@ -186,7 +186,7 @@ def _analyze_page_layout(self, images: List[Image]) -> Dict:
         else:
             return self._analyze_single_page(reference_img)
 
-    def _compare_pages_for_margins(self, reference: Image, compare: Image) -> Optional[Dict]:
+    def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Optional[Dict]:
         """
         Compare two pages to identify common regions using advanced CV2 techniques.
         """
@@ -431,7 +431,7 @@ def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int,
             }
         }
 
-    def _analyze_single_page(self, image: Image) -> Dict:
+    def _analyze_single_page(self, image: "Image") -> Dict:
         """
         Analyze a single page when comparison isn't possible.
         """

From 8d0704c89cb2d3f4f7bb38ca463802837c926fd0 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 16:35:21 -0700
Subject: [PATCH 15/22] Refactor image handling and lazy load dependencies

- Updated image handling in `_media.py`, `_hub.py`, `_datasets.py`, and the server's `contexts/hub.py` to lazy-load the `PIL` and `datasets` modules.
- Adjusted type hints and checks to ensure compatibility with lazy-loaded imports.
- Enhanced error handling and type checking for image processing functions.
- Renamed the `DataframeData.schema` field to `schema_` and kept `schema` as its alias.
- Moved the Typer `app` definition from `cli/__main__.py` into `cli/__init__.py`; `__main__.py` now imports it from the package.
---
 .../extralit_server/api/schemas/v1/imports.py |  2 +-
 .../src/extralit_server/cli/__init__.py       | 34 +++++++++++++------
 .../src/extralit_server/cli/__main__.py       | 16 +--------
 .../src/extralit_server/contexts/hub.py       | 29 ++++++++++------
 extralit/src/extralit/_helpers/_media.py      | 21 +++++++-----
 extralit/src/extralit/datasets/_io/_hub.py    | 20 ++++++-----
 .../src/extralit/records/_dataset_records.py  |  9 ++---
 extralit/src/extralit/records/_io/__init__.py |  1 -
 .../src/extralit/records/_io/_datasets.py     | 33 ++++++++++--------
 9 files changed, 91 insertions(+), 74 deletions(-)

diff --git a/extralit-server/src/extralit_server/api/schemas/v1/imports.py b/extralit-server/src/extralit_server/api/schemas/v1/imports.py
index 654fff606..837a004cf 100644
--- a/extralit-server/src/extralit_server/api/schemas/v1/imports.py
+++ b/extralit-server/src/extralit_server/api/schemas/v1/imports.py
@@ -68,7 +68,7 @@ class DataframeSchema(BaseModel):
 class DataframeData(BaseModel):
     """Tabular dataframe representation for generalized import support."""
 
-    schema: DataframeSchema = Field(..., description="Schema definition with fields and primary key")
+    schema_: DataframeSchema = Field(..., alias="schema", description="Schema definition with fields and primary key")
     data: List[Dict[str, Any]] = Field(..., description="List of data rows as dictionaries")
 
 
diff --git a/extralit-server/src/extralit_server/cli/__init__.py b/extralit-server/src/extralit_server/cli/__init__.py
index b0ad568f3..0e75fb520 100644
--- a/extralit-server/src/extralit_server/cli/__init__.py
+++ b/extralit-server/src/extralit_server/cli/__init__.py
@@ -1,18 +1,30 @@
-#  Copyright 2021-present, the Recognai S.L. team.
+# Copyright 2024-present, Extralit Labs, Inc.
 #
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-from .__main__ import app
+import typer
+
+from .database import app as database_app
+from .search_engine import app as search_engine_app
+from .start import start
+from .worker import worker
+
+app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True)
+
+app.add_typer(database_app, name="database")
+app.add_typer(search_engine_app, name="search-engine")
+app.command(name="worker", help="Starts rq workers")(worker)
+app.command(name="start", help="Starts the Extralit server")(start)
 
 if __name__ == "__main__":
     app()
diff --git a/extralit-server/src/extralit_server/cli/__main__.py b/extralit-server/src/extralit_server/cli/__main__.py
index 3c98db185..7d7ff85aa 100644
--- a/extralit-server/src/extralit_server/cli/__main__.py
+++ b/extralit-server/src/extralit_server/cli/__main__.py
@@ -12,21 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import typer
-
-from .database import app as database_app
-from .search_engine import app as search_engine_app
-from .start import start
-from .worker import worker
-
-app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True)
-
-
-app.add_typer(database_app, name="database")
-app.add_typer(search_engine_app, name="search-engine")
-app.command(name="worker", help="Starts rq workers")(worker)
-app.command(name="start", help="Starts the Extralit server")(start)
-
+from extralit_server.cli import app
 
 if __name__ == "__main__":
     app()
diff --git a/extralit-server/src/extralit_server/contexts/hub.py b/extralit-server/src/extralit_server/contexts/hub.py
index 8f8ceb981..9d8fbf1a9 100644
--- a/extralit-server/src/extralit_server/contexts/hub.py
+++ b/extralit-server/src/extralit_server/contexts/hub.py
@@ -19,16 +19,23 @@
 
 from uuid import uuid4
 from pathlib import Path
-from typing import Any, Optional, List
+from typing import Any, Optional, List, TYPE_CHECKING
 from typing_extensions import Self
 from tempfile import TemporaryDirectory
 
-from PIL import Image
+import lazy_loader as lazy
+
+datasets = lazy.load("datasets")
+PIL = lazy.load("PIL")
+
 from sqlalchemy.orm import selectinload
 from sqlalchemy.ext.asyncio import AsyncSession
 from pydantic import BaseModel
 from huggingface_hub import HfApi, DatasetCard, DatasetCardData
-from datasets import Dataset as HFDataset, NamedSplit, load_dataset, features
+
+if TYPE_CHECKING:
+    from PIL import Image
+    from datasets import Dataset as HFDataset
 
 from extralit_server.contexts import info
 from extralit_server.database import get_sync_db
@@ -62,7 +69,7 @@
 
 class HubDataset:
     def __init__(self, name: str, subset: str, split: str, mapping: HubDatasetMapping):
-        self.dataset: HFDataset = load_dataset(path=name, name=subset, split=split, streaming=True)  # type: ignore
+        self.dataset: "HFDataset" = datasets.load_dataset(path=name, name=subset, split=split, streaming=True)  # type: ignore
         self.split = split
         self.mapping = mapping
         self.mapping_feature_names = mapping.sources
@@ -121,14 +128,14 @@ def _batch_index_to_row(self, batch: dict, index: int) -> dict:
         return row
 
     def _cast_feature_value(self, feature: Any, value: Any) -> Any:
-        if isinstance(feature, features.ClassLabel):
+        if isinstance(feature, datasets.features.ClassLabel):  # type: ignore
             if value == FEATURE_CLASS_LABEL_NO_LABEL:
                 return None
             else:
                 return feature.int2str(value)
-        elif isinstance(feature, features.Sequence):
+        elif isinstance(feature, datasets.features.Sequence):  # type: ignore
             return [self._cast_feature_value(feature.feature, v) for v in value]
-        elif isinstance(feature, features.Image) and isinstance(value, Image.Image):
+        elif isinstance(feature, datasets.features.Image) and isinstance(value, PIL.Image.Image):  # type: ignore
             return pil_image_to_data_url(value)
         else:
             return value
@@ -231,7 +238,9 @@ def __init__(self, dataset: Dataset):
         self.cache_version = uuid4()
 
     def export_to(self, name: str, subset: str, split: str, private: bool, token: str) -> None:
-        hf_dataset: HFDataset = HFDataset.from_generator(self._rows_generator, split=NamedSplit(split))  # type: ignore
+        hf_dataset: "HFDataset" = datasets.Dataset.from_generator(
+            self._rows_generator, split=datasets.NamedSplit(split)
+        )  # type: ignore
         hf_dataset.push_to_hub(
             repo_id=name,
             config_name=subset,
@@ -285,7 +294,7 @@ def _row_fields(self, record: Record) -> dict:
             feature_value = record.fields.get(field.name)
 
             if field.is_image and feature_value is not None and feature_value.startswith("data:"):
-                row_fields[feature_name] = Image.open(io.BytesIO(data_url_to_bytes(feature_value)))
+                row_fields[feature_name] = PIL.Image.open(io.BytesIO(data_url_to_bytes(feature_value)))  # type: ignore
             else:
                 row_fields[feature_name] = feature_value
 
@@ -474,7 +483,7 @@ def _create_readme_file(self, directory: str, repo_id: str) -> None:
         card.save(os.path.join(directory, "README.md"))
 
 
-def pil_image_to_data_url(image: Image.Image):
+def pil_image_to_data_url(image: "Image"):
     buffer = io.BytesIO()
 
     image_format = image.format or DATA_URL_DEFAULT_IMAGE_FORMAT
diff --git a/extralit/src/extralit/_helpers/_media.py b/extralit/src/extralit/_helpers/_media.py
index 01a7edc02..09c0ceb72 100644
--- a/extralit/src/extralit/_helpers/_media.py
+++ b/extralit/src/extralit/_helpers/_media.py
@@ -16,9 +16,14 @@
 import io
 import warnings
 from pathlib import Path
-from typing import Union, Optional
+from typing import Union, Optional, TYPE_CHECKING
 
-from PIL import Image
+import lazy_loader as lazy
+
+PIL = lazy.load("PIL")
+
+if TYPE_CHECKING:
+    from PIL.Image import Image
 
 
 def pil_to_data_uri(image_object: Optional["Image"]) -> Optional[str]:
@@ -30,7 +35,7 @@ def pil_to_data_uri(image_object: Optional["Image"]) -> Optional[str]:
     """
     if image_object is None:
         return None
-    if not isinstance(image_object, Image.Image):
+    if not isinstance(image_object, PIL.Image.Image):  # type: ignore
         raise ValueError("The image_object must be a PIL Image object.")
 
     image_format = image_object.format
@@ -82,28 +87,28 @@ def cast_image(image: Union["Image", str, Path]) -> str:
             return filepath_to_data_uri(image)
     elif isinstance(image, Path):
         return filepath_to_data_uri(image)
-    elif isinstance(image, Image.Image):
+    elif isinstance(image, PIL.Image.Image):  # type: ignore
         return pil_to_data_uri(image)
     else:
         raise ValueError("The image must be a data URI string, a file path, or a PIL Image object.")
 
 
-def uncast_image(image: str) -> "Image":
+def uncast_image(image: Union[str, "Image"]) -> "Image":
     """Convert a base64 data URI string to a PIL image."""
-    if isinstance(image, Image.Image):
+    if isinstance(image, PIL.Image.Image):  # type: ignore
         return image
     elif not isinstance(image, str):
         raise ValueError("The image must be a data URI string.")
     elif image.startswith("data:image"):
         try:
             image_data = base64.b64decode(image.split(",")[1])
-            image = Image.open(io.BytesIO(image_data))
+            image = PIL.Image.open(io.BytesIO(image_data))  # type: ignore
         except Exception as e:
             raise ValueError("An error occurred while converting the data URI to a PIL image.") from e
         return image
     elif image.startswith("http"):
         return image
     elif Path(image).exists():
-        return Image.open(image)
+        return PIL.Image.open(image)  # type: ignore
     else:
         raise ValueError("The image must be a data URI string.")
diff --git a/extralit/src/extralit/datasets/_io/_hub.py b/extralit/src/extralit/datasets/_io/_hub.py
index 15851ae4d..00a5ded4c 100644
--- a/extralit/src/extralit/datasets/_io/_hub.py
+++ b/extralit/src/extralit/datasets/_io/_hub.py
@@ -19,10 +19,8 @@
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Dict, Optional, Type, Union, Literal
 from uuid import UUID
+import lazy_loader as lazy
 
-from datasets import DatasetDict
-from datasets.data_files import EmptyDatasetError
-from PIL import Image
 
 from extralit._exceptions import ImportDatasetError
 from extralit._exceptions._api import UnprocessableEntityError
@@ -34,6 +32,9 @@
 from extralit.records._mapping import IngestedRecordMapper
 from extralit.responses import Response
 
+datasets = lazy.load("datasets")
+PIL = lazy.load("PIL")
+
 if TYPE_CHECKING:
     from datasets import Dataset as HFDataset
 
@@ -143,7 +144,8 @@ def from_hub(
             A `Dataset` loaded from the Hugging Face Hub.
         """
         from extralit.settings import Settings
-        from datasets import load_dataset
+
+        # load_dataset is accessed via lazy loaded datasets module
         from huggingface_hub import snapshot_download
 
         settings = settings or "ui"
@@ -194,15 +196,15 @@ def from_hub(
 
         if with_records:
             try:
-                hf_dataset = load_dataset(
+                hf_dataset = datasets.load_dataset(  # type: ignore
                     path=repo_id,
                     split=split,
                     name=subset,
                     **kwargs,
-                )  # type: ignore
+                )
                 hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, split=split, **kwargs)
                 cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
-            except EmptyDatasetError:
+            except datasets.data_files.EmptyDatasetError:  # type: ignore
                 warnings.warn(
                     message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
                     category=UserWarning,
@@ -298,7 +300,7 @@ def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **k
             HFDataset: The single dataset.
         """
 
-        if isinstance(hf_dataset, DatasetDict) and split is None:
+        if isinstance(hf_dataset, datasets.DatasetDict) and split is None:  # type: ignore
             split = next(iter(hf_dataset.keys()))
             if len(hf_dataset.keys()) > 1:
                 warnings.warn(
@@ -326,7 +328,7 @@ def _get_sample_hf_record(hf_dataset: "HFDataset") -> Dict:
                     json.dumps(value)
                     sample_huggingface_record[key] = value
                 except TypeError:
-                    if isinstance(value, Image.Image):
+                    if isinstance(value, PIL.Image.Image):  # type: ignore
                         sample_huggingface_record[key] = pil_to_data_uri(value)
                     else:
                         sample_huggingface_record[key] = "Record value is not serializable"
diff --git a/extralit/src/extralit/records/_dataset_records.py b/extralit/src/extralit/records/_dataset_records.py
index 4c9b3fe15..1bfae04f6 100644
--- a/extralit/src/extralit/records/_dataset_records.py
+++ b/extralit/src/extralit/records/_dataset_records.py
@@ -24,13 +24,14 @@
 from extralit._models import RecordModel
 from extralit._exceptions import RecordsIngestionError
 from extralit.client import Extralit
-from extralit.records._io import GenericIO, HFDataset, HFDatasetsIO, JsonIO
+from extralit.records._io import GenericIO, HFDatasetsIO, JsonIO
 from extralit.records._mapping import IngestedRecordMapper
 from extralit.records._resource import Record
 from extralit.records._search import Query
 
 if TYPE_CHECKING:
     from extralit.datasets import Dataset
+    from datasets import Dataset as HFDataset
 
 
 class RecordErrorHandling(Enum):
@@ -246,7 +247,7 @@ def __repr__(self) -> str:
 
     def log(
         self,
-        records: Union[List[dict], List[Record], HFDataset],
+        records: Union[List[dict], List[Record], "HFDataset"],
         mapping: Optional[Dict[str, Union[str, Sequence[str]]]] = None,
         user_id: Optional[UUID] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
@@ -403,7 +404,7 @@ def from_json(self, path: Union[Path, str]) -> List[Record]:
         records = JsonIO._records_from_json(path=path)
         return self.log(records=records)
 
-    def to_datasets(self) -> HFDataset:
+    def to_datasets(self) -> "HFDataset":
         """
         Export the records to a HFDataset.
 
@@ -420,7 +421,7 @@ def to_datasets(self) -> HFDataset:
 
     def _ingest_records(
         self,
-        records: Union[List[Dict[str, Any]], List[Record], HFDataset],
+        records: Union[List[Dict[str, Any]], List[Record], "HFDataset"],
         mapping: Optional[Dict[str, Union[str, Sequence[str]]]] = None,
         user_id: Optional[UUID] = None,
         on_error: RecordErrorHandling = RecordErrorHandling.RAISE,
diff --git a/extralit/src/extralit/records/_io/__init__.py b/extralit/src/extralit/records/_io/__init__.py
index 206b7a71d..398ce4a30 100644
--- a/extralit/src/extralit/records/_io/__init__.py
+++ b/extralit/src/extralit/records/_io/__init__.py
@@ -15,4 +15,3 @@
 from extralit.records._io._datasets import HFDatasetsIO  # noqa: F401
 from extralit.records._io._generic import GenericIO  # noqa: F401
 from extralit.records._io._json import JsonIO  # noqa: F401
-from extralit.records._io._datasets import HFDataset  # noqa: F401
diff --git a/extralit/src/extralit/records/_io/_datasets.py b/extralit/src/extralit/records/_io/_datasets.py
index 9de0afddb..2432ca65e 100644
--- a/extralit/src/extralit/records/_io/_datasets.py
+++ b/extralit/src/extralit/records/_io/_datasets.py
@@ -14,14 +14,17 @@
 
 import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Union, Optional, Tuple
-
-from datasets import Dataset as HFDataset, Sequence
-from datasets import Image, ClassLabel, Value
+import lazy_loader as lazy
 
 from extralit._helpers._media import pil_to_data_uri, uncast_image
 from extralit.records._io._generic import GenericIO
 
+datasets = lazy.load("datasets")
+
+
 if TYPE_CHECKING:
+    from datasets import Dataset as HFDataset, ClassLabel
+
     from extralit.records import Record
     from extralit.datasets import Dataset
     from extralit.records._mapping import IngestedRecordMapper
@@ -41,7 +44,7 @@ def _cast_images_as_urls(hf_dataset: "HFDataset", columns: List[str]) -> "HFData
     for column in columns:
         # make an updated features object with the new column type
         features = hf_dataset.features.copy()
-        features[column] = Value("string")
+        features[column] = datasets.Value("string")  # type: ignore
         # cast the column in batches
         hf_dataset = hf_dataset.map(
             function=lambda batch: {column: [pil_to_data_uri(sample) for sample in batch]},
@@ -55,7 +58,7 @@ def _cast_images_as_urls(hf_dataset: "HFDataset", columns: List[str]) -> "HFData
     return hf_dataset
 
 
-def _int2class_name(feature: ClassLabel, value: int) -> Optional[str]:
+def _int2class_name(feature: "ClassLabel", value: int) -> Optional[str]:
     try:
         return feature.int2str(value)
     except Exception as ex:
@@ -73,7 +76,7 @@ def map2str_list(x: dict, column_name: str, features: dict):
 
     for column in columns:
         features = hf_dataset.features.copy()
-        features[column] = Sequence(Value("string"))
+        features[column] = datasets.Sequence(datasets.Value("string"))  # type: ignore
         hf_dataset = hf_dataset.map(
             map2str_list,
             fn_kwargs={"column_name": column, "features": hf_dataset.features},
@@ -103,7 +106,7 @@ def label_column2str(x: dict, column: str, features: dict) -> Dict[str, Union[st
 
     for column in columns:
         features = hf_dataset.features.copy()
-        features[column] = Value("string")
+        features[column] = datasets.Value("string")  # type: ignore
         hf_dataset = hf_dataset.map(
             label_column2str, fn_kwargs={"column": column, "features": hf_dataset.features}, features=features
         )
@@ -126,7 +129,7 @@ def _uncast_uris_as_images(hf_dataset: "HFDataset", columns: List[str]) -> "HFDa
 
     for column in columns:
         features = hf_dataset.features.copy()
-        features[column] = Image()
+        features[column] = datasets.Image()  # type: ignore
         casted_hf_dataset = hf_dataset.map(
             function=lambda batch: {column: [uncast_image(sample) for sample in batch]},
             with_indices=False,
@@ -162,7 +165,7 @@ def _uncast_label_questions_as_classlabels(hf_dataset: "HFDataset", columns: Lis
             continue
         values = list(hf_dataset.unique(column))
         features = hf_dataset.features.copy()
-        features[column] = ClassLabel(names=values)
+        features[column] = datasets.ClassLabel(names=values)  # type: ignore
         hf_dataset = hf_dataset.map(
             function=lambda batch: {column: [values.index(sample) for sample in batch]},
             with_indices=False,
@@ -191,10 +194,10 @@ def _is_hf_dataset(dataset: Any) -> bool:
         Returns:
             bool: True if the object is a Hugging Face dataset, False otherwise.
         """
-        return isinstance(dataset, HFDataset)
+        return isinstance(dataset, datasets.Dataset)  # type: ignore
 
     @staticmethod
-    def to_datasets(records: List[Union["Record", Tuple["Record", float]]], dataset: "Dataset") -> HFDataset:
+    def to_datasets(records: List[Union["Record", Tuple["Record", float]]], dataset: "Dataset") -> "HFDataset":
         """
         Export the records to a Hugging Face dataset.
 
@@ -202,7 +205,7 @@ def to_datasets(records: List[Union["Record", Tuple["Record", float]]], dataset:
             The dataset containing the records.
         """
         record_dicts = GenericIO.to_dict(records, flatten=True)
-        hf_dataset = HFDataset.from_dict(record_dicts)
+        hf_dataset = datasets.Dataset.from_dict(record_dicts)  # type: ignore
         hf_dataset = HFDatasetsIO._uncast_argilla_attributes_to_datasets(hf_dataset, dataset.schema)
         return hf_dataset
 
@@ -277,11 +280,11 @@ def to_argilla(hf_dataset: "HFDataset", mapper: "IngestedRecordMapper") -> "HFDa
         class_label_sequence_columns = []
 
         for name, feature in hf_dataset.features.items():
-            if isinstance(feature, Image):
+            if isinstance(feature, datasets.Image):  # type: ignore
                 image_columns.append(name)
-            elif isinstance(feature, ClassLabel):
+            elif isinstance(feature, datasets.ClassLabel):  # type: ignore
                 class_label_columns.append(name)
-            elif isinstance(feature, Sequence) and isinstance(feature.feature, ClassLabel):
+            elif isinstance(feature, datasets.Sequence) and isinstance(feature.feature, datasets.ClassLabel):  # type: ignore
                 class_label_sequence_columns.append(name)
 
         if image_columns:

From 484efd82ab07ce4a261a17f8050101ac80a24a28 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 16:41:05 -0700
Subject: [PATCH 16/22] add lazy-loader

---
 extralit/pdm.lock       | 14 +++++++++++++-
 extralit/pyproject.toml |  3 ++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/extralit/pdm.lock b/extralit/pdm.lock
index 37cb4cf1a..fef71b678 100644
--- a/extralit/pdm.lock
+++ b/extralit/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = []
 lock_version = "4.5.0"
-content_hash = "sha256:70ce069646b7109d8dc4217449d7053c16b14700891f5b9902207ac35ba7943c"
+content_hash = "sha256:4eed977ec0ae819f547f71e5131d350c7686fa2a8399a998cb62994db78d442c"
 
 [[metadata.targets]]
 requires_python = ">=3.9.2,<3.14"
@@ -2130,6 +2130,18 @@ files = [
     {file = "language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec"},
 ]
 
+[[package]]
+name = "lazy-loader"
+version = "0.4"
+summary = ""
+dependencies = [
+    "packaging",
+]
+files = [
+    {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"},
+    {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"},
+]
+
 [[package]]
 name = "llama-cloud"
 version = "0.1.23"
diff --git a/extralit/pyproject.toml b/extralit/pyproject.toml
index 328c37fb4..e2865f785 100644
--- a/extralit/pyproject.toml
+++ b/extralit/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "pillow>=9.5.0",
     "standardwebhooks>=1.0.0",
     "typer>=0.9.0",
+    "lazy-loader>=0.4",
 
     # for environment variables
     "python-dotenv~=1.1.0",
@@ -49,6 +50,7 @@ dependencies = [
     "fastparquet >= 2023.10.0; python_version < '3.13'",
     "fastparquet >= 2024.4.0; python_version >= '3.13'",
     "tiktoken ~= 0.9.0",
+    "bibtexparser>=1.4.3",
 
     # for llama-index
     "llama-index ~= 0.10.68",
@@ -61,7 +63,6 @@ dependencies = [
     # for weaviate vector db
     "weaviate-client >= 4",
     "llama-index-vector-stores-weaviate ~= 1.0.0",
-    "bibtexparser>=1.4.3",
 ]
 nlp = [
     "textdescriptives",

From 669d989ae85f35a42cfbeeb6e6d80314a09cfd38 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 18:01:24 -0700
Subject: [PATCH 17/22] Refactor document upload jobs and logging

- Renamed `upload_reference_documents_job` to `upload_and_preprocess_documents_job` for clarity in functionality.
- Updated references in `imports.py`, `analysis.py`, and test files to reflect the new job name.
- Improved logging consistency by standardizing logger usage across the `PDFAnalyzer` class.
---
 .../contexts/document/analysis.py               | 17 +++++------------
 .../src/extralit_server/contexts/imports.py     |  6 +++---
 .../src/extralit_server/jobs/document_jobs.py   |  4 ++--
 .../tests/unit/jobs/test_document_jobs.py       |  8 ++++----
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index c488ae91e..e470a11fd 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -26,7 +26,7 @@
 if TYPE_CHECKING:
     from PIL.Image import Image
 
-logger = logging.getLogger(__name__)
+_LOGGER = logging.getLogger(__name__)
 
 
 def pil_to_cv(image: "Image") -> np.ndarray:
@@ -123,13 +123,6 @@ def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float
 
 
 class PDFAnalyzer:
-    """
-    Analyzes PDF layout structure to detect margins, headers, footers, and other regions.
-    """
-
-    def __init__(self):
-        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
-
     def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
         """
         Analyze PDF layout to extract margin and region information.
@@ -147,7 +140,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
             if not images:
                 return {"analysis_available": False, "error": "No pages found"}
 
-            self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages")
+            _LOGGER.info(f"Analyzing layout for {filename} with {len(images)} pages")
 
             # Analyze layout
             layout_data = self._analyze_page_layout(images)
@@ -160,7 +153,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
             }
 
         except Exception as e:
-            self.logger.error(f"PDF layout analysis failed for {filename}: {e}")
+            _LOGGER.error(f"PDF layout analysis failed for {filename}: {e}")
             return {"analysis_available": False, "error": str(e)}
 
     def _analyze_page_layout(self, images: List["Image"]) -> Dict:
@@ -193,7 +186,7 @@ def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Op
         try:
             # Ensure same size
             if reference.size != compare.size:
-                self.logger.debug(f"Resizing page to match reference size")
+                _LOGGER.debug(f"Resizing page to match reference size")
                 compare = compare.resize(reference.size)
 
             # Step 1: Compute difference and invert so white = same
@@ -220,7 +213,7 @@ def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Op
             return regions
 
         except Exception as e:
-            self.logger.debug(f"Page comparison failed: {e}")
+            _LOGGER.debug(f"Page comparison failed: {e}")
             return None
 
     def _classify_regions_advanced(
diff --git a/extralit-server/src/extralit_server/contexts/imports.py b/extralit-server/src/extralit_server/contexts/imports.py
index a5d892c1b..21a2f1ae9 100644
--- a/extralit-server/src/extralit_server/contexts/imports.py
+++ b/extralit-server/src/extralit_server/contexts/imports.py
@@ -37,7 +37,7 @@
     ImportHistoryCreate,
     ImportHistoryCreateResponse,
 )
-from extralit_server.jobs.document_jobs import upload_reference_documents_job
+from extralit_server.jobs.document_jobs import upload_and_preprocess_documents_job
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -385,7 +385,7 @@ async def process_bulk_upload(
             if not doc.associated_files:
                 # Create a reference-based job for documents without files
                 job = DEFAULT_QUEUE.enqueue(
-                    upload_reference_documents_job,
+                    upload_and_preprocess_documents_job,
                     reference=reference,
                     reference_data=doc.document_create.model_dump(),
                     file_data_list=[],
@@ -435,7 +435,7 @@ async def process_bulk_upload(
 
             # Create a reference-based job for multiple files
             job = DEFAULT_QUEUE.enqueue(
-                upload_reference_documents_job,
+                upload_and_preprocess_documents_job,
                 reference=reference,
                 reference_data=doc.document_create.model_dump(),
                 file_data_list=file_data_list,
diff --git a/extralit-server/src/extralit_server/jobs/document_jobs.py b/extralit-server/src/extralit_server/jobs/document_jobs.py
index d6c2788c3..6a248c000 100644
--- a/extralit-server/src/extralit_server/jobs/document_jobs.py
+++ b/extralit-server/src/extralit_server/jobs/document_jobs.py
@@ -33,7 +33,7 @@
 
 
 @job(DEFAULT_QUEUE, timeout=JOB_TIMEOUT_DISABLED, retry=Retry(max=3, interval=[10, 30, 60]))
-async def upload_reference_documents_job(
+async def upload_and_preprocess_documents_job(
     reference: str,
     reference_data: Dict[str, Any],
     file_data_list: List[Tuple[str, bytes]],  # List of (filename, file_data) tuples
@@ -183,7 +183,7 @@ async def upload_reference_documents_job(
             results["success"] = results["failed_files"] == 0
 
     except Exception as e:
-        error_msg = f"Error in upload_reference_documents_job for reference {reference}: {str(e)}"
+        error_msg = f"Error uploading documents for reference {reference}: {str(e)}"
         _LOGGER.error(error_msg)
         results["success"] = False
         results["errors"].append(str(e))
diff --git a/extralit-server/tests/unit/jobs/test_document_jobs.py b/extralit-server/tests/unit/jobs/test_document_jobs.py
index a47cb17e6..ca70ca4ac 100644
--- a/extralit-server/tests/unit/jobs/test_document_jobs.py
+++ b/extralit-server/tests/unit/jobs/test_document_jobs.py
@@ -16,7 +16,7 @@
 from unittest.mock import patch, MagicMock
 from uuid import uuid4
 
-from extralit_server.jobs.document_jobs import upload_reference_documents_job
+from extralit_server.jobs.document_jobs import upload_and_preprocess_documents_job
 from tests.factories import WorkspaceFactory, UserFactory
 
 
@@ -70,7 +70,7 @@ async def test_upload_reference_documents_job_success(self, mock_imports, mock_d
             mock_model_dump.return_value = {"file_name": "test.pdf", "pmid": None, "doi": "10.1234/test.doi"}
 
             # Execute job
-            result = await upload_reference_documents_job(reference, document_data, file_data_list, user.id)
+            result = await upload_and_preprocess_documents_job(reference, document_data, file_data_list, user.id)
 
             # Debug: print the actual result
             print(f"DEBUG: result = {result}")
@@ -107,7 +107,7 @@ async def test_upload_reference_documents_job_workspace_not_found(self):
         # Use non-existent workspace ID - the job will handle the lookup internally
 
         # Execute job
-        result = await upload_reference_documents_job(reference, document_data, file_data_list, user.id)
+        result = await upload_and_preprocess_documents_job(reference, document_data, file_data_list, user.id)
 
         # Verify result
         assert result["success"] is False
@@ -165,7 +165,7 @@ async def test_upload_reference_documents_job_partial_failure(self, mock_imports
             mock_model_dump.return_value = {"file_name": "test.pdf", "pmid": None, "doi": "10.1234/test.doi"}
 
             # Execute job
-            result = await upload_reference_documents_job(reference, document_data, file_data_list, user.id)
+            result = await upload_and_preprocess_documents_job(reference, document_data, file_data_list, user.id)
 
             # Debug: print the actual result
             print(f"DEBUG: result = {result}")

From fcd8fe2d4710a04de404f2aba60331fa267ba486 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Fri, 8 Aug 2025 22:27:06 -0700
Subject: [PATCH 18/22] Implement PDF text layer detection using OCRmyPDF

- Introduced `PDFTextLayerDetector` class to analyze PDF files for existing text layers.
- Added methods for detecting text layers, checking OCR requirements, and retrieving pages needing OCR.
- Refactored existing code to improve clarity and functionality, including the use of dataclasses for structured results.
- Enhanced error handling for encrypted and invalid PDF files.
- Updated module documentation to reflect new functionality.
---
 .../contexts/document/analysis.py             | 612 ++++++------------
 .../contexts/document/margin.py               | 458 +++++++++++++
 .../contexts/embeddings/__init__.py           |  14 +
 .../extralit_server/contexts/ocr/__init__.py  |  14 +
 .../src/extralit_server/jobs/ocr_jobs.py      |  14 +
 5 files changed, 712 insertions(+), 400 deletions(-)
 create mode 100644 extralit-server/src/extralit_server/contexts/document/margin.py
 create mode 100644 extralit-server/src/extralit_server/contexts/embeddings/__init__.py
 create mode 100644 extralit-server/src/extralit_server/contexts/ocr/__init__.py
 create mode 100644 extralit-server/src/extralit_server/jobs/ocr_jobs.py

diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index e470a11fd..cc1ecdbf7 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -12,447 +12,259 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-
-from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
-import numpy as np
+"""
+PDF text layer detection using OCRmyPDF internal functions.
 
-import lazy_loader as lazy
+This module provides functionality to detect whether a PDF already has an OCR text layer
+by leveraging OCRmyPDF's internal PdfInfo and PageInfo classes.
+"""
 
-cv2 = lazy.load("cv2")
-pdf2image = lazy.load("pdf2image")
-PIL = lazy.load("PIL")
-
-if TYPE_CHECKING:
-    from PIL.Image import Image
+import logging
+from dataclasses import dataclass, field
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Union
+from concurrent.futures import ThreadPoolExecutor
+
+try:
+    from ocrmypdf.pdfinfo.info import PageInfo
+    from ocrmypdf.exceptions import EncryptedPdfError, InputFileError
+    from ocrmypdf._pipeline import get_pdfinfo
+    from ocrmypdf._concurrent import Executor
+except ImportError as e:
+    raise ImportError(
+        "OCRmyPDF is required for PDF text layer detection. " "Please install it with: pip install ocrmypdf"
+    ) from e
 
 _LOGGER = logging.getLogger(__name__)
 
+DEFAULT_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 
-def pil_to_cv(image: "Image") -> np.ndarray:
-    """Convert PIL Image to OpenCV format."""
-    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # type: ignore
 
+@dataclass
+class PageTextInfo:
+    """Information about text content on a specific PDF page."""
 
-def classify_and_draw_layout_regions(
-    reference: "Image", mask: "Image", min_area: int = 5000, label: bool = True
-) -> Tuple["Image", List[Dict]]:
-    """
-    Classify and optionally draw layout regions using contour detection.
-
-    Returns:
-        Tuple of (annotated image, list of detected regions)
-    """
+    page_number: int
+    has_text: bool
+    has_images: bool
+    has_corrupt_text: bool = False
+    width_pixels: Optional[int] = None
+    height_pixels: Optional[int] = None
+    text_extraction_confidence: Optional[float] = None
+    needs_ocr: bool = True
 
-    mask_np = np.array(mask.convert("L"))
-    h, w = mask_np.shape
 
-    # Clean up the mask using morphological operations
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))  # type: ignore
-    cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel)  # type: ignore
+@dataclass
+class PDFTextAnalysisResult:
+    """Result of PDF text layer analysis."""
 
-    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)  # type: ignore
+    total_pages: int
+    has_text_layer: bool
+    pages_with_text: int
+    pages_with_images: int
+    pages_needing_ocr: int
+    is_encrypted: bool
+    analysis_error: Optional[str] = None
+    pages: List[PageTextInfo] = field(default_factory=list)  # fresh list per instance; a bare [] default raises ValueError
 
-    img = reference.copy() if label else reference
-    regions = []
 
-    if label:
-        draw = PIL.ImageDraw.Draw(img)  # type: ignore
-
-    for cnt in contours:
-        x, y, rw, rh = cv2.boundingRect(cnt)  # type: ignore
-        area = rw * rh
+class PDFTextLayerDetector:
+    """
+    Detector for PDF text layers using OCRmyPDF internal functions.
 
-        if area < min_area:
-            continue
+    This class uses OCRmyPDF's PdfInfo to analyze PDF pages and determine
+    which pages already have text content and which would require OCR processing.
+    """
 
-        cx, cy = x + rw // 2, y + rh // 2
+    def __init__(self, executor: Optional["Executor"] = None):
+        """
+        Initialize the PDF text layer detector.
 
-        # Classify region based on position
-        if cy < h * 0.25:
-            region = "header"
-        elif cy > h * 0.75:
-            region = "footer"
-        elif cx < w * 0.15:
-            region = "left_margin"
-        elif cx > w * 0.85:
-            region = "right_margin"
-        else:
-            region = "body"
-
-        region_data = {
-            "type": region,
-            "x": x,
-            "y": y,
-            "width": rw,
-            "height": rh,
-            "area": area,
-            "center_x": cx,
-            "center_y": cy,
-        }
-        regions.append(region_data)
-
-        if label:
-            draw.rectangle([x, y, x + rw, y + rh], outline="green", width=2)
-            draw.text((x, y - 10), region, fill="green")
-
-    return img, regions
-
-
-def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
-    """Find horizontal bands of similar content across pages."""
-    mask_np = np.array(mask.convert("L"))
-    h, w = mask_np.shape
-
-    row_sums = np.sum(mask_np == 255, axis=1) / w  # white = same
-    same_rows = row_sums >= min_ratio
-
-    bands = []
-    start = None
-    for i, val in enumerate(same_rows):
-        if val and start is None:
-            start = i
-        elif not val and start is not None:
-            if i - start >= min_height:
-                bands.append((start, i))
-            start = None
-    if start is not None and h - start >= min_height:
-        bands.append((start, h))
-
-    return bands
-
-
-class PDFAnalyzer:
-    def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
+        Args:
+            executor: Optional executor for concurrent processing. Defaults to ThreadPoolExecutor.
         """
-        Analyze PDF layout to extract margin and region information.
+        self.executor = executor or DEFAULT_EXECUTOR
+
+    def detect_text_layer(
+        self,
+        pdf_data: Union[bytes, str, Path],
+        filename: str,
+        detailed_analysis: bool = True,
+        check_pages: Optional[range] = None,
+    ) -> PDFTextAnalysisResult:
+        """
+        Detect if a PDF has an OCR text layer.
 
         Args:
-            pdf_data: PDF file data as bytes
-            filename: Filename for logging
+            pdf_data: PDF data as bytes, file path string, or Path object
+            filename: Filename for logging and identification (required)
+            detailed_analysis: Whether to perform detailed page-by-page analysis
+            check_pages: Optional range of pages to check (None = check all pages)
 
         Returns:
-            Dictionary containing layout analysis metadata
+            PDFTextAnalysisResult containing text layer analysis information
         """
+        # Handle different input types
+        if isinstance(pdf_data, bytes):
+            # Use BytesIO for bytes input - OCRmyPDF can work with file-like objects
+            input_file = BytesIO(pdf_data)
+        else:
+            # Handle string or Path input
+            input_path = Path(pdf_data)
+            if filename is None:
+                filename = input_path.name
+            input_file = input_path
 
         try:
-            images = pdf2image.convert_from_bytes(pdf_data, dpi=150)  # type: ignore
-            if not images:
-                return {"analysis_available": False, "error": "No pages found"}
-
-            _LOGGER.info(f"Analyzing layout for {filename} with {len(images)} pages")
+            # Use OCRmyPDF's get_pdfinfo function to analyze the PDF
+            pdf_info = get_pdfinfo(
+                input_file,
+                executor=self.executor,  # type: ignore
+                detailed_analysis=detailed_analysis,
+                progbar=False,
+                check_pages=check_pages,
+            )
 
-            # Analyze layout
-            layout_data = self._analyze_page_layout(images)
+            # Analyze pages
+            pages_info = []
+            pages_with_text = 0
+            pages_with_images = 0
+            pages_needing_ocr = 0
+
+            for page_num, page_info in enumerate(pdf_info.pages):
+                if page_info is None:
+                    continue
+
+                # Create PageTextInfo from OCRmyPDF's PageInfo
+                page_text_info = PageTextInfo(
+                    page_number=page_num + 1,  # 1-based page numbering
+                    has_text=page_info.has_text,
+                    has_images=bool(page_info.images),
+                    has_corrupt_text=getattr(page_info, "has_corrupt_text", False),
+                    width_pixels=getattr(page_info, "width_pixels", None),
+                    height_pixels=getattr(page_info, "height_pixels", None),
+                    needs_ocr=self._determine_ocr_requirement(page_info),
+                )
+
+                pages_info.append(page_text_info)
+
+                if page_text_info.has_text:
+                    pages_with_text += 1
+                if page_text_info.has_images:
+                    pages_with_images += 1
+                if page_text_info.needs_ocr:
+                    pages_needing_ocr += 1
+
+            # Determine overall text layer status
+            has_text_layer = pages_with_text > 0
+
+            result = PDFTextAnalysisResult(
+                total_pages=len(pdf_info.pages),
+                has_text_layer=has_text_layer,
+                pages_with_text=pages_with_text,
+                pages_with_images=pages_with_images,
+                pages_needing_ocr=pages_needing_ocr,
+                is_encrypted=False,
+                pages=pages_info,
+            )
 
-            return {
-                "analysis_available": True,
-                "total_pages": len(images),
-                "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {},
-                **layout_data,
-            }
+            _LOGGER.info(
+                f"PDF text analysis for {filename}: "
+                f"{pages_with_text}/{len(pdf_info.pages)} pages have text, "
+                f"{pages_needing_ocr} pages need OCR"
+            )
 
-        except Exception as e:
-            _LOGGER.error(f"PDF layout analysis failed for {filename}: {e}")
-            return {"analysis_available": False, "error": str(e)}
+            return result
+
+        except EncryptedPdfError:
+            _LOGGER.warning(f"PDF {filename} is encrypted")
+            return PDFTextAnalysisResult(
+                total_pages=0,
+                has_text_layer=False,
+                pages_with_text=0,
+                pages_with_images=0,
+                pages_needing_ocr=0,
+                is_encrypted=True,
+                analysis_error="PDF is encrypted",
+            )
 
-    def _analyze_page_layout(self, images: List["Image"]) -> Dict:
-        """
-        Analyze page layout by comparing pages to find common regions.
-        """
-        if len(images) < 2:
-            return self._analyze_single_page(images[0]) if images else {}
-
-        # Use first page as reference, compare with others
-        reference_img = images[0].convert("RGB")
-        margin_data = []
-
-        for i in range(1, min(len(images), 5)):  # Analyze up to 5 pages for efficiency
-            compare_img = images[i].convert("RGB")
-            page_margins = self._compare_pages_for_margins(reference_img, compare_img)
-            if page_margins:
-                margin_data.append(page_margins)
-
-        # Aggregate margin data
-        if margin_data:
-            return self._aggregate_margin_data(margin_data, reference_img.size)
-        else:
-            return self._analyze_single_page(reference_img)
+        except InputFileError as e:
+            _LOGGER.error(f"Invalid PDF file {filename}: {e}")
+            return PDFTextAnalysisResult(
+                total_pages=0,
+                has_text_layer=False,
+                pages_with_text=0,
+                pages_with_images=0,
+                pages_needing_ocr=0,
+                is_encrypted=False,
+                analysis_error=f"Invalid PDF file: {e}",
+            )
 
-    def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Optional[Dict]:
-        """
-        Compare two pages to identify common regions using advanced CV2 techniques.
-        """
-        try:
-            # Ensure same size
-            if reference.size != compare.size:
-                _LOGGER.debug(f"Resizing page to match reference size")
-                compare = compare.resize(reference.size)
-
-            # Step 1: Compute difference and invert so white = same
-            diff = PIL.ImageChops.difference(reference, compare)  # type: ignore
-            sameness_mask = PIL.ImageChops.invert(diff.convert("L"))  # type: ignore
-
-            # Step 2: Threshold the mask (keep high-sameness pixels)
-            # Create a lookup table for thresholding
-            threshold = 30
-            lut = [255 if i > threshold else 0 for i in range(256)]
-            sameness_mask.point(lut).convert("1")
-
-            # Step 3: Find horizontal bands (potential headers/footers)
-            horizontal_bands = find_horizontal_bands(sameness_mask)
-
-            # Step 4: Use contour-based region classification
-            annotated_img, detected_regions = classify_and_draw_layout_regions(
-                reference, sameness_mask, min_area=5000, label=False
+        except Exception as e:
+            _LOGGER.error(f"PDF text analysis failed for {filename}: {e}")
+            return PDFTextAnalysisResult(
+                total_pages=0,
+                has_text_layer=False,
+                pages_with_text=0,
+                pages_with_images=0,
+                pages_needing_ocr=0,
+                is_encrypted=False,
+                analysis_error=str(e),
             )
 
-            # Step 5: Classify and aggregate results
-            regions = self._classify_regions_advanced(horizontal_bands, detected_regions, reference.size)
+    def _determine_ocr_requirement(self, page_info: PageInfo) -> bool:
+        """
+        Determine if a page requires OCR processing based on OCRmyPDF logic.
 
-            return regions
+        This mirrors the logic from OCRmyPDF's is_ocr_required function but
+        simplified for detection purposes.
 
-        except Exception as e:
-            _LOGGER.debug(f"Page comparison failed: {e}")
-            return None
+        Args:
+            page_info: PageInfo object from OCRmyPDF
 
-    def _classify_regions_advanced(
-        self, bands: List[Tuple[int, int]], detected_regions: List[Dict], page_size: Tuple[int, int]
-    ) -> Dict:
-        """
-        Advanced region classification combining horizontal bands and contour detection.
-        """
-        width, height = page_size
-        regions = {
-            "header_bands": [],
-            "footer_bands": [],
-            "detected_regions": detected_regions,
-            "estimated_margins": {},
-        }
-
-        # Process horizontal bands
-        for start_y, end_y in bands:
-            band_center = (start_y + end_y) / 2
-            band_height = end_y - start_y
-
-            # Classify based on position
-            if band_center < height * 0.25:  # Top 25%
-                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
-            elif band_center > height * 0.75:  # Bottom 25%
-                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
-
-        # Estimate margins using both techniques
-        regions["estimated_margins"] = self._estimate_margins_advanced(regions, detected_regions, page_size)
-
-        return regions
-
-    def _estimate_margins_advanced(
-        self, regions: Dict, detected_regions: List[Dict], page_size: Tuple[int, int]
-    ) -> Dict:
-        """
-        Advanced margin estimation using both band and contour information.
-        """
-        width, height = page_size
-        margins = {
-            "top": 0,
-            "bottom": 0,
-            "left": 50,  # Default estimates
-            "right": 50,
-        }
-
-        # Calculate top margin from header regions
-        header_sources = []
-        if regions["header_bands"]:
-            header_sources.append(max(band["end_y"] for band in regions["header_bands"]))
-
-        # Add header regions from contour detection
-        header_regions = [r for r in detected_regions if r["type"] == "header"]
-        if header_regions:
-            header_sources.append(max(r["y"] + r["height"] for r in header_regions))
-
-        if header_sources:
-            margins["top"] = max(header_sources)
-
-        # Calculate bottom margin from footer regions
-        footer_sources = []
-        if regions["footer_bands"]:
-            footer_sources.append(min(band["start_y"] for band in regions["footer_bands"]))
-
-        # Add footer regions from contour detection
-        footer_regions = [r for r in detected_regions if r["type"] == "footer"]
-        if footer_regions:
-            footer_sources.append(min(r["y"] for r in footer_regions))
-
-        if footer_sources:
-            margins["bottom"] = height - min(footer_sources)
-
-        # Calculate left/right margins from contour detection
-        left_regions = [r for r in detected_regions if r["type"] == "left_margin"]
-        if left_regions:
-            margins["left"] = max(r["x"] + r["width"] for r in left_regions)
-
-        right_regions = [r for r in detected_regions if r["type"] == "right_margin"]
-        if right_regions:
-            margins["right"] = width - min(r["x"] for r in right_regions)
-
-        # Convert to relative percentages for consistency
-        return {
-            "top_px": margins["top"],
-            "bottom_px": margins["bottom"],
-            "left_px": margins["left"],
-            "right_px": margins["right"],
-            "top_percent": (margins["top"] / height) * 100 if height > 0 else 0,
-            "bottom_percent": (margins["bottom"] / height) * 100 if height > 0 else 0,
-            "left_percent": (margins["left"] / width) * 100 if width > 0 else 0,
-            "right_percent": (margins["right"] / width) * 100 if width > 0 else 0,
-        }
-
-    def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict:
-        """
-        Classify horizontal bands into headers, footers, and margins.
+        Returns:
+            True if the page needs OCR, False otherwise
         """
-        width, height = page_size
-        regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}}
+        # If page has text, it typically doesn't need OCR (unless forcing)
+        if page_info.has_text:
+            return False
 
-        for start_y, end_y in bands:
-            band_center = (start_y + end_y) / 2
-            band_height = end_y - start_y
+        # If page has images, it likely needs OCR
+        if page_info.images:
+            return True
 
-            # Classify based on position
-            if band_center < height * 0.25:  # Top 25%
-                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
-            elif band_center > height * 0.75:  # Bottom 25%
-                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+        # If page has no text and no images, it might be vector art
+        # For detection purposes, we'll assume it doesn't need OCR
+        return False
 
-        # Estimate margins based on bands
-        regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size)
+    def has_text_layer(self, pdf_data: Union[bytes, str, Path], filename: str) -> bool:
+        """
+        Simple boolean check if PDF has any text layer.
 
-        return regions
+        Args:
+            pdf_data: PDF data as bytes, file path string, or Path object
+            filename: Filename for logging (required)
 
-    def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict:
-        """
-        Estimate page margins based on detected bands.
-        """
-        width, height = page_size
-        margins = {
-            "top": 0,
-            "bottom": 0,
-            "left": 50,  # Default estimates
-            "right": 50,
-        }
-
-        # Calculate top margin from header bands
-        if regions["header_bands"]:
-            max_header_end = max(band["end_y"] for band in regions["header_bands"])
-            margins["top"] = max_header_end
-
-        # Calculate bottom margin from footer bands
-        if regions["footer_bands"]:
-            min_footer_start = min(band["start_y"] for band in regions["footer_bands"])
-            margins["bottom"] = height - min_footer_start
-
-        # Convert to relative percentages for consistency
-        return {
-            "top_px": margins["top"],
-            "bottom_px": margins["bottom"],
-            "left_px": margins["left"],
-            "right_px": margins["right"],
-            "top_percent": (margins["top"] / height) * 100,
-            "bottom_percent": (margins["bottom"] / height) * 100,
-            "left_percent": (margins["left"] / width) * 100,
-            "right_percent": (margins["right"] / width) * 100,
-        }
-
-    def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict:
-        """
-        Aggregate margin data from multiple page comparisons.
-        """
-        # Average the margin estimates
-        all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")]
-
-        if not all_margins:
-            return self._analyze_single_page_size(page_size)
-
-        # Calculate average margins
-        avg_margins = {}
-        for key in [
-            "top_px",
-            "bottom_px",
-            "left_px",
-            "right_px",
-            "top_percent",
-            "bottom_percent",
-            "left_percent",
-            "right_percent",
-        ]:
-            values = [m.get(key, 0) for m in all_margins if key in m]
-            avg_margins[key] = sum(values) / len(values) if values else 0
-
-        # Collect all bands and regions
-        all_header_bands = []
-        all_footer_bands = []
-        all_detected_regions = []
-
-        for data in margin_data:
-            all_header_bands.extend(data.get("header_bands", []))
-            all_footer_bands.extend(data.get("footer_bands", []))
-            all_detected_regions.extend(data.get("detected_regions", []))
-
-        # Aggregate detected regions by type
-        region_stats = {}
-        for region in all_detected_regions:
-            region_type = region["type"]
-            if region_type not in region_stats:
-                region_stats[region_type] = []
-            region_stats[region_type].append(region)
-
-        return {
-            "layout_analysis": {
-                "header_bands": all_header_bands,
-                "footer_bands": all_footer_bands,
-                "detected_regions": all_detected_regions,
-                "region_statistics": {
-                    region_type: {
-                        "count": len(regions),
-                        "avg_area": sum(r["area"] for r in regions) / len(regions) if regions else 0,
-                        "total_area": sum(r["area"] for r in regions),
-                    }
-                    for region_type, regions in region_stats.items()
-                },
-                "estimated_margins": avg_margins,
-                "analysis_method": "multi_page_comparison_advanced",
-            }
-        }
-
-    def _analyze_single_page(self, image: "Image") -> Dict:
-        """
-        Analyze a single page when comparison isn't possible.
+        Returns:
+            True if PDF has any text content, False otherwise
         """
-        return self._analyze_single_page_size(image.size)
+        result = self.detect_text_layer(pdf_data, filename, detailed_analysis=False)
+        return result.has_text_layer and result.analysis_error is None
 
-    def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict:
+    def get_pages_needing_ocr(self, pdf_data: Union[bytes, str, Path], filename: str) -> List[int]:
         """
-        Provide default margin estimates for single page analysis.
+        Get list of page numbers that need OCR processing.
+
+        Args:
+            pdf_data: PDF data as bytes, file path string, or Path object
+            filename: Filename for logging (required)
+
+        Returns:
+            List of 1-based page numbers that need OCR
         """
-        width, height = page_size
-
-        # Use common academic paper margins as defaults
-        default_margins = {
-            "top_px": int(height * 0.1),  # 10% top margin
-            "bottom_px": int(height * 0.1),  # 10% bottom margin
-            "left_px": int(width * 0.1),  # 10% left margin
-            "right_px": int(width * 0.1),  # 10% right margin
-            "top_percent": 10.0,
-            "bottom_percent": 10.0,
-            "left_percent": 10.0,
-            "right_percent": 10.0,
-        }
-
-        return {
-            "layout_analysis": {
-                "header_bands": [],
-                "footer_bands": [],
-                "estimated_margins": default_margins,
-                "analysis_method": "default_estimates",
-            }
-        }
+        result = self.detect_text_layer(pdf_data, filename, detailed_analysis=True)
+        if result.analysis_error:
+            return []
+
+        return [page.page_number for page in result.pages if page.needs_ocr]
diff --git a/extralit-server/src/extralit_server/contexts/document/margin.py b/extralit-server/src/extralit_server/contexts/document/margin.py
new file mode 100644
index 000000000..e470a11fd
--- /dev/null
+++ b/extralit-server/src/extralit_server/contexts/document/margin.py
@@ -0,0 +1,458 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
+import numpy as np
+
+import lazy_loader as lazy
+
+cv2 = lazy.load("cv2")
+pdf2image = lazy.load("pdf2image")
+PIL = lazy.load("PIL")
+
+if TYPE_CHECKING:
+    from PIL.Image import Image
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def pil_to_cv(image: "Image") -> np.ndarray:
+    """Convert a PIL image to a numpy array with channels reordered RGB -> BGR (assumes an RGB image; confirm at callers)."""
+    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # type: ignore
+
+
+def classify_and_draw_layout_regions(
+    reference: "Image", mask: "Image", min_area: int = 5000, label: bool = True
+) -> Tuple["Image", List[Dict]]:
+    """
+    Find external contours in ``mask``, keep bounding boxes whose area is at least ``min_area``, and tag each by position.
+    When ``label`` is true the boxes and tags are drawn on a copy of ``reference``; otherwise ``reference`` itself is returned.
+    Returns:
+        Tuple of (image, list of region dicts with type, x, y, width, height, area, center_x, center_y)
+    """
+
+    mask_np = np.array(mask.convert("L"))
+    h, w = mask_np.shape
+
+    # Morphological close with a 3x3 rectangular kernel before extracting contours
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))  # type: ignore
+    cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel)  # type: ignore
+
+    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)  # type: ignore
+
+    img = reference.copy() if label else reference
+    regions = []
+
+    if label:
+        draw = PIL.ImageDraw.Draw(img)  # type: ignore
+
+    for cnt in contours:
+        x, y, rw, rh = cv2.boundingRect(cnt)  # type: ignore
+        area = rw * rh
+
+        if area < min_area:
+            continue
+
+        cx, cy = x + rw // 2, y + rh // 2
+
+        # Tag by bounding-box centre: the vertical checks run first, so they take precedence over the side margins
+        if cy < h * 0.25:
+            region = "header"
+        elif cy > h * 0.75:
+            region = "footer"
+        elif cx < w * 0.15:
+            region = "left_margin"
+        elif cx > w * 0.85:
+            region = "right_margin"
+        else:
+            region = "body"
+
+        region_data = {
+            "type": region,
+            "x": x,
+            "y": y,
+            "width": rw,
+            "height": rh,
+            "area": area,
+            "center_x": cx,
+            "center_y": cy,
+        }
+        regions.append(region_data)
+
+        if label:
+            draw.rectangle([x, y, x + rw, y + rh], outline="green", width=2)
+            draw.text((x, y - 10), region, fill="green")
+
+    return img, regions
+
+
+def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]:
+    """Return (start, end) row ranges (end exclusive), at least ``min_height`` rows tall, whose rows are >= ``min_ratio`` 255-valued."""
+    mask_np = np.array(mask.convert("L"))
+    h, w = mask_np.shape
+
+    row_sums = np.sum(mask_np == 255, axis=1) / w  # white = same
+    same_rows = row_sums >= min_ratio
+
+    bands = []
+    start = None
+    for i, val in enumerate(same_rows):
+        if val and start is None:
+            start = i
+        elif not val and start is not None:
+            if i - start >= min_height:
+                bands.append((start, i))
+            start = None
+    if start is not None and h - start >= min_height:
+        bands.append((start, h))
+
+    return bands
+
+
+class PDFAnalyzer:
+    def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
+        """
+        Rasterize the PDF at 150 dpi and analyze page layout; any exception is logged and reported in the result dict.
+
+        Args:
+            pdf_data: PDF file data as bytes
+            filename: Filename, used only in log messages
+
+        Returns:
+            Dict with analysis_available, total_pages, first-page page_dimensions and layout data, or analysis_available=False plus error
+        """
+
+        try:
+            images = pdf2image.convert_from_bytes(pdf_data, dpi=150)  # type: ignore
+            if not images:
+                return {"analysis_available": False, "error": "No pages found"}
+
+            _LOGGER.info(f"Analyzing layout for {filename} with {len(images)} pages")
+
+            # Delegate to the page-comparison analysis; its keys are merged into the result below
+            layout_data = self._analyze_page_layout(images)
+
+            return {
+                "analysis_available": True,
+                "total_pages": len(images),
+                "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {},
+                **layout_data,
+            }
+
+        except Exception as e:
+            _LOGGER.error(f"PDF layout analysis failed for {filename}: {e}")
+            return {"analysis_available": False, "error": str(e)}
+
+    def _analyze_page_layout(self, images: List["Image"]) -> Dict:
+        """
+        Compare the first page with up to four following pages; use default estimates when fewer than two pages or no comparison succeeds.
+        """
+        if len(images) < 2:
+            return self._analyze_single_page(images[0]) if images else {}
+
+        # Use first page as reference, compare with others
+        reference_img = images[0].convert("RGB")
+        margin_data = []
+
+        for i in range(1, min(len(images), 5)):  # Indices 1-4 only: at most five pages including the reference
+            compare_img = images[i].convert("RGB")
+            page_margins = self._compare_pages_for_margins(reference_img, compare_img)
+            if page_margins:
+                margin_data.append(page_margins)
+
+        # Aggregate margin data
+        if margin_data:
+            return self._aggregate_margin_data(margin_data, reference_img.size)
+        else:
+            return self._analyze_single_page(reference_img)
+
+    def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Optional[Dict]:
+        """
+        Build a thresholded sameness mask between two pages and classify the regions they share; returns None if any step raises.
+        """
+        try:
+            # Ensure same size
+            if reference.size != compare.size:
+                _LOGGER.debug(f"Resizing page to match reference size")
+                compare = compare.resize(reference.size)
+
+            # Step 1: Compute difference and invert so white = same
+            diff = PIL.ImageChops.difference(reference, compare)  # type: ignore
+            sameness_mask = PIL.ImageChops.invert(diff.convert("L"))  # type: ignore
+
+            # Step 2: Threshold the mask (keep high-sameness pixels)
+            # Image.point() returns a new image, so the result has to be assigned back to take effect
+            threshold = 30
+            lut = [255 if i > threshold else 0 for i in range(256)]
+            sameness_mask = sameness_mask.point(lut).convert("1")
+
+            # Step 3: Find horizontal bands (potential headers/footers)
+            horizontal_bands = find_horizontal_bands(sameness_mask)
+
+            # Step 4: Use contour-based region classification (label=False, so the returned image is not needed)
+            _, detected_regions = classify_and_draw_layout_regions(
+                reference, sameness_mask, min_area=5000, label=False
+            )
+
+            # Step 5: Classify and aggregate results
+            regions = self._classify_regions_advanced(horizontal_bands, detected_regions, reference.size)
+
+            return regions
+
+        except Exception as e:
+            _LOGGER.debug(f"Page comparison failed: {e}")
+            return None
+
+    def _classify_regions_advanced(
+        self, bands: List[Tuple[int, int]], detected_regions: List[Dict], page_size: Tuple[int, int]
+    ) -> Dict:
+        """
+        Sort bands into header/footer lists by vertical position, attach ``detected_regions``, and add margin estimates.
+        """
+        width, height = page_size
+        regions = {
+            "header_bands": [],
+            "footer_bands": [],
+            "detected_regions": detected_regions,
+            "estimated_margins": {},
+        }
+
+        # Process horizontal bands
+        for start_y, end_y in bands:
+            band_center = (start_y + end_y) / 2
+            band_height = end_y - start_y
+
+            # Classify by band centre; bands centred in the middle half of the page are not recorded
+            if band_center < height * 0.25:  # Top 25%
+                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+            elif band_center > height * 0.75:  # Bottom 25%
+                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+
+        # Estimate margins using both techniques
+        regions["estimated_margins"] = self._estimate_margins_advanced(regions, detected_regions, page_size)
+
+        return regions
+
+    def _estimate_margins_advanced(
+        self, regions: Dict, detected_regions: List[Dict], page_size: Tuple[int, int]
+    ) -> Dict:
+        """
+        Estimate pixel and percentage margins from header/footer bands plus contour regions; left/right start at 50 px.
+        """
+        width, height = page_size
+        margins = {
+            "top": 0,
+            "bottom": 0,
+            "left": 50,  # Default estimates
+            "right": 50,
+        }
+
+        # Top margin: the lowest bottom edge among header bands and header-type contour regions
+        header_sources = []
+        if regions["header_bands"]:
+            header_sources.append(max(band["end_y"] for band in regions["header_bands"]))
+
+        # Add header regions from contour detection
+        header_regions = [r for r in detected_regions if r["type"] == "header"]
+        if header_regions:
+            header_sources.append(max(r["y"] + r["height"] for r in header_regions))
+
+        if header_sources:
+            margins["top"] = max(header_sources)
+
+        # Bottom margin: distance from the highest footer start (bands or contour regions) to the page bottom
+        footer_sources = []
+        if regions["footer_bands"]:
+            footer_sources.append(min(band["start_y"] for band in regions["footer_bands"]))
+
+        # Add footer regions from contour detection
+        footer_regions = [r for r in detected_regions if r["type"] == "footer"]
+        if footer_regions:
+            footer_sources.append(min(r["y"] for r in footer_regions))
+
+        if footer_sources:
+            margins["bottom"] = height - min(footer_sources)
+
+        # Calculate left/right margins from contour detection
+        left_regions = [r for r in detected_regions if r["type"] == "left_margin"]
+        if left_regions:
+            margins["left"] = max(r["x"] + r["width"] for r in left_regions)
+
+        right_regions = [r for r in detected_regions if r["type"] == "right_margin"]
+        if right_regions:
+            margins["right"] = width - min(r["x"] for r in right_regions)
+
+        # Report both pixel values and percentages of the page size (0 when a dimension is not positive)
+        return {
+            "top_px": margins["top"],
+            "bottom_px": margins["bottom"],
+            "left_px": margins["left"],
+            "right_px": margins["right"],
+            "top_percent": (margins["top"] / height) * 100 if height > 0 else 0,
+            "bottom_percent": (margins["bottom"] / height) * 100 if height > 0 else 0,
+            "left_percent": (margins["left"] / width) * 100 if width > 0 else 0,
+            "right_percent": (margins["right"] / width) * 100 if width > 0 else 0,
+        }
+
+    def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict:
+        """
+        Band-only variant: sort bands into header/footer lists and add band-based margin estimates (no contour regions).
+        """
+        width, height = page_size
+        regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}}
+
+        for start_y, end_y in bands:
+            band_center = (start_y + end_y) / 2
+            band_height = end_y - start_y
+
+            # Classify by band centre; bands centred in the middle half of the page are not recorded
+            if band_center < height * 0.25:  # Top 25%
+                regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+            elif band_center > height * 0.75:  # Bottom 25%
+                regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height})
+
+        # Estimate margins based on bands
+        regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size)
+
+        return regions
+
+    def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict:
+        """
+        Estimate pixel and percentage margins from header/footer bands only; left/right are fixed at 50 px.
+        """
+        width, height = page_size
+        margins = {
+            "top": 0,
+            "bottom": 0,
+            "left": 50,  # Default estimates
+            "right": 50,
+        }
+
+        # Top margin: the lowest bottom edge among the header bands
+        if regions["header_bands"]:
+            max_header_end = max(band["end_y"] for band in regions["header_bands"])
+            margins["top"] = max_header_end
+
+        # Bottom margin: distance from the highest footer start to the page bottom
+        if regions["footer_bands"]:
+            min_footer_start = min(band["start_y"] for band in regions["footer_bands"])
+            margins["bottom"] = height - min_footer_start
+
+        # Percentages fall back to 0 for a non-positive dimension, matching _estimate_margins_advanced
+        return {
+            "top_px": margins["top"],
+            "bottom_px": margins["bottom"],
+            "left_px": margins["left"],
+            "right_px": margins["right"],
+            "top_percent": (margins["top"] / height) * 100 if height > 0 else 0,
+            "bottom_percent": (margins["bottom"] / height) * 100 if height > 0 else 0,
+            "left_percent": (margins["left"] / width) * 100 if width > 0 else 0,
+            "right_percent": (margins["right"] / width) * 100 if width > 0 else 0,
+        }
+
+    def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict:
+        """
+        Average margin estimates across page comparisons and pool their bands/regions; default estimates if none have margins.
+        """
+        # Keep only comparisons that produced a non-empty margin estimate
+        all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")]
+
+        if not all_margins:
+            return self._analyze_single_page_size(page_size)
+
+        # Arithmetic mean per key, over the comparisons that contain that key
+        avg_margins = {}
+        for key in [
+            "top_px",
+            "bottom_px",
+            "left_px",
+            "right_px",
+            "top_percent",
+            "bottom_percent",
+            "left_percent",
+            "right_percent",
+        ]:
+            values = [m.get(key, 0) for m in all_margins if key in m]
+            avg_margins[key] = sum(values) / len(values) if values else 0
+
+        # Collect all bands and regions
+        all_header_bands = []
+        all_footer_bands = []
+        all_detected_regions = []
+
+        for data in margin_data:
+            all_header_bands.extend(data.get("header_bands", []))
+            all_footer_bands.extend(data.get("footer_bands", []))
+            all_detected_regions.extend(data.get("detected_regions", []))
+
+        # Group detected regions by their "type" so per-type statistics can be reported
+        region_stats = {}
+        for region in all_detected_regions:
+            region_type = region["type"]
+            if region_type not in region_stats:
+                region_stats[region_type] = []
+            region_stats[region_type].append(region)
+
+        return {
+            "layout_analysis": {
+                "header_bands": all_header_bands,
+                "footer_bands": all_footer_bands,
+                "detected_regions": all_detected_regions,
+                "region_statistics": {
+                    region_type: {
+                        "count": len(regions),
+                        "avg_area": sum(r["area"] for r in regions) / len(regions) if regions else 0,
+                        "total_area": sum(r["area"] for r in regions),
+                    }
+                    for region_type, regions in region_stats.items()
+                },
+                "estimated_margins": avg_margins,
+                "analysis_method": "multi_page_comparison_advanced",
+            }
+        }
+
+    def _analyze_single_page(self, image: "Image") -> Dict:
+        """
+        Return default margin estimates for one page; only the image size is used, its pixels are not inspected.
+        """
+        return self._analyze_single_page_size(image.size)
+
+    def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict:
+        """
+        Build a ``layout_analysis`` dict with fixed 10% margins on every side and no header/footer bands.
+        """
+        width, height = page_size
+
+        # Fixed fallback: 10% of the page dimension per side (pixel values truncated to int)
+        default_margins = {
+            "top_px": int(height * 0.1),  # 10% top margin
+            "bottom_px": int(height * 0.1),  # 10% bottom margin
+            "left_px": int(width * 0.1),  # 10% left margin
+            "right_px": int(width * 0.1),  # 10% right margin
+            "top_percent": 10.0,
+            "bottom_percent": 10.0,
+            "left_percent": 10.0,
+            "right_percent": 10.0,
+        }
+
+        return {
+            "layout_analysis": {
+                "header_bands": [],
+                "footer_bands": [],
+                "estimated_margins": default_margins,
+                "analysis_method": "default_estimates",
+            }
+        }
diff --git a/extralit-server/src/extralit_server/contexts/embeddings/__init__.py b/extralit-server/src/extralit_server/contexts/embeddings/__init__.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/extralit-server/src/extralit_server/contexts/embeddings/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/extralit-server/src/extralit_server/contexts/ocr/__init__.py b/extralit-server/src/extralit_server/contexts/ocr/__init__.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/extralit-server/src/extralit_server/contexts/ocr/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/extralit-server/src/extralit_server/jobs/ocr_jobs.py b/extralit-server/src/extralit_server/jobs/ocr_jobs.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/extralit-server/src/extralit_server/jobs/ocr_jobs.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+

From 960a4009ec50fd36cec1241e774ea40ba0f498c7 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Sat, 9 Aug 2025 16:34:40 -0700
Subject: [PATCH 19/22] fix opengl issues and import errors

- Updated `PDFTextAnalysisResult` to use `field(default_factory=list)` for better default list handling.
- Enhanced OpenCV loading in `margin.py` to set CPU-only mode and added error handling for loading failures.
- Adjusted imports in `preprocessing.py` to correctly reference `PDFAnalyzer` from the margin module.
---
 .../extralit_server/contexts/document/analysis.py  |  4 ++--
 .../extralit_server/contexts/document/margin.py    | 14 +++++++++++++-
 .../contexts/document/preprocessing.py             |  2 +-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index cc1ecdbf7..fa43e85dc 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -20,7 +20,7 @@
 """
 
 import logging
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union
@@ -66,7 +66,7 @@ class PDFTextAnalysisResult:
     pages_needing_ocr: int
     is_encrypted: bool
     analysis_error: Optional[str] = None
-    pages: List[PageTextInfo] = []
+    pages: List[PageTextInfo] = field(default_factory=list)
 
 
 class PDFTextLayerDetector:
diff --git a/extralit-server/src/extralit_server/contexts/document/margin.py b/extralit-server/src/extralit_server/contexts/document/margin.py
index e470a11fd..dbcfb87a9 100644
--- a/extralit-server/src/extralit_server/contexts/document/margin.py
+++ b/extralit-server/src/extralit_server/contexts/document/margin.py
@@ -13,13 +13,25 @@
 # limitations under the License.
 
 import logging
+import os
 
 from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
 import numpy as np
 
 import lazy_loader as lazy
 
-cv2 = lazy.load("cv2")
+os.environ["OPENCV_VIDEOIO_PRIORITY_MSMF"] = "0"
+os.environ["OPENCV_VIDEOIO_PRIORITY_INTEL_MFX"] = "0"
+
+try:
+    cv2 = lazy.load("cv2")
+    # Attribute access forces the lazy import; disable optimized code paths and cap OpenCV at one thread (OpenGL workaround)
+    cv2.setUseOptimized(False)  # type: ignore
+    cv2.setNumThreads(1)  # type: ignore
+except Exception as e:  # best-effort: log and continue instead of failing while this module is imported
+    _LOGGER = logging.getLogger(__name__)
+    _LOGGER.warning(f"OpenCV not available or failed to load: {e}")
+
 pdf2image = lazy.load("pdf2image")
 PIL = lazy.load("PIL")
 
diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
index b2a22f02b..b467eed3e 100644
--- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py
+++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py
@@ -26,8 +26,8 @@
 import lazy_loader as lazy
 from pydantic import Field
 from pydantic_settings import BaseSettings
+from extralit_server.contexts.document.margin import PDFAnalyzer
 from extralit_server.api.schemas.v1.document.preprocessing import PDFMetadata
-from extralit_server.contexts.document.analysis import PDFAnalyzer
 
 
 ocrmypdf = lazy.load("ocrmypdf")

From a8d9ef05e9d9d9e85c2e8e5c5101247a4af8710f Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Sun, 10 Aug 2025 10:09:26 -0700
Subject: [PATCH 20/22] chore: EXTRALIT_DATABASE_URL to use a relative path in
 .env.dev

---
 extralit-server/.env.dev | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev
index afbd12a3e..fd993b7d0 100644
--- a/extralit-server/.env.dev
+++ b/extralit-server/.env.dev
@@ -1,7 +1,7 @@
 OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS
 ALEMBIC_CONFIG=src/extralit_server/alembic.ini
 EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded
-EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False
+EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False
 HF_HUB_DISABLE_TELEMETRY=1
 
 # S3 Configuration (skipped to use LocalFileStorage)
@@ -24,10 +24,9 @@ EXTRALIT_REDIS_URL=redis://localhost:6379/0
 # PDF Preprocessing
 PREPROCESSING_ENABLED=true
 PREPROCESSING_ENABLE_ANALYSIS=true
-PREPROCESSING_LANGUAGE='["eng"]'
 PREPROCESSING_ROTATE_PAGES=true
 PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
 PREPROCESSING_CLEAN=false
 PREPROCESSING_SKIP_TEXT=true
-# PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR
+PREPROCESSING_TESSERACT_TIMEOUT=0 # 0 disables Tesseract OCR
 PREPROCESSING_QUIET=false

From 290b51917debc755957302b708f8e7fbdd335ed5 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Sun, 10 Aug 2025 12:27:57 -0700
Subject: [PATCH 21/22] Refactor PDF text layer detection and analysis

- Replaced the `PDFTextLayerDetector` class with `PDFOCRLayerDetector` to streamline OCR text layer detection using `pdfminer`.
- Introduced methods for checking font resources and analyzing character quality in PDFs.
- Moved `figures.py` and `tables.py` from `contexts/document` to `contexts/ocr` (unchanged) and added a placeholder `contexts/ocr/text.py`.
- Enhanced error handling and logging for better debugging and user feedback.
---
 .../contexts/document/analysis.py             | 380 +++++++-----------
 .../contexts/{document => ocr}/figures.py     |   0
 .../contexts/{document => ocr}/tables.py      |   0
 .../src/extralit_server/contexts/ocr/text.py  |  14 +
 4 files changed, 162 insertions(+), 232 deletions(-)
 rename extralit-server/src/extralit_server/contexts/{document => ocr}/figures.py (100%)
 rename extralit-server/src/extralit_server/contexts/{document => ocr}/tables.py (100%)
 create mode 100644 extralit-server/src/extralit_server/contexts/ocr/text.py

diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py
index fa43e85dc..5b3008db8 100644
--- a/extralit-server/src/extralit_server/contexts/document/analysis.py
+++ b/extralit-server/src/extralit_server/contexts/document/analysis.py
@@ -12,259 +12,175 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""
-PDF text layer detection using OCRmyPDF internal functions.
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTTextBox, LTChar
+from typing import Dict, List
+from io import BytesIO
 
-This module provides functionality to detect whether a PDF already has an OCR text layer
-by leveraging OCRmyPDF's internal PdfInfo and PageInfo classes.
-"""
 
-import logging
-from dataclasses import dataclass, field
-from io import BytesIO
-from pathlib import Path
-from typing import List, Optional, Union
-from concurrent.futures import ThreadPoolExecutor
-
-try:
-    from ocrmypdf.pdfinfo.info import PageInfo
-    from ocrmypdf.exceptions import EncryptedPdfError, InputFileError
-    from ocrmypdf._pipeline import get_pdfinfo
-    from ocrmypdf._concurrent import Executor
-except ImportError as e:
-    raise ImportError(
-        "OCRmyPDF is required for PDF text layer detection. " "Please install it with: pip install ocrmypdf"
-    ) from e
-
-_LOGGER = logging.getLogger(__name__)
-
-DEFAULT_EXECUTOR = ThreadPoolExecutor(max_workers=1)
-
-
-@dataclass
-class PageTextInfo:
-    """Information about text content on a specific PDF page."""
-
-    page_number: int
-    has_text: bool
-    has_images: bool
-    has_corrupt_text: bool = False
-    width_pixels: Optional[int] = None
-    height_pixels: Optional[int] = None
-    text_extraction_confidence: Optional[float] = None
-    needs_ocr: bool = True
-
-
-@dataclass
-class PDFTextAnalysisResult:
-    """Result of PDF text layer analysis."""
-
-    total_pages: int
-    has_text_layer: bool
-    pages_with_text: int
-    pages_with_images: int
-    pages_needing_ocr: int
-    is_encrypted: bool
-    analysis_error: Optional[str] = None
-    pages: List[PageTextInfo] = field(default_factory=list)
-
-
-class PDFTextLayerDetector:
-    """
-    Detector for PDF text layers using OCRmyPDF internal functions.
-
-    This class uses OCRmyPDF's PdfInfo to analyze PDF pages and determine
-    which pages already have text content and which would require OCR processing.
-    """
-
-    def __init__(self, executor: Optional["Executor"] = None):
-        """
-        Initialize the PDF text layer detector.
+class PDFOCRLayerDetector:
+    def __init__(self) -> None:
+        self.resource_manager = PDFResourceManager()  # shared by the aggregator device and the interpreter
+        self.laparams = LAParams()  # default layout-analysis parameters
+        self.device = PDFPageAggregator(self.resource_manager, laparams=self.laparams)
+        self.interpreter = PDFPageInterpreter(self.resource_manager, self.device)
 
-        Args:
-            executor: Optional executor for concurrent processing. Defaults to ThreadPoolExecutor.
+    def has_ocr_text_layer(self, pdf_bytes: bytes, threshold: float = 0.5, verbose=False) -> bool:
         """
-        self.executor = executor or DEFAULT_EXECUTOR
-
-    def detect_text_layer(
-        self,
-        pdf_data: Union[bytes, str, Path],
-        filename: str,
-        detailed_analysis: bool = True,
-        check_pages: Optional[range] = None,
-    ) -> PDFTextAnalysisResult:
-        """
-        Detect if a PDF has an OCR text layer.
+        Detect if PDF has OCR text layer by analyzing font resources per page.
+        Returns True if more than `threshold` (default 50%) of pages have font resources (i.e. searchable text).
 
         Args:
-            pdf_data: PDF data as bytes, file path string, or Path object
-            filename: Filename for logging and identification (required)
-            detailed_analysis: Whether to perform detailed page-by-page analysis
-            check_pages: Optional range of pages to check (None = check all pages)
+            pdf_bytes: PDF file content as bytes
 
         Returns:
-            PDFTextAnalysisResult containing text layer analysis information
-        """
-        # Handle different input types
-        if isinstance(pdf_data, bytes):
-            # Use BytesIO for bytes input - OCRmyPDF can work with file-like objects
-            input_file = BytesIO(pdf_data)
-        else:
-            # Handle string or Path input
-            input_path = Path(pdf_data)
-            if filename is None:
-                filename = input_path.name
-            input_file = input_path
-
-        try:
-            # Use OCRmyPDF's get_pdfinfo function to analyze the PDF
-            pdf_info = get_pdfinfo(
-                input_file,
-                executor=self.executor,  # type: ignore
-                detailed_analysis=detailed_analysis,
-                progbar=False,
-                check_pages=check_pages,
-            )
-
-            # Analyze pages
-            pages_info = []
-            pages_with_text = 0
-            pages_with_images = 0
-            pages_needing_ocr = 0
-
-            for page_num, page_info in enumerate(pdf_info.pages):
-                if page_info is None:
-                    continue
-
-                # Create PageTextInfo from OCRmyPDF's PageInfo
-                page_text_info = PageTextInfo(
-                    page_number=page_num + 1,  # 1-based page numbering
-                    has_text=page_info.has_text,
-                    has_images=bool(page_info.images),
-                    has_corrupt_text=getattr(page_info, "has_corrupt_text", False),
-                    width_pixels=getattr(page_info, "width_pixels", None),
-                    height_pixels=getattr(page_info, "height_pixels", None),
-                    needs_ocr=self._determine_ocr_requirement(page_info),
-                )
-
-                pages_info.append(page_text_info)
-
-                if page_text_info.has_text:
-                    pages_with_text += 1
-                if page_text_info.has_images:
-                    pages_with_images += 1
-                if page_text_info.needs_ocr:
-                    pages_needing_ocr += 1
-
-            # Determine overall text layer status
-            has_text_layer = pages_with_text > 0
-
-            result = PDFTextAnalysisResult(
-                total_pages=len(pdf_info.pages),
-                has_text_layer=has_text_layer,
-                pages_with_text=pages_with_text,
-                pages_with_images=pages_with_images,
-                pages_needing_ocr=pages_needing_ocr,
-                is_encrypted=False,
-                pages=pages_info,
-            )
-
-            _LOGGER.info(
-                f"PDF text analysis for {filename}: "
-                f"{pages_with_text}/{len(pdf_info.pages)} pages have text, "
-                f"{pages_needing_ocr} pages need OCR"
-            )
-
-            return result
-
-        except EncryptedPdfError:
-            _LOGGER.warning(f"PDF {filename} is encrypted")
-            return PDFTextAnalysisResult(
-                total_pages=0,
-                has_text_layer=False,
-                pages_with_text=0,
-                pages_with_images=0,
-                pages_needing_ocr=0,
-                is_encrypted=True,
-                analysis_error="PDF is encrypted",
-            )
-
-        except InputFileError as e:
-            _LOGGER.error(f"Invalid PDF file {filename}: {e}")
-            return PDFTextAnalysisResult(
-                total_pages=0,
-                has_text_layer=False,
-                pages_with_text=0,
-                pages_with_images=0,
-                pages_needing_ocr=0,
-                is_encrypted=False,
-                analysis_error=f"Invalid PDF file: {e}",
-            )
-
-        except Exception as e:
-            _LOGGER.error(f"PDF text analysis failed for {filename}: {e}")
-            return PDFTextAnalysisResult(
-                total_pages=0,
-                has_text_layer=False,
-                pages_with_text=0,
-                pages_with_images=0,
-                pages_needing_ocr=0,
-                is_encrypted=False,
-                analysis_error=str(e),
-            )
-
-    def _determine_ocr_requirement(self, page_info: PageInfo) -> bool:
+            bool: True if PDF has OCR text layer, False otherwise
         """
-        Determine if a page requires OCR processing based on OCRmyPDF logic.
+        page_info = self._check_font_resources_per_page(pdf_bytes)
 
-        This mirrors the logic from OCRmyPDF's is_ocr_required function but
-        simplified for detection purposes.
+        if not page_info:
+            return False
 
-        Args:
-            page_info: PageInfo object from OCRmyPDF
+        if verbose:
+            print(f"Total pages: {len(page_info)}")
+            print(page_info)
 
-        Returns:
-            True if the page needs OCR, False otherwise
+        pages_with_fonts = sum(1 for page in page_info if page.get("has_fonts", False))
+        total_pages = len(page_info)
+
+        # Return True if more than the `threshold` fraction of pages have fonts
+        return pages_with_fonts > (total_pages * threshold)
+
+    def _check_font_resources_per_page(self, pdf_bytes: bytes) -> List[Dict]:
         """
-        # If page has text, it typically doesn't need OCR (unless forcing)
-        if page_info.has_text:
-            return False
+        Check each page for font resources - indicates searchable text
+        """
+        page_info = []
+
+        pdf_stream = BytesIO(pdf_bytes)
+        for page_num, page in enumerate(PDFPage.get_pages(pdf_stream)):
+            page_data = {
+                "page_number": page_num + 1,
+                "has_fonts": False,
+                "font_count": 0,
+                "has_images": False,
+                "resource_types": [],
+            }
+
+            if hasattr(page, "resources") and page.resources:
+                resources = page.resources
+
+                if "Font" in resources:
+                    page_data["has_fonts"] = True
+                    font_resource = resources["Font"]
+                    try:
+                        page_data["font_count"] = len(font_resource)  # type: ignore
+                    except (TypeError, AttributeError):
+                        page_data["font_count"] = 1
+
+                if "XObject" in resources:
+                    page_data["has_images"] = True
+
+                page_data["resource_types"] = list(resources.keys())
+
+            page_info.append(page_data)
+
+        return page_info
+
+    def analyze_character_quality(self, pdf_bytes: bytes) -> Dict:
+        """Scan every character of the PDF and return statistics plus an ``ocr_quality_score``.
+
+        ``font_variations`` collects the font names seen, ``size_variations`` the distinct glyph sizes
+        (rounded to 2 decimals, sorted) and ``avg_char_size`` the mean size over all characters.
+        """
+        char_stats = {
+            "total_chars": 0,
+            "font_variations": set(),
+            "suspicious_patterns": 0,
+            "ocr_artifacts": 0,
+            "avg_char_size": 0,
+            "size_variations": [],
+        }
+        sizes, size_total = set(), 0.0  # distinct sizes keep the result small on large documents
+        for page in PDFPage.get_pages(BytesIO(pdf_bytes)):
+            self.interpreter.process_page(page)
+            for element in self.device.get_result():
+                if not isinstance(element, LTTextBox):
+                    continue
+                for char in (c for line in element for c in line if isinstance(c, LTChar)):
+                    char_stats["total_chars"] += 1
+                    char_stats["font_variations"].add(char.fontname)
+                    sizes.add(round(char.size, 2))
+                    size_total += char.size
+                    char_stats["ocr_artifacts"] += int(self._is_ocr_artifact(char))
+                    char_stats["suspicious_patterns"] += int(self._is_suspicious_char(char))
+        char_stats["avg_char_size"] = size_total / max(char_stats["total_chars"], 1)
+        char_stats["size_variations"] = sorted(sizes)
+        char_stats["ocr_quality_score"] = self._calculate_quality_score(char_stats)
+        return char_stats
+
+    def _is_ocr_artifact(self, char: LTChar) -> bool:
+        """Return True if the glyph's font name or its text marks it as a likely OCR artifact."""
+        font_name = char.fontname.lower()
+        if "hidden" in font_name or "ocr" in font_name:
+            return True
+
+        # Only single-character text is inspected: replacement/placeholder glyphs and
+        # code points above U+FFFF count as artifacts.
+        glyph = char.get_text()
+        single_char = len(glyph) == 1
+        return single_char and (ord(glyph) > 65535 or glyph in ["�", "□", "▯"])
+
+    def _is_suspicious_char(self, char: LTChar) -> bool:
+        char_text = char.get_text()
+
+        # Single character that's not alphanumeric or common punctuation
+        if len(char_text) == 1 and not (char_text.isalnum() or char_text in ".,!?;: "):
+            return True
 
-        # If page has images, it likely needs OCR
-        if page_info.images:
+        # Very small font size (might indicate hidden text)
+        if char.size < 1.0:
             return True
 
-        # If page has no text and no images, it might be vector art
-        # For detection purposes, we'll assume it doesn't need OCR
         return False
 
-    def has_text_layer(self, pdf_data: Union[bytes, str, Path], filename: str) -> bool:
-        """
-        Simple boolean check if PDF has any text layer.
+    def _calculate_quality_score(self, char_stats: Dict) -> float:
+        if char_stats["total_chars"] == 0:
+            return 0.0
 
-        Args:
-            pdf_data: PDF data as bytes, file path string, or Path object
-            filename: Filename for logging (required)
+        score = 1.0
 
-        Returns:
-            True if PDF has any text content, False otherwise
-        """
-        result = self.detect_text_layer(pdf_data, filename, detailed_analysis=False)
-        return result.has_text_layer and result.analysis_error is None
+        # Penalize OCR artifacts
+        artifact_ratio = char_stats["ocr_artifacts"] / char_stats["total_chars"]
+        score -= artifact_ratio * 0.5
 
-    def get_pages_needing_ocr(self, pdf_data: Union[bytes, str, Path], filename: str) -> List[int]:
-        """
-        Get list of page numbers that need OCR processing.
+        # Penalize suspicious patterns
+        suspicious_ratio = char_stats["suspicious_patterns"] / char_stats["total_chars"]
+        score -= suspicious_ratio * 0.3
 
-        Args:
-            pdf_data: PDF data as bytes, file path string, or Path object
-            filename: Filename for logging (required)
+        return max(0.0, min(1.0, score))
 
-        Returns:
-            List of 1-based page numbers that need OCR
-        """
-        result = self.detect_text_layer(pdf_data, filename, detailed_analysis=True)
-        if result.analysis_error:
-            return []
 
-        return [page.page_number for page in result.pages if page.needs_ocr]
+if __name__ == "__main__":
+    # Manual smoke test: run both detector entry points on a PDF given on the command line.
+    import sys
+    from pathlib import Path
+
+    cli_args = sys.argv[1:]
+    if len(cli_args) != 1:
+        print("Usage: python analysis.py <pdf_file_path>")
+        sys.exit(1)
+
+    (target,) = cli_args
+    input_file = Path(target)
+    if not input_file.is_file():
+        print(f"File not found: {target}")
+        sys.exit(1)
+
+    payload = input_file.read_bytes()
+    detector = PDFOCRLayerDetector()
+    has_text_layer = detector.has_ocr_text_layer(payload)
+    print(f"PDF has_ocr_text_layer: {has_text_layer}")
+    print(f"PDF analyze_character_quality: {detector.analyze_character_quality(payload)}")
diff --git a/extralit-server/src/extralit_server/contexts/document/figures.py b/extralit-server/src/extralit_server/contexts/ocr/figures.py
similarity index 100%
rename from extralit-server/src/extralit_server/contexts/document/figures.py
rename to extralit-server/src/extralit_server/contexts/ocr/figures.py
diff --git a/extralit-server/src/extralit_server/contexts/document/tables.py b/extralit-server/src/extralit_server/contexts/ocr/tables.py
similarity index 100%
rename from extralit-server/src/extralit_server/contexts/document/tables.py
rename to extralit-server/src/extralit_server/contexts/ocr/tables.py
diff --git a/extralit-server/src/extralit_server/contexts/ocr/text.py b/extralit-server/src/extralit_server/contexts/ocr/text.py
new file mode 100644
index 000000000..fb5dffc96
--- /dev/null
+++ b/extralit-server/src/extralit_server/contexts/ocr/text.py
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+

From edb2b0cd3b828322c1ab1c49524286f3fa018b14 Mon Sep 17 00:00:00 2001
From: JonnyTran <nhat.c.tran@gmail.com>
Date: Mon, 11 Aug 2025 18:31:41 -0700
Subject: [PATCH 22/22] chore: lazy import bibtexparser

---
 extralit/src/extralit/cli/documents/import_bib.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/extralit/src/extralit/cli/documents/import_bib.py b/extralit/src/extralit/cli/documents/import_bib.py
index 3f597e791..8084d48eb 100644
--- a/extralit/src/extralit/cli/documents/import_bib.py
+++ b/extralit/src/extralit/cli/documents/import_bib.py
@@ -38,16 +38,19 @@
 
 import pandas as pd
 import typer
-import bibtexparser
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.table import Table
+import lazy_loader as lazy
 
 from extralit.workspaces._resource import Workspace
 from extralit.client import Extralit
 from extralit.cli.rich import get_themed_panel
 
 
+bibtexparser = lazy.load("bibtexparser")
+
+
 def _clean_bibtex_field(value: str) -> str:
     """Clean BibTeX field by removing braces and extra whitespace."""
     if not value: