From 025864121a0a743ce71493ecf19851e804d4c890 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 4 Aug 2025 23:17:28 -0700 Subject: [PATCH 01/22] added ocrmypdf --- argilla-server/pdm.lock | 294 +++++++++++++++++- argilla-server/pyproject.toml | 2 + .../argilla_server/api/schemas/v1/segments.py | 122 ++++++++ .../contexts/document/figures.py | 14 + .../contexts/document/preprocessing.py | 158 ++++++++++ .../contexts/document/tables.py | 14 + .../src/argilla_server/jobs/document_jobs.py | 8 +- 7 files changed, 609 insertions(+), 3 deletions(-) create mode 100644 argilla-server/src/argilla_server/api/schemas/v1/segments.py create mode 100644 argilla-server/src/argilla_server/contexts/document/figures.py create mode 100644 argilla-server/src/argilla_server/contexts/document/preprocessing.py create mode 100644 argilla-server/src/argilla_server/contexts/document/tables.py diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock index 86777a985..c40a808db 100644 --- a/argilla-server/pdm.lock +++ b/argilla-server/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "postgresql", "test"] strategy = [] lock_version = "4.5.0" -content_hash = "sha256:9ff6bc1261cec9c0bdce741c04a63bf63a0a8e194e380d560c5032a6aef76b11" +content_hash = "sha256:21af2a3f2dff0688e08aed10184370c7c0aed450f814cf0d3509e964d9582654" [[metadata.targets]] requires_python = ">=3.9" @@ -835,6 +835,18 @@ files = [ {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, ] +[[package]] +name = "deprecation" +version = "2.1.0" +summary = "" +dependencies = [ + "packaging", +] +files = [ + {file = "deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a"}, + {file = "deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff"}, +] + [[package]] name = "dill" version = "0.3.8" @@ -1228,12 +1240,24 @@ files = [ {file = "idna-3.10.tar.gz", hash 
= "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, ] +[[package]] +name = "img2pdf" +version = "0.6.1" +summary = "" +dependencies = [ + "pikepdf", + "pillow", +] +files = [ + {file = "img2pdf-0.6.1.tar.gz", hash = "sha256:306e279eb832bc159d7d6294b697a9fbd11b4be1f799b14b3b2174fb506af289"}, +] + [[package]] name = "importlib-metadata" version = "8.5.0" summary = "" dependencies = [ - "zipp; python_full_version < \"3.13\"", + "zipp; python_full_version < \"3.10\"", ] files = [ {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, @@ -1261,6 +1285,100 @@ files = [ {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] +[[package]] +name = "lxml" +version = "6.0.0" +summary = "" +files = [ + {file = "lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:35bc626eec405f745199200ccb5c6b36f202675d204aa29bb52e27ba2b71dea8"}, + {file = "lxml-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:246b40f8a4aec341cbbf52617cad8ab7c888d944bfe12a6abd2b1f6cfb6f6082"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2793a627e95d119e9f1e19720730472f5543a6d84c50ea33313ce328d870f2dd"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:46b9ed911f36bfeb6338e0b482e7fe7c27d362c52fde29f221fddbc9ee2227e7"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b4790b558bee331a933e08883c423f65bbcd07e278f91b2272489e31ab1e2b4"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2030956cf4886b10be9a0285c6802e078ec2391e1dd7ff3eb509c2c95a69b76"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:4d23854ecf381ab1facc8f353dcd9adeddef3652268ee75297c1164c987c11dc"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:43fe5af2d590bf4691531b1d9a2495d7aab2090547eaacd224a3afec95706d76"}, + {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74e748012f8c19b47f7d6321ac929a9a94ee92ef12bc4298c47e8b7219b26541"}, + {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:43cfbb7db02b30ad3926e8fceaef260ba2fb7df787e38fa2df890c1ca7966c3b"}, + {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34190a1ec4f1e84af256495436b2d196529c3f2094f0af80202947567fdbf2e7"}, + {file = "lxml-6.0.0-cp310-cp310-win32.whl", hash = "sha256:5967fe415b1920a3877a4195e9a2b779249630ee49ece22021c690320ff07452"}, + {file = "lxml-6.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:f3389924581d9a770c6caa4df4e74b606180869043b9073e2cec324bad6e306e"}, + {file = "lxml-6.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:522fe7abb41309e9543b0d9b8b434f2b630c5fdaf6482bee642b34c8c70079c8"}, + {file = "lxml-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4ee56288d0df919e4aac43b539dd0e34bb55d6a12a6562038e8d6f3ed07f9e36"}, + {file = "lxml-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8dd6dd0e9c1992613ccda2bcb74fc9d49159dbe0f0ca4753f37527749885c25"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:d7ae472f74afcc47320238b5dbfd363aba111a525943c8a34a1b657c6be934c3"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5592401cdf3dc682194727c1ddaa8aa0f3ddc57ca64fd03226a430b955eab6f6"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:58ffd35bd5425c3c3b9692d078bf7ab851441434531a7e517c4984d5634cd65b"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:f720a14aa102a38907c6d5030e3d66b3b680c3e6f6bc95473931ea3c00c59967"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2a5e8d207311a0170aca0eb6b160af91adc29ec121832e4ac151a57743a1e1e"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:2dd1cc3ea7e60bfb31ff32cafe07e24839df573a5e7c2d33304082a5019bcd58"}, + {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cfcf84f1defed7e5798ef4f88aa25fcc52d279be731ce904789aa7ccfb7e8d2"}, + {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a52a4704811e2623b0324a18d41ad4b9fabf43ce5ff99b14e40a520e2190c851"}, + {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c16304bba98f48a28ae10e32a8e75c349dd742c45156f297e16eeb1ba9287a1f"}, + {file = "lxml-6.0.0-cp311-cp311-win32.whl", hash = "sha256:f8d19565ae3eb956d84da3ef367aa7def14a2735d05bd275cd54c0301f0d0d6c"}, + {file = "lxml-6.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b2d71cdefda9424adff9a3607ba5bbfc60ee972d73c21c7e3c19e71037574816"}, + {file = "lxml-6.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:8a2e76efbf8772add72d002d67a4c3d0958638696f541734304c7f28217a9cab"}, + {file = "lxml-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78718d8454a6e928470d511bf8ac93f469283a45c354995f7d19e77292f26108"}, + {file = "lxml-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:84ef591495ffd3f9dcabffd6391db7bb70d7230b5c35ef5148354a134f56f2be"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2930aa001a3776c3e2601cb8e0a15d21b8270528d89cc308be4843ade546b9ab"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:219e0431ea8006e15005767f0351e3f7f9143e793e58519dc97fe9e07fae5563"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:bd5913b4972681ffc9718bc2d4c53cde39ef81415e1671ff93e9aa30b46595e7"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:390240baeb9f415a82eefc2e13285016f9c8b5ad71ec80574ae8fa9605093cd7"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d6e200909a119626744dd81bae409fc44134389e03fbf1d68ed2a55a2fb10991"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ca50bd612438258a91b5b3788c6621c1f05c8c478e7951899f492be42defc0da"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:c24b8efd9c0f62bad0439283c2c795ef916c5a6b75f03c17799775c7ae3c0c9e"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:afd27d8629ae94c5d863e32ab0e1d5590371d296b87dae0a751fb22bf3685741"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:54c4855eabd9fc29707d30141be99e5cd1102e7d2258d2892314cf4c110726c3"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c907516d49f77f6cd8ead1322198bdfd902003c3c330c77a1c5f3cc32a0e4d16"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36531f81c8214e293097cd2b7873f178997dae33d3667caaae8bdfb9666b76c0"}, + {file = "lxml-6.0.0-cp312-cp312-win32.whl", hash = "sha256:690b20e3388a7ec98e899fd54c924e50ba6693874aa65ef9cb53de7f7de9d64a"}, + {file = "lxml-6.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:310b719b695b3dd442cdfbbe64936b2f2e231bb91d998e99e6f0daf991a3eba3"}, + {file = "lxml-6.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:8cb26f51c82d77483cdcd2b4a53cda55bbee29b3c2f3ddeb47182a2a9064e4eb"}, + {file = "lxml-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6da7cd4f405fd7db56e51e96bff0865b9853ae70df0e6720624049da76bde2da"}, + {file = "lxml-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b34339898bb556a2351a1830f88f751679f343eabf9cf05841c95b165152c9e7"}, + {file = 
"lxml-6.0.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:51a5e4c61a4541bd1cd3ba74766d0c9b6c12d6a1a4964ef60026832aac8e79b3"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d18a25b19ca7307045581b18b3ec9ead2b1db5ccd8719c291f0cd0a5cec6cb81"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d4f0c66df4386b75d2ab1e20a489f30dc7fd9a06a896d64980541506086be1f1"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f4b481b6cc3a897adb4279216695150bbe7a44c03daba3c894f49d2037e0a24"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a78d6c9168f5bcb20971bf3329c2b83078611fbe1f807baadc64afc70523b3a"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae06fbab4f1bb7db4f7c8ca9897dc8db4447d1a2b9bee78474ad403437bcc29"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:1fa377b827ca2023244a06554c6e7dc6828a10aaf74ca41965c5d8a4925aebb4"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1676b56d48048a62ef77a250428d1f31f610763636e0784ba67a9740823988ca"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:0e32698462aacc5c1cf6bdfebc9c781821b7e74c79f13e5ffc8bfe27c42b1abf"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4d6036c3a296707357efb375cfc24bb64cd955b9ec731abf11ebb1e40063949f"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7488a43033c958637b1a08cddc9188eb06d3ad36582cebc7d4815980b47e27ef"}, + {file = "lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181"}, + {file = "lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e"}, + {file = "lxml-6.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03"}, + {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261"}, + {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922"}, + {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0"}, + {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a"}, + {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5"}, + {file = "lxml-6.0.0-cp39-cp39-win32.whl", hash = 
"sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256"}, + {file = "lxml-6.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae"}, + {file = "lxml-6.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca"}, + {file 
= "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349"}, + {file = "lxml-6.0.0.tar.gz", hash = "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72"}, +] + [[package]] name = "mako" version = "1.3.6" @@ -1537,6 +1655,47 @@ files = [ {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, ] +[[package]] +name = "ocrmypdf" +version = "15.4.4" +summary = "" +dependencies = [ + "deprecation; python_full_version < \"3.10\"", + "img2pdf; python_full_version < \"3.10\"", + "packaging; python_full_version < \"3.10\"", + "pdfminer-six; python_full_version < \"3.10\"", + "pikepdf; python_full_version < \"3.10\"", + "pillow; python_full_version < \"3.10\"", + "pluggy; python_full_version < \"3.10\"", + "reportlab; python_full_version < \"3.10\"", + "rich; python_full_version < \"3.10\"", + "typing-extensions; python_full_version < \"3.10\"", +] +files = [ + {file = "ocrmypdf-15.4.4-py39-none-any.whl", hash = "sha256:13fd388035b5f4bb673bff570cfc2cf72e51168646d5401de9e48ca355917c6d"}, + {file = "ocrmypdf-15.4.4.tar.gz", hash = "sha256:4696c81cc5b5d64f31ccfe685d10baeb69b42bb0974acddf292d8cf9d97605c3"}, +] + +[[package]] +name = "ocrmypdf" +version = "16.10.4" +summary = "" +dependencies = [ + "deprecation; python_full_version >= \"3.10\"", + "img2pdf; python_full_version >= \"3.10\"", + "packaging; python_full_version >= \"3.10\"", + "pdfminer-six; python_full_version >= \"3.10\"", + "pi-heif; python_full_version >= \"3.10\"", + "pikepdf; python_full_version >= \"3.10\"", + "pillow; python_full_version >= \"3.10\"", + "pluggy; python_full_version >= \"3.10\"", + "rich; python_full_version >= \"3.10\"", +] +files = [ + {file = 
"ocrmypdf-16.10.4-py3-none-any.whl", hash = "sha256:061f3165d09ffafac975cea00803802b8a75551ada9965292ea86ea382673688"}, + {file = "ocrmypdf-16.10.4.tar.gz", hash = "sha256:de749ef5f554b63d57e68d032e7cba5500cbd5030835bf24f658f7b7a04f3dc1"}, +] + [[package]] name = "opensearch-py" version = "2.0.1" @@ -1615,6 +1774,124 @@ files = [ {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, ] +[[package]] +name = "pdfminer-six" +version = "20250506" +summary = "" +dependencies = [ + "charset-normalizer", + "cryptography", +] +files = [ + {file = "pdfminer_six-20250506-py3-none-any.whl", hash = "sha256:d81ad173f62e5f841b53a8ba63af1a4a355933cfc0ffabd608e568b9193909e3"}, + {file = "pdfminer_six-20250506.tar.gz", hash = "sha256:b03cc8df09cf3c7aba8246deae52e0bca7ebb112a38895b5e1d4f5dd2b8ca2e7"}, +] + +[[package]] +name = "pi-heif" +version = "0.22.0" +summary = "" +dependencies = [ + "pillow; python_full_version >= \"3.10\"", +] +files = [ + {file = "pi_heif-0.22.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:fca84436339eee2c91ff09cd7e301cfa2a0f7a9d83d5bc6a9d1db8587221d239"}, + {file = "pi_heif-0.22.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:46b0fcf876d85c8684d3bc1a0b7a4e4bc5673b72084807dc6bf85caa2da9173b"}, + {file = "pi_heif-0.22.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85a8b09e28f3234a9a64796fc3ed71516b14a9ba08cad416ebd0db251e5f263"}, + {file = "pi_heif-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21416131308fabaeadbd1eae4d4daf218443832409f91ea6571edb64a0dc8d1c"}, + {file = "pi_heif-0.22.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d308f32ec557ec9f8cfee1225d83d391ffc72a1a8f03106a5805693c02359678"}, + {file = "pi_heif-0.22.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:94359418200d7ed61f1910c5b3318fcaf0bb6e25c3e6361fbf986b320d4b7e80"}, + {file = "pi_heif-0.22.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:0292a1c4b58a7bfeaad0e315ca713beee3051600cf2c100a0fa96fb32377c8fd"}, + {file = "pi_heif-0.22.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:98dab5eb6bd70bdbe8ce021b4287c42ca779f6ee6d6f6fc91609d950e135d6dd"}, + {file = "pi_heif-0.22.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ed1731ebece9dcaea50db251b891318ebfc6971161664cca1fd1367e75aa815f"}, + {file = "pi_heif-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d92149bad299390a96f29dc584bc0020c88d36d3edf073f03a6ac6b595673f63"}, + {file = "pi_heif-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd9f1688caa359ad9c6a66fc167fa41fa24dc0fa8ceed65be2c31563d42eb700"}, + {file = "pi_heif-0.22.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6339784cd447664faa4705373b7f4d7bc9c4133bc0e0a1140516614cd047e9a8"}, + {file = "pi_heif-0.22.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2c5cfa7b8610750751cd414f7e276093080b38e1728d721f5d315f03a9ebd25c"}, + {file = "pi_heif-0.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:e739bfe4a1785e34b52eecf092d5c511b673f20f053c728472167fe3ddcbe202"}, + {file = "pi_heif-0.22.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:fe7b539c1924973de96a58477dab29475ed8bfbc81cb4588db9655e3661710ba"}, + {file = "pi_heif-0.22.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:322fd33c75ccf1208f08d07aea06c7582eed6e577a3400fe6efcbaab0c1677ff"}, + {file = "pi_heif-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3965be305b4a5bbe4c7585f45feeab18ed18228e729a970e9b8a09b25434c885"}, + {file = "pi_heif-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebd91145a1ab9229ce330e5a7cb8a95c875c16a1cb1f2b0b5ed86e61a9fb6bd4"}, + {file = "pi_heif-0.22.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ed229d31a4e0037f0ba417a21f403fb8f965a40e3e5abaedafe717f6b710f544"}, + {file = "pi_heif-0.22.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:6d95b90d5b005c35839120e934bfa5746fdf88ba344d1e58a814a33e5e9f057c"}, + {file = "pi_heif-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:943dee9b05c768acbc06662b327518b2a257dd08ced79dce7c11fab5ac2d5c4b"}, + {file = "pi_heif-0.22.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:95dd7ec2cbcef6ef1110c6ba539fa7e1489a023589076ca8b3eebcb1e38d256c"}, + {file = "pi_heif-0.22.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0e635dceb40424b5d88c7a2183d8dabb844c7776118df12f275ead2a10d275f6"}, + {file = "pi_heif-0.22.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f668c27a564c7373a462c0484d49166084ec608b65f9d6763fef7a1c80eee8c0"}, + {file = "pi_heif-0.22.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24ea5ba8cbd871ae09a856dbb9a7e6376ba70b5207085d0302f539574614b9e0"}, + {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a89b57cd839b09ee749d12397d2027e20fe7a64a44883688ab44a873b16b507b"}, + {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93acd60ef14e3ea835b7e3dafe284c07116349b0df05507520f10520c3ad09c1"}, + {file = "pi_heif-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:6415b0005216ad08f86d0ef75ec24e13e60bf5f45273ab54a4a22f008b9f41ac"}, + {file = "pi_heif-0.22.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:3f85ac3c0e2fb18af10e5b9789dcfd73f091b1d6ea2090d70d6e87f8744b8fe9"}, + {file = "pi_heif-0.22.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2635cbcf35206dd3d7f6453df8a6a5cd6a83bcdc9818d999b7342837482d614e"}, + {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:893a49c195563a9bbbef571daad995110b47e3e6b624b92269c281cf1b70b8da"}, + {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b160a20dd6fa9d951a556006f02ec601a433ec4002953fdb67025f42e5fa89ea"}, + {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:4e2508317837ad6da6b6e2ba154faab766a0cdc189a86dd45b4b7decd641bfa5"}, + {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7a1666070cffce08027b4309fb7f270c0e3a4715a3e5a7a7202b05f65a849f2"}, + {file = "pi_heif-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:c73e651cb17b7da3a740881c479e224084c95380df0d9d4f72d4858a422e80ae"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:6b83ec2f6db2dd61e09940006ee0a854eb58d91a52023be057da13a08a9f0517"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:f33211fa2afa756b13a63e21aeab577cdc7ddb18a929a012cbbcd3b7d8a772d0"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a82bb03e5ab429b6aee5f1446c7c1925b1fb4fd58d74c960c7995734285db269"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79d72744708949bd9028516d860bd2c341371bca13aa2196e4f2267263834608"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7bb583f93bb4c1dfaf3b6e689a9fa0de7c83182730c16ec8798c459cf8c3e8cf"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:052fffb0b65c51adf90993a696dd51dddc5f5707d5f40e7bd9f4ad958bb505d9"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:b326a48001a97906e5eb4110113d0cfe1203704f3572100dd177782568c9fc32"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8cc68012a870d5e39d8fd5468dfd1d452ca10388cab5fac30f90ddfa0772a3e"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:350c49ac597d1b8cdaa8a35f2c0901a3847067b9d0a9fdc07d2d6851e5d63382"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3de6fb5a58cf271897adc31e045db45003ae1e32116efc30fa20c72e1c90b2b"}, + {file = "pi_heif-0.22.0.tar.gz", hash = 
"sha256:489ddda3c9fed948715a9c8642c6ee24c3b438a7fbf85b3a8f097d632d7082a8"}, +] + +[[package]] +name = "pikepdf" +version = "9.10.2" +summary = "" +dependencies = [ + "deprecated", + "lxml", + "packaging", + "pillow", +] +files = [ + {file = "pikepdf-9.10.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:c2b40697c8aa48316c1846195afb8f12a3adf242c31fb3e960f067b4e3f47256"}, + {file = "pikepdf-9.10.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:975b2f2924617cae299f5cc219cd6a4d07576566fac4d28aa87a2c93024f9d74"}, + {file = "pikepdf-9.10.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df5e66acc1f24c22cbf76089603045b9fab3e881e7bc3fd8d63630b395ee4865"}, + {file = "pikepdf-9.10.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cb83e0296ea74b18bf5fec5860d16167e3cef0ce074a21bd93b73bdd60daf6e4"}, + {file = "pikepdf-9.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5ea08e7df49e5e75b5f03d18ec901b77b202333393a01d88bfc73374cffd12a8"}, + {file = "pikepdf-9.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ea12192f0cc3bc6fcfaedd0f98161a7f0ca8630cbf972d55d208fb56e7f57120"}, + {file = "pikepdf-9.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:5aa2d4b8f28588cd4755211058ecb46941e0c73ec59ffd9744c59f1b924c6bd7"}, + {file = "pikepdf-9.10.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:fa1cfcd725624910fc57c5b6305c5958cd28f1d40b1f9ad26723aba7caaae345"}, + {file = "pikepdf-9.10.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1575cb082b4ea39913ed90b96ff55d12d40f21a322f06144ab531d097c03b58c"}, + {file = "pikepdf-9.10.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a3e92458f2fc0a5e0a98a65a69534deac7a5fdf0791618afed6ca1a3623e972"}, + {file = "pikepdf-9.10.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c245099f9187d3c636430b941d72fa9e639b1dbed2b8f291b95b561a315fca4"}, + {file = 
"pikepdf-9.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c4bfe38e2dfa47f6c5e7e4ff166c6663b149c071e7b7c745595d3e3272cdc625"}, + {file = "pikepdf-9.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f49f12fef155bf92174f57d21724507427ee20ec43b61460120b8f7870905028"}, + {file = "pikepdf-9.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:58105543a2b671cc2ffb2d2da385e383d4731a19def86de656bd7da36755e444"}, + {file = "pikepdf-9.10.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:3b14cacd1f0275654a7803af2611e933f5d57a98cba08aa9041792bb0f38c073"}, + {file = "pikepdf-9.10.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:aaeee4676b99655c0f655404c1fca7ba483c5b4d96a790786dd4caa21e11ac18"}, + {file = "pikepdf-9.10.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efee3a3cd8047e796508f56cefac4eb45d1173e81813dbeb3d8e9dd2e857de60"}, + {file = "pikepdf-9.10.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83cb30d947fae647876d2dba3c0295c0e7aa75e915bf0ea2350c72a6b652b2fa"}, + {file = "pikepdf-9.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cc498904eabec3f9d144f1c259080508b3c5809720ba8f142c3971b1525ebed8"}, + {file = "pikepdf-9.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25fb3e0d15c2c3cd77735335d09ca968df693dd0f9c6f028e9c9ce7b0ac86b48"}, + {file = "pikepdf-9.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:66819bd6edbca64fe2ec2020e85d339bee969aed051c2b7f256574da1a073ff6"}, + {file = "pikepdf-9.10.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:a2ed7c8eabfe35b4ae2564b26cc6946b40c4efccfaa9acf91bac8e0cfc31a467"}, + {file = "pikepdf-9.10.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:c7aec253420d69cbaf6228ade29ab1e2b501dd0d9561ea4c90f16c849ec5f9ea"}, + {file = "pikepdf-9.10.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c6bc4851b2978198143908b9a0e845ecc6587904754436bf0ee488fd6ec4aba"}, + {file = 
"pikepdf-9.10.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:860eec8cda5d7b6d168d6fd4a956d8101577d9ea4a585fafab3fc0b1bbaddea1"}, + {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:048f3d5138c44f8c452d818e14130fa30d809f61d70063b6e615e91148342188"}, + {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fff140da5a75b41b4cdf34354366620c206f31fc513356c70cf5da6b81d2483"}, + {file = "pikepdf-9.10.2-cp313-cp313-win_amd64.whl", hash = "sha256:1b5af8e233ed232f02e31a281134eed94504c72e9de88326433e34641f04a113"}, + {file = "pikepdf-9.10.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:ed7032dfe0f280e87908e025b22ecd49b230d2b753c4ef66d0f6ce2952f5e721"}, + {file = "pikepdf-9.10.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:9d5f9fa9513e600752acdd81fd1b987b6bf85a36c25779bd9a7e0986626424d7"}, + {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1899d0d9dd1ebdf13125159029a2c89afc66d87f0f3bcdbca9adbda6ad2bce15"}, + {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77ec60c230f11797e94a0659523c579fd8d25969de9091b2d6c7799868cd60c3"}, + {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ddc1cb0aba4f2fa0d95ed68460688e3efcd3a70973901faf5b8c85e81438bcf"}, + {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a0ee549af6560be2c3f7b9c37b4c9c814bcd24249323b0525ba0b00a11988d90"}, + {file = "pikepdf-9.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:f1d7417a1b49d77f13f9e9310e5d122a0e69d5e06afd21e06d12b0baa5cd9578"}, + {file = "pikepdf-9.10.2.tar.gz", hash = "sha256:f62fc2183888f2ca1d271bf4faa440a2e2d0159221620a9c6a314f9c9a95680c"}, +] + [[package]] name = "pillow" version = "11.0.0" @@ -2289,6 +2566,19 @@ files = [ {file = "redis-5.2.0.tar.gz", hash = "sha256:0b1087665a771b1ff2e003aa5bdd354f15a70c9e25d5a7dbf9c722c16528a7b0"}, ] +[[package]] +name = 
"reportlab" +version = "4.4.3" +summary = "" +dependencies = [ + "charset-normalizer; python_full_version < \"3.10\"", + "pillow; python_full_version < \"3.10\"", +] +files = [ + {file = "reportlab-4.4.3-py3-none-any.whl", hash = "sha256:df905dc5ec5ddaae91fc9cb3371af863311271d555236410954961c5ee6ee1b5"}, + {file = "reportlab-4.4.3.tar.gz", hash = "sha256:073b0975dab69536acd3251858e6b0524ed3e087e71f1d0d1895acb50acf9c7b"}, +] + [[package]] name = "requests" version = "2.32.3" diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml index 14c3e49c5..098bfdd57 100644 --- a/argilla-server/pyproject.toml +++ b/argilla-server/pyproject.toml @@ -66,6 +66,8 @@ dependencies = [ "Jinja2>=3.1.4", # Used by huggingface-hub to render dataset card templates # For file storage "minio>=7.2.7", + # For document processing + "ocrmypdf>=16.10.4" ] [project.optional-dependencies] diff --git a/argilla-server/src/argilla_server/api/schemas/v1/segments.py b/argilla-server/src/argilla_server/api/schemas/v1/segments.py new file mode 100644 index 000000000..f22da6284 --- /dev/null +++ b/argilla-server/src/argilla_server/api/schemas/v1/segments.py @@ -0,0 +1,122 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import uuid +from typing import Optional, Any, List, Union + +from pydantic import BaseModel, Field, validator + +""" +This is deprecated code that is outdated and should be used for reference only. 
+We may want to switch to using LlamaIndexDocument or other document models in the future. +""" + + +class Segments(BaseModel): + items: List[Union["TextSegment", "TableSegment", "FigureSegment"]] = Field( + default_factory=list, + description="List of segments in the reading order of the document", + ) + + def get(self, id: str, header: str | None = None, default=None): + for item in self.items: + if item.id == id or (header and item.header == header): + return item + + return default + + def __repr_str__(self, join_str: str) -> str: + return "\n " + f"{join_str}\n ".join(f"{type(item).__name__}({item})" for item in self.items) + + @validator("items", pre=True, each_item=True) + def parse_segments(cls, v): + if not isinstance(v, dict): + v = v.dict() + + segment_type = v.get("type", "").lower() + if segment_type in {"figure", "image"}: + return FigureSegment(**v) + elif segment_type == "table" or "html" in v: + return TableSegment(**v) + else: + return TextSegment(**v) + + def __getitem__(self, index): + return self.items[index] + + def __len__(self): + return len(self.items) + + +class Coordinates(BaseModel): + points: List[List[float]] = Field( + ..., description="List of 4 points, e.g. 
[[x1, y1], [x2, y1], [x1, y2], [x2, y2]]" + ) + layout_width: Optional[int] = Field(None, description="Width of the layout") + layout_height: Optional[int] = Field(None, description="Height of the layout") + system: Optional[str] = Field(description="System of coordinates") + + def __repr_str__(self, join_str: str) -> str: + return "" + + +class TextSegment(BaseModel): + id: str = Field( + default_factory=lambda: str(uuid.uuid4()), description="Unique identifier of the segment", repr=False + ) + + header: Optional[str] = Field( + None, + description="Header of the element", + ) + text: str = Field(..., description="Content as plain text", repr=False) + summary: Optional[str] = Field(None, description="Summary of the content") + page_number: Optional[int] = Field(None, description="Page number of the segment") + coordinates: Optional["Coordinates"] = Field( + None, description="Coordinates of the element in the document", repr=False + ) + level: Optional[int] = Field(None, description="Level of the header") + source: Optional[str] = Field(None, description="Source of the element", repr=False) + type: Optional[str] = Field("text", description="Type of the element", repr=False) + original: Optional[Any] = Field( + None, exclude=True, description="Original object from which the segment was extracted", repr=False + ) + + def text_cleaned(self): + return self.text.replace(" | ", " ").replace("---", "").strip() + + def __repr_str__(self, join_str: str) -> str: + return join_str.join( + repr(v) + if a is None + else ( + f'{a}="{v[:100]}...{v[-100:]}"'.replace("\n", "") + if isinstance(v, str) and len(v) > 200 + else f"{a}={v!r}" + ) + for a, v in self.__repr_args__() + if v and a not in {"INCLUDE_METADATA_KEYS"} + ) + + +class TableSegment(TextSegment): + footer: Optional[str] = Field(None, description="Footer of the table or figure, to explain variable acronyms.") + html: Optional[str] = Field(None, description="Content as HTML structured", repr=False) + image: 
Optional[str] = Field(None, description="URL/filepath of the element's image", repr=False) + probability: Optional[float] = Field(None, description="Probability or confidence of the segment's extraction") + type: Optional[str] = Field("table", description="Type of the element", repr=False) + + +class FigureSegment(TableSegment): + type: Optional[str] = Field("figure", description="Type of the element", repr=False) diff --git a/argilla-server/src/argilla_server/contexts/document/figures.py b/argilla-server/src/argilla_server/contexts/document/figures.py new file mode 100644 index 000000000..fb5dffc96 --- /dev/null +++ b/argilla-server/src/argilla_server/contexts/document/figures.py @@ -0,0 +1,14 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py new file mode 100644 index 000000000..6b28a8ebb --- /dev/null +++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -0,0 +1,158 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Document preprocessing utilities.""" + +import logging +import os +import tempfile +import time +from io import BytesIO +from uuid import uuid4 + +try: + import ocrmypdf + + OCRMYPDF_AVAILABLE = True +except ImportError: + OCRMYPDF_AVAILABLE = False + +logger = logging.getLogger(__name__) + + +def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes: + """ + Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation. + Works with bytes data and returns processed bytes, minimizing disk I/O. + + Args: + file_data: PDF file data as bytes + filename: Original filename for logging purposes + + Returns: + Processed PDF data as bytes (or original bytes if processing fails) + """ + if not OCRMYPDF_AVAILABLE: + logger.warning("OCRmyPDF not available, skipping preprocessing") + return file_data + + # Only process PDF files + if not filename.lower().endswith(".pdf"): + logger.debug(f"Skipping OCRmyPDF for non-PDF file: {filename}") + return file_data + + try: + logger.info(f"Starting OCRmyPDF preprocessing for: {filename}") + start_time = time.time() + + # Try using BytesIO objects first to minimize disk I/O + try: + input_buffer = BytesIO(file_data) + output_buffer = BytesIO() + + # OCRmyPDF configuration for optimal processing + ocrmypdf.ocr( + input_buffer, + output_buffer, + language=["eng"], # Can be configured for other languages + rotate_pages=True, # Auto-rotate pages with horizontal text + deskew=True, # Fix skewed text + clean=True, # Clean up artifacts + optimize=1, # Optimize output file size + pdf_renderer="hocr", # Use hOCR 
for better text positioning + force_ocr=False, # Only OCR pages that need it + skip_text=False, # Don't skip existing text + redo_ocr=False, # Don't redo existing OCR + progress_bar=False, + quiet=True, + ) + + # Get processed PDF data + processed_data = output_buffer.getvalue() + output_buffer.close() + input_buffer.close() + + except Exception as buffer_error: + # Fallback to temporary files if BytesIO approach fails + logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") + processed_data = _preprocess_pdf_with_temp_files(file_data, filename) + + processing_time = time.time() - start_time + logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds") + + return processed_data + + except Exception as e: + logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}") + return file_data + + +def _preprocess_pdf_with_temp_files(file_data: bytes, filename: str) -> bytes: + """ + Fallback implementation using unique temporary files to avoid concurrency issues. 
+ """ + input_temp_file = None + output_temp_file = None + + try: + # Generate unique identifiers to avoid filename collisions in concurrent jobs + unique_id = str(uuid4()) + temp_dir = tempfile.gettempdir() + + # Create input temp file with unique identifier + input_temp_file = tempfile.NamedTemporaryFile( + suffix=".pdf", prefix=f"ocr_input_{unique_id}_", dir=temp_dir, delete=False + ) + input_temp_file.write(file_data) + input_temp_file.flush() + input_temp_file.close() + + # Create output temp file with unique identifier + output_temp_file = tempfile.NamedTemporaryFile( + suffix=".pdf", prefix=f"ocr_output_{unique_id}_", dir=temp_dir, delete=False + ) + output_temp_file.close() + + # OCRmyPDF configuration for optimal processing + ocrmypdf.ocr( + input_temp_file.name, + output_temp_file.name, + language=["eng"], # Can be configured for other languages + rotate_pages=True, # Auto-rotate pages with horizontal text + deskew=True, # Fix skewed text + clean=True, # Clean up artifacts + optimize=1, # Optimize output file size + pdf_renderer="hocr", # Use hOCR for better text positioning + force_ocr=False, # Only OCR pages that need it + skip_text=False, # Don't skip existing text + redo_ocr=False, # Don't redo existing OCR + progress_bar=False, + quiet=True, + ) + + # Read processed PDF data + with open(output_temp_file.name, "rb") as f: + processed_data = f.read() + + return processed_data + + finally: + # Clean up temporary files + for temp_file in [input_temp_file, output_temp_file]: + if temp_file is not None: + try: + if hasattr(temp_file, "name"): + os.unlink(temp_file.name) + except OSError as e: + logger.warning(f"Failed to clean up temp file: {e}") diff --git a/argilla-server/src/argilla_server/contexts/document/tables.py b/argilla-server/src/argilla_server/contexts/document/tables.py new file mode 100644 index 000000000..fb5dffc96 --- /dev/null +++ b/argilla-server/src/argilla_server/contexts/document/tables.py @@ -0,0 +1,14 @@ +# Copyright 2024-present, 
Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/argilla-server/src/argilla_server/jobs/document_jobs.py b/argilla-server/src/argilla_server/jobs/document_jobs.py index 3e2fbfa0d..8f3f99e90 100644 --- a/argilla-server/src/argilla_server/jobs/document_jobs.py +++ b/argilla-server/src/argilla_server/jobs/document_jobs.py @@ -26,6 +26,7 @@ from argilla_server.jobs import DEFAULT_QUEUE, JOB_TIMEOUT_DISABLED from argilla_server.api.schemas.v1.documents import DocumentCreate from argilla_server.contexts import files, imports +from argilla_server.contexts.document import preprocessing _LOGGER = logging.getLogger(__name__) @@ -129,11 +130,16 @@ async def upload_reference_documents_job( continue try: + # Preprocess PDF files with OCRmyPDF for rotation and OCR + processed_file_data = preprocessing.preprocess_pdf_with_ocrmypdf( + file_data=file_data, filename=filename + ) + file_url = files.put_document_file( client=client, workspace_name=workspace.name, document_id=file_document_create.id, # type: ignore - file_data=file_data, + file_data=processed_file_data, filename=filename, # metadata=file_document_create.model_dump( # include={"file_name": True, "pmid": True, "doi": True} From b4e15e617f7540a255bdbd1d944bdb6958533e0e Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 4 Aug 2025 23:36:30 -0700 Subject: [PATCH 02/22] refactor: enhance PDF preprocessing with configurable settings and integrate OCRmyPDF --- argilla-server/pdm.lock | 468 
+----------------- argilla-server/pyproject.toml | 5 +- .../contexts/document/preprocessing.py | 288 +++++++---- .../src/argilla_server/jobs/document_jobs.py | 8 +- 4 files changed, 214 insertions(+), 555 deletions(-) diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock index c40a808db..8403ffc7e 100644 --- a/argilla-server/pdm.lock +++ b/argilla-server/pdm.lock @@ -5,10 +5,10 @@ groups = ["default", "postgresql", "test"] strategy = [] lock_version = "4.5.0" -content_hash = "sha256:21af2a3f2dff0688e08aed10184370c7c0aed450f814cf0d3509e964d9582654" +content_hash = "sha256:f20406357dc5b02a37c1da8689053074229d1204cb9bcb4fe8848b6d2835b1b4" [[metadata.targets]] -requires_python = ">=3.9" +requires_python = ">=3.10" [[package]] name = "aiofiles" @@ -102,21 +102,6 @@ files = [ {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, - {file = 
"aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, - {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, - {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, ] @@ -264,14 +249,6 @@ files = [ {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba"}, {file = "asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590"}, {file = 
"asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e"}, - {file = "asyncpg-0.30.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad"}, - {file = "asyncpg-0.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff"}, - {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708"}, - {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144"}, - {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb"}, - {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547"}, - {file = "asyncpg-0.30.0-cp39-cp39-win32.whl", hash = "sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a"}, - {file = "asyncpg-0.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773"}, {file = "asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851"}, ] @@ -322,8 +299,6 @@ files = [ {file = "bcrypt-4.2.0-cp39-abi3-win_amd64.whl", hash = "sha256:61ed14326ee023917ecd093ee6ef422a72f3aec6f07e21ea5f10622b735538a9"}, {file = "bcrypt-4.2.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:39e1d30c7233cfc54f5c3f2c825156fe044efdd3e0b9d309512cc514a263ec2a"}, {file = "bcrypt-4.2.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f4f4acf526fcd1c34e7ce851147deedd4e26e6402369304220250598b26448db"}, - {file = "bcrypt-4.2.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = 
"sha256:1ff39b78a52cf03fdf902635e4c81e544714861ba3f0efc56558979dd4f09170"}, - {file = "bcrypt-4.2.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:373db9abe198e8e2c70d12b479464e0d5092cc122b20ec504097b5f2297ed184"}, {file = "bcrypt-4.2.0.tar.gz", hash = "sha256:cf69eaf5185fd58f268f805b505ce31f9b9fc2d64b376642164e9244540c1221"}, ] @@ -394,22 +369,6 @@ files = [ {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, - {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, - {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7905193081db9bfa73b1219140b3d315831cbff0d8941f22da695832f0dd188f"}, - {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a77def80806c421b4b0af06f45d65a136e7ac0bdca3c09d9e2ea4e515367c7e9"}, - {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dadd1314583ec0bf2d1379f7008ad627cd6336625d6679cf2f8e67081b83acf"}, - {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:901032ff242d479a0efa956d853d16875d42157f98951c0230f69e69f9c09bac"}, - {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:22fc2a8549ffe699bfba2256ab2ed0421a7b8fadff114a3d201794e45a9ff578"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ae15b066e5ad21366600ebec29a7ccbc86812ed267e4b28e860b8ca16a2bc474"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, - {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, - {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, ] @@ -489,18 +448,6 @@ files = [ {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, - {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, - {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, - {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, - {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, - {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, - {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, - {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] @@ -569,21 +516,6 @@ files = [ {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = 
"sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash 
= "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] @@ -664,16 +596,6 @@ files = [ {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc8ff50b50ce532de2fa7a7daae9dd12f0a699bfcd47f20945364e5c31799fef"}, {file = "coverage-7.6.4-cp313-cp313t-win32.whl", hash = "sha256:b8d3a03d9bfcaf5b0141d07a88456bb6a4c3ce55c080712fec8418ef3610230e"}, {file = "coverage-7.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:f3ddf056d3ebcf6ce47bdaf56142af51bb7fad09e4af310241e9db7a3a8022e1"}, - {file = "coverage-7.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cb7fa111d21a6b55cbf633039f7bc2749e74932e3aa7cb7333f675a58a58bf3"}, - {file = "coverage-7.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11a223a14e91a4693d2d0755c7a043db43d96a7450b4f356d506c2562c48642c"}, - {file = "coverage-7.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a413a096c4cbac202433c850ee43fa326d2e871b24554da8327b01632673a076"}, - {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:00a1d69c112ff5149cabe60d2e2ee948752c975d95f1e1096742e6077affd376"}, - {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f76846299ba5c54d12c91d776d9605ae33f8ae2b9d1d3c3703cf2db1a67f2c0"}, - {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe439416eb6380de434886b00c859304338f8b19f6f54811984f3420a2e03858"}, - {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0294ca37f1ba500667b1aef631e48d875ced93ad5e06fa665a3295bdd1d95111"}, - {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f01ba56b1c0e9d149f9ac85a2f999724895229eb36bd997b61e62999e9b0901"}, - {file = "coverage-7.6.4-cp39-cp39-win32.whl", hash = "sha256:bc66f0bf1d7730a17430a50163bb264ba9ded56739112368ba985ddaa9c3bd09"}, - {file = "coverage-7.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:c481b47f6b5845064c65a7bc78bc0860e635a9b055af0df46fdf1c58cebf8e8f"}, {file = "coverage-7.6.4-pp39.pp310-none-any.whl", hash = "sha256:3c65d37f3a9ebb703e710befdc489a38683a5b152242664b973a7b7b22348a4e"}, {file = "coverage-7.6.4.tar.gz", hash = "sha256:29fc0f17b1d3fea332f8001d4558f8214af7f1d87a345f3a133c901d60347c73"}, ] @@ -738,16 +660,6 @@ files = [ {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc8ff50b50ce532de2fa7a7daae9dd12f0a699bfcd47f20945364e5c31799fef"}, {file = "coverage-7.6.4-cp313-cp313t-win32.whl", hash = "sha256:b8d3a03d9bfcaf5b0141d07a88456bb6a4c3ce55c080712fec8418ef3610230e"}, {file = "coverage-7.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:f3ddf056d3ebcf6ce47bdaf56142af51bb7fad09e4af310241e9db7a3a8022e1"}, - {file = "coverage-7.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cb7fa111d21a6b55cbf633039f7bc2749e74932e3aa7cb7333f675a58a58bf3"}, - {file = "coverage-7.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11a223a14e91a4693d2d0755c7a043db43d96a7450b4f356d506c2562c48642c"}, - {file = 
"coverage-7.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a413a096c4cbac202433c850ee43fa326d2e871b24554da8327b01632673a076"}, - {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00a1d69c112ff5149cabe60d2e2ee948752c975d95f1e1096742e6077affd376"}, - {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f76846299ba5c54d12c91d776d9605ae33f8ae2b9d1d3c3703cf2db1a67f2c0"}, - {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe439416eb6380de434886b00c859304338f8b19f6f54811984f3420a2e03858"}, - {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0294ca37f1ba500667b1aef631e48d875ced93ad5e06fa665a3295bdd1d95111"}, - {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f01ba56b1c0e9d149f9ac85a2f999724895229eb36bd997b61e62999e9b0901"}, - {file = "coverage-7.6.4-cp39-cp39-win32.whl", hash = "sha256:bc66f0bf1d7730a17430a50163bb264ba9ded56739112368ba985ddaa9c3bd09"}, - {file = "coverage-7.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:c481b47f6b5845064c65a7bc78bc0860e635a9b055af0df46fdf1c58cebf8e8f"}, {file = "coverage-7.6.4-pp39.pp310-none-any.whl", hash = "sha256:3c65d37f3a9ebb703e710befdc489a38683a5b152242664b973a7b7b22348a4e"}, {file = "coverage-7.6.4.tar.gz", hash = "sha256:29fc0f17b1d3fea332f8001d4558f8214af7f1d87a345f3a133c901d60347c73"}, ] @@ -782,10 +694,6 @@ files = [ {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a2a431ee15799d6db9fe80c82b055bae5a752bef645bba795e8e52687c69efe3"}, {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:281c945d0e28c92ca5e5930664c1cefd85efe80e5c0d2bc58dd63383fda29f83"}, {file = "cryptography-43.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:f18c716be16bc1fea8e95def49edf46b82fccaa88587a45f8dc0ff6ab5d8e0a7"}, - {file = "cryptography-43.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:4a02ded6cd4f0a5562a8887df8b3bd14e822a90f97ac5e544c162899bc467664"}, - {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:53a583b6637ab4c4e3591a15bc9db855b8d9dee9a669b550f311480acab6eb08"}, - {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1ec0bcf7e17c0c5669d881b1cd38c4972fade441b27bda1051665faaa89bdcaa"}, - {file = "cryptography-43.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ce6fae5bdad59577b44e4dfed356944fbf1d925269114c28be377692643b4ff"}, {file = "cryptography-43.0.3.tar.gz", hash = "sha256:315b9001266a492a6ff443b61238f956b214dbec9910a081ba5b6646a055a805"}, ] @@ -1029,21 +937,6 @@ files = [ {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"}, {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"}, {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"}, - {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"}, - {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"}, - {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"}, - {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"}, - {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"}, {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"}, {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, ] @@ -1119,16 +1012,6 @@ files = [ {file = 
"greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"}, {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"}, {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"}, - {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"}, - {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"}, - {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"}, - {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"}, - {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"}, - {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"}, - {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"}, - {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"}, - {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"}, - {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = 
"sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"}, {file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"}, ] @@ -1187,13 +1070,6 @@ files = [ {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5"}, {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0"}, {file = "httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8"}, - {file = "httptools-0.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003"}, - {file = "httptools-0.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab"}, - {file = "httptools-0.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547"}, - {file = "httptools-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9"}, - {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076"}, - {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd"}, - {file = "httptools-0.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6"}, {file = "httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c"}, ] @@ -1252,18 +1128,6 @@ files = [ {file = "img2pdf-0.6.1.tar.gz", hash = 
"sha256:306e279eb832bc159d7d6294b697a9fbd11b4be1f799b14b3b2174fb506af289"}, ] -[[package]] -name = "importlib-metadata" -version = "8.5.0" -summary = "" -dependencies = [ - "zipp; python_full_version < \"3.10\"", -] -files = [ - {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, - {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, -] - [[package]] name = "iniconfig" version = "2.0.0" @@ -1350,32 +1214,12 @@ files = [ {file = "lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181"}, {file = "lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e"}, {file = "lxml-6.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03"}, - {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261"}, - {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316"}, - {file = "lxml-6.0.0-cp39-cp39-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512"}, - {file = "lxml-6.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f"}, - {file = "lxml-6.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a"}, - {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93"}, - {file = 
"lxml-6.0.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281"}, - {file = "lxml-6.0.0-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922"}, - {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0"}, - {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a"}, - {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5"}, - {file = "lxml-6.0.0-cp39-cp39-win32.whl", hash = "sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256"}, - {file = "lxml-6.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae"}, - {file = "lxml-6.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402"}, {file = "lxml-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72"}, {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4"}, {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc"}, {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8"}, {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065"}, {file = 
"lxml-6.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141"}, - {file = "lxml-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5"}, - {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06"}, - {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9"}, - {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca"}, - {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734"}, - {file = "lxml-6.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349"}, {file = "lxml-6.0.0.tar.gz", hash = "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72"}, ] @@ -1458,16 +1302,6 @@ files = [ {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, - {file = 
"MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] @@ -1564,21 +1398,6 @@ files = [ {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, - {file = 
"multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, - {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = 
"sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, - {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, ] @@ -1593,8 +1412,6 @@ dependencies = [ files = [ {file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"}, {file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"}, - {file = "multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41"}, - {file = "multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a"}, {file = "multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02"}, {file = "multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a"}, {file = "multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e"}, @@ -1632,17 +1449,6 @@ files = [ {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, - {file = 
"numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, - {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, - {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, - {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, - {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, - {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, - {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, - {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] @@ -1655,41 +1461,20 @@ files = [ {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, ] -[[package]] -name = "ocrmypdf" -version = "15.4.4" -summary = "" -dependencies = [ - "deprecation; 
python_full_version < \"3.10\"", - "img2pdf; python_full_version < \"3.10\"", - "packaging; python_full_version < \"3.10\"", - "pdfminer-six; python_full_version < \"3.10\"", - "pikepdf; python_full_version < \"3.10\"", - "pillow; python_full_version < \"3.10\"", - "pluggy; python_full_version < \"3.10\"", - "reportlab; python_full_version < \"3.10\"", - "rich; python_full_version < \"3.10\"", - "typing-extensions; python_full_version < \"3.10\"", -] -files = [ - {file = "ocrmypdf-15.4.4-py39-none-any.whl", hash = "sha256:13fd388035b5f4bb673bff570cfc2cf72e51168646d5401de9e48ca355917c6d"}, - {file = "ocrmypdf-15.4.4.tar.gz", hash = "sha256:4696c81cc5b5d64f31ccfe685d10baeb69b42bb0974acddf292d8cf9d97605c3"}, -] - [[package]] name = "ocrmypdf" version = "16.10.4" summary = "" dependencies = [ - "deprecation; python_full_version >= \"3.10\"", - "img2pdf; python_full_version >= \"3.10\"", - "packaging; python_full_version >= \"3.10\"", - "pdfminer-six; python_full_version >= \"3.10\"", - "pi-heif; python_full_version >= \"3.10\"", - "pikepdf; python_full_version >= \"3.10\"", - "pillow; python_full_version >= \"3.10\"", - "pluggy; python_full_version >= \"3.10\"", - "rich; python_full_version >= \"3.10\"", + "deprecation", + "img2pdf", + "packaging", + "pdfminer-six", + "pi-heif", + "pikepdf", + "pillow", + "pluggy", + "rich", ] files = [ {file = "ocrmypdf-16.10.4-py3-none-any.whl", hash = "sha256:061f3165d09ffafac975cea00803802b8a75551ada9965292ea86ea382673688"}, @@ -1764,16 +1549,21 @@ files = [ {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, - {file = 
"pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, - {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, - {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, ] +[[package]] +name = "pdf2image" +version = "1.17.0" +summary = "" +dependencies = [ + "pillow", +] +files = [ + {file = "pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2"}, + {file = "pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57"}, +] + [[package]] name = "pdfminer-six" version = "20250506" @@ -1792,7 +1582,7 @@ name = "pi-heif" version = "0.22.0" summary = "" dependencies = [ - "pillow; python_full_version >= \"3.10\"", + "pillow", ] files = [ {file = "pi_heif-0.22.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:fca84436339eee2c91ff09cd7e301cfa2a0f7a9d83d5bc6a9d1db8587221d239"}, @@ -1823,23 +1613,11 @@ files = [ {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:a89b57cd839b09ee749d12397d2027e20fe7a64a44883688ab44a873b16b507b"}, {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93acd60ef14e3ea835b7e3dafe284c07116349b0df05507520f10520c3ad09c1"}, {file = "pi_heif-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:6415b0005216ad08f86d0ef75ec24e13e60bf5f45273ab54a4a22f008b9f41ac"}, - {file = "pi_heif-0.22.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:3f85ac3c0e2fb18af10e5b9789dcfd73f091b1d6ea2090d70d6e87f8744b8fe9"}, - {file = "pi_heif-0.22.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2635cbcf35206dd3d7f6453df8a6a5cd6a83bcdc9818d999b7342837482d614e"}, - {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:893a49c195563a9bbbef571daad995110b47e3e6b624b92269c281cf1b70b8da"}, - {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b160a20dd6fa9d951a556006f02ec601a433ec4002953fdb67025f42e5fa89ea"}, - {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e2508317837ad6da6b6e2ba154faab766a0cdc189a86dd45b4b7decd641bfa5"}, - {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7a1666070cffce08027b4309fb7f270c0e3a4715a3e5a7a7202b05f65a849f2"}, - {file = "pi_heif-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:c73e651cb17b7da3a740881c479e224084c95380df0d9d4f72d4858a422e80ae"}, {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:6b83ec2f6db2dd61e09940006ee0a854eb58d91a52023be057da13a08a9f0517"}, {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:f33211fa2afa756b13a63e21aeab577cdc7ddb18a929a012cbbcd3b7d8a772d0"}, {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a82bb03e5ab429b6aee5f1446c7c1925b1fb4fd58d74c960c7995734285db269"}, {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:79d72744708949bd9028516d860bd2c341371bca13aa2196e4f2267263834608"}, {file = "pi_heif-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7bb583f93bb4c1dfaf3b6e689a9fa0de7c83182730c16ec8798c459cf8c3e8cf"}, - {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:052fffb0b65c51adf90993a696dd51dddc5f5707d5f40e7bd9f4ad958bb505d9"}, - {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:b326a48001a97906e5eb4110113d0cfe1203704f3572100dd177782568c9fc32"}, - {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8cc68012a870d5e39d8fd5468dfd1d452ca10388cab5fac30f90ddfa0772a3e"}, - {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:350c49ac597d1b8cdaa8a35f2c0901a3847067b9d0a9fdc07d2d6851e5d63382"}, - {file = "pi_heif-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3de6fb5a58cf271897adc31e045db45003ae1e32116efc30fa20c72e1c90b2b"}, {file = "pi_heif-0.22.0.tar.gz", hash = "sha256:489ddda3c9fed948715a9c8642c6ee24c3b438a7fbf85b3a8f097d632d7082a8"}, ] @@ -1882,13 +1660,6 @@ files = [ {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:048f3d5138c44f8c452d818e14130fa30d809f61d70063b6e615e91148342188"}, {file = "pikepdf-9.10.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fff140da5a75b41b4cdf34354366620c206f31fc513356c70cf5da6b81d2483"}, {file = "pikepdf-9.10.2-cp313-cp313-win_amd64.whl", hash = "sha256:1b5af8e233ed232f02e31a281134eed94504c72e9de88326433e34641f04a113"}, - {file = "pikepdf-9.10.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:ed7032dfe0f280e87908e025b22ecd49b230d2b753c4ef66d0f6ce2952f5e721"}, - {file = "pikepdf-9.10.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:9d5f9fa9513e600752acdd81fd1b987b6bf85a36c25779bd9a7e0986626424d7"}, - {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:1899d0d9dd1ebdf13125159029a2c89afc66d87f0f3bcdbca9adbda6ad2bce15"}, - {file = "pikepdf-9.10.2-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77ec60c230f11797e94a0659523c579fd8d25969de9091b2d6c7799868cd60c3"}, - {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ddc1cb0aba4f2fa0d95ed68460688e3efcd3a70973901faf5b8c85e81438bcf"}, - {file = "pikepdf-9.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a0ee549af6560be2c3f7b9c37b4c9c814bcd24249323b0525ba0b00a11988d90"}, - {file = "pikepdf-9.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:f1d7417a1b49d77f13f9e9310e5d122a0e69d5e06afd21e06d12b0baa5cd9578"}, {file = "pikepdf-9.10.2.tar.gz", hash = "sha256:f62fc2183888f2ca1d271bf4faa440a2e2d0159221620a9c6a314f9c9a95680c"}, ] @@ -1949,17 +1720,6 @@ files = [ {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"}, {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"}, {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"}, - {file = "pillow-11.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2e46773dc9f35a1dd28bd6981332fd7f27bec001a918a72a79b4133cf5291dba"}, - {file = "pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2679d2258b7f1192b378e2893a8a0a0ca472234d4c2c0e6bdd3380e8dfa21b6a"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda2616eb2313cbb3eebbe51f19362eb434b18e3bb599466a1ffa76a033fb916"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ec184af98a121fb2da42642dea8a29ec80fc3efbaefb86d8fdd2606619045d"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:8594f42df584e5b4bb9281799698403f7af489fba84c34d53d1c4bfb71b7c4e7"}, - 
{file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:c12b5ae868897c7338519c03049a806af85b9b8c237b7d675b8c5e089e4a618e"}, - {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:70fbbdacd1d271b77b7721fe3cdd2d537bbbd75d29e6300c672ec6bb38d9672f"}, - {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5178952973e588b3f1360868847334e9e3bf49d19e169bbbdfaf8398002419ae"}, - {file = "pillow-11.0.0-cp39-cp39-win32.whl", hash = "sha256:8c676b587da5673d3c75bd67dd2a8cdfeb282ca38a30f37950511766b26858c4"}, - {file = "pillow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:94f3e1780abb45062287b4614a5bc0874519c86a777d4a7ad34978e86428b8dd"}, - {file = "pillow-11.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:290f2cc809f9da7d6d622550bbf4c1e57518212da51b6a30fe8e0a270a5b78bd"}, {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"}, {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"}, {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"}, @@ -1967,10 +1727,6 @@ files = [ {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"}, {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"}, {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5bd2d3bdb846d757055910f0a59792d33b555800813c3b39ada1829c372ccb06"}, - {file = 
"pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:375b8dd15a1f5d2feafff536d47e22f69625c1aa92f12b339ec0b2ca40263273"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:daffdf51ee5db69a82dd127eabecce20729e21f7a3680cf7cbb23f0829189790"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7326a1787e3c7b0429659e0a944725e1b03eeaa10edd945a86dead1913383944"}, {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, ] @@ -2052,22 +1808,6 @@ files = [ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, - {file = 
"propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, - {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, - {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, ] @@ -2098,8 +1838,6 @@ files = [ {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = 
"sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, - {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, - {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, ] @@ -2142,13 +1880,6 @@ files = [ {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a"}, {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79"}, {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420"}, - {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8"}, - {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9"}, - {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8"}, - {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d"}, - {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55"}, - {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = 
"sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03"}, - {file = "pyarrow-18.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2"}, {file = "pyarrow-18.0.0.tar.gz", hash = "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5"}, ] @@ -2202,11 +1933,6 @@ files = [ {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e15c081e912c4b0d75632acd8382dfce45b258667aa3c67caf7a4d4c13f630"}, {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7fc76bf273353dc7e5207d172b83f569540fc9a28d63171061c42e361d22353"}, {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:45c69ad715ca1a94f778215a11e66b7ff989d792a4d63b68dc586a1da1392ff5"}, - {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:865d83c906b0fc6a59b510deceee656b6bc1c4fa0d82176e2b77e97a420a996a"}, - {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89d4d56153efc4d81defe8b65fd0821ef8b2d5ddf8ed19df31ba2f00872b8002"}, - {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3f2d0aaf8080bda0587d58fc9fe4766e012441e2eed4269a77de6aea981c8be"}, - {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64093fc334c1eccfd3933c134c4457c34eaca235eeae49d69449dc4728079339"}, - {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ce64e84a962b63a47a592690bdc16a7eaf709d2c2697ababf24a0def566899a6"}, {file = "pycryptodome-3.23.0.tar.gz", hash = "sha256:447700a657182d60338bab09fdb27518f8856aecd80ae4c6bdddb67ff5da44ef"}, ] @@ -2280,18 +2006,6 @@ files = [ {file = 
"pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"}, {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"}, {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"}, - {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"}, - {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"}, - {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"}, - {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"}, - {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"}, - {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"}, {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"}, {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"}, {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"}, @@ -2300,14 +2014,6 @@ files = [ {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"}, {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"}, {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"}, {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"}, ] @@ -2414,7 +2120,6 @@ name = "pytest-randomly" version = "3.16.0" summary = "" dependencies = [ - "importlib-metadata; python_full_version < \"3.10\"", "pytest", ] files = [ @@ -2542,15 +2247,6 @@ files = [ {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, - {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, - {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] @@ -2566,19 +2262,6 @@ files = [ {file = "redis-5.2.0.tar.gz", hash = "sha256:0b1087665a771b1ff2e003aa5bdd354f15a70c9e25d5a7dbf9c722c16528a7b0"}, ] -[[package]] -name = "reportlab" -version = "4.4.3" -summary = "" -dependencies = [ - "charset-normalizer; python_full_version < \"3.10\"", - "pillow; python_full_version < \"3.10\"", -] -files = [ - {file = "reportlab-4.4.3-py3-none-any.whl", hash = "sha256:df905dc5ec5ddaae91fc9cb3371af863311271d555236410954961c5ee6ee1b5"}, - {file = "reportlab-4.4.3.tar.gz", hash = "sha256:073b0975dab69536acd3251858e6b0524ed3e087e71f1d0d1895acb50acf9c7b"}, -] - [[package]] name = "requests" version = "2.32.3" @@ -2735,14 +2418,6 @@ files = [ {file = "SQLAlchemy-2.0.36-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a121d62ebe7d26fec9155f83f8be5189ef1405f5973ea4874a26fab9f1e262c"}, {file = "SQLAlchemy-2.0.36-cp313-cp313-win32.whl", hash = "sha256:0572f4bd6f94752167adfd7c1bed84f4b240ee6203a95e05d1e208d488d0d436"}, {file = "SQLAlchemy-2.0.36-cp313-cp313-win_amd64.whl", hash = 
"sha256:8c78ac40bde930c60e0f78b3cd184c580f89456dd87fc08f9e3ee3ce8765ce88"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dc022184d3e5cacc9579e41805a681187650e170eb2fd70e28b86192a479dcaa"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b817d41d692bf286abc181f8af476c4fbef3fd05e798777492618378448ee689"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e46a888b54be23d03a89be510f24a7652fe6ff660787b96cd0e57a4ebcb46d"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4ae3005ed83f5967f961fd091f2f8c5329161f69ce8480aa8168b2d7fe37f06"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03e08af7a5f9386a43919eda9de33ffda16b44eb11f3b313e6822243770e9763"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3dbb986bad3ed5ceaf090200eba750b5245150bd97d3e67343a3cfed06feecf7"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-win32.whl", hash = "sha256:9fe53b404f24789b5ea9003fc25b9a3988feddebd7e7b369c8fac27ad6f52f28"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-win_amd64.whl", hash = "sha256:af148a33ff0349f53512a049c6406923e4e02bf2f26c5fb285f143faf4f0e46a"}, {file = "SQLAlchemy-2.0.36-py3-none-any.whl", hash = "sha256:fddbe92b4760c6f5d48162aef14824add991aeda8ddadb3c31d56eb15ca69f8e"}, {file = "sqlalchemy-2.0.36.tar.gz", hash = "sha256:7f2767680b6d2398aea7082e45a774b2b0767b5c8d8ffb9c8b683088ea9b29c5"}, ] @@ -2769,7 +2444,6 @@ version = "0.41.2" summary = "" dependencies = [ "anyio", - "typing-extensions; python_full_version < \"3.10\"", ] files = [ {file = "starlette-0.41.2-py3-none-any.whl", hash = "sha256:fbc189474b4731cf30fcef52f18a8d070e3f3b46c6a04c97579e85e6ffca942d"}, @@ -2918,12 +2592,6 @@ files = [ {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816"}, {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc"}, {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553"}, - {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b"}, - {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2"}, - {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0"}, - {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75"}, - {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd"}, - {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff"}, {file = "uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3"}, ] @@ -2985,26 +2653,10 @@ files = [ {file = "watchfiles-0.24.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:edf71b01dec9f766fb285b73930f95f730bb0943500ba0566ae234b5c1618c18"}, {file = "watchfiles-0.24.0-cp313-none-win32.whl", hash = "sha256:f4c96283fca3ee09fb044f02156d9570d156698bc3734252175a38f0e8975f07"}, {file = "watchfiles-0.24.0-cp313-none-win_amd64.whl", hash = "sha256:a974231b4fdd1bb7f62064a0565a6b107d27d21d9acb50c484d2cdba515b9366"}, - {file = "watchfiles-0.24.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = 
"sha256:b665caeeda58625c3946ad7308fbd88a086ee51ccb706307e5b1fa91556ac886"}, - {file = "watchfiles-0.24.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c51749f3e4e269231510da426ce4a44beb98db2dce9097225c338f815b05d4f"}, - {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82b2509f08761f29a0fdad35f7e1638b8ab1adfa2666d41b794090361fb8b855"}, - {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a60e2bf9dc6afe7f743e7c9b149d1fdd6dbf35153c78fe3a14ae1a9aee3d98b"}, - {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7d9b87c4c55e3ea8881dfcbf6d61ea6775fffed1fedffaa60bd047d3c08c430"}, - {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:78470906a6be5199524641f538bd2c56bb809cd4bf29a566a75051610bc982c3"}, - {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07cdef0c84c03375f4e24642ef8d8178e533596b229d32d2bbd69e5128ede02a"}, - {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d337193bbf3e45171c8025e291530fb7548a93c45253897cd764a6a71c937ed9"}, - {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ec39698c45b11d9694a1b635a70946a5bad066b593af863460a8e600f0dff1ca"}, - {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2e28d91ef48eab0afb939fa446d8ebe77e2f7593f5f463fd2bb2b14132f95b6e"}, - {file = "watchfiles-0.24.0-cp39-none-win32.whl", hash = "sha256:7138eff8baa883aeaa074359daabb8b6c1e73ffe69d5accdc907d62e50b1c0da"}, - {file = "watchfiles-0.24.0-cp39-none-win_amd64.whl", hash = "sha256:b3ef2c69c655db63deb96b3c3e587084612f9b1fa983df5e0c3379d41307467f"}, {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:632676574429bee8c26be8af52af20e0c718cc7f5f67f3fb658c71928ccd4f7f"}, {file = 
"watchfiles-0.24.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a2a9891723a735d3e2540651184be6fd5b96880c08ffe1a98bae5017e65b544b"}, {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7fa2bc0efef3e209a8199fd111b8969fe9db9c711acc46636686331eda7dd4"}, {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01550ccf1d0aed6ea375ef259706af76ad009ef5b0203a3a4cce0f6024f9b68a"}, - {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:96619302d4374de5e2345b2b622dc481257a99431277662c30f606f3e22f42be"}, - {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:85d5f0c7771dcc7a26c7a27145059b6bb0ce06e4e751ed76cdf123d7039b60b5"}, - {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:951088d12d339690a92cef2ec5d3cfd957692834c72ffd570ea76a6790222777"}, - {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49fb58bcaa343fedc6a9e91f90195b20ccb3135447dc9e4e2570c3a39565853e"}, {file = "watchfiles-0.24.0.tar.gz", hash = "sha256:afb72325b74fa7a428c009c1b8be4b4d7c2afedafb2982827ef2156646df2fe1"}, ] @@ -3057,29 +2709,12 @@ files = [ {file = "websockets-13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:70c5be9f416aa72aab7a2a76c90ae0a4fe2755c1816c153c1a2bcc3333ce4ce6"}, {file = "websockets-13.1-cp313-cp313-win32.whl", hash = "sha256:624459daabeb310d3815b276c1adef475b3e6804abaf2d9d2c061c319f7f187d"}, {file = "websockets-13.1-cp313-cp313-win_amd64.whl", hash = "sha256:c518e84bb59c2baae725accd355c8dc517b4a3ed8db88b4bc93c78dae2974bf2"}, - {file = "websockets-13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9b37c184f8b976f0c0a231a5f3d6efe10807d41ccbe4488df8c74174805eea7d"}, - {file = "websockets-13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:163e7277e1a0bd9fb3c8842a71661ad19c6aa7bb3d6678dc7f89b17fbcc4aeb7"}, - {file = "websockets-13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4b889dbd1342820cc210ba44307cf75ae5f2f96226c0038094455a96e64fb07a"}, - {file = "websockets-13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:586a356928692c1fed0eca68b4d1c2cbbd1ca2acf2ac7e7ebd3b9052582deefa"}, - {file = "websockets-13.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7bd6abf1e070a6b72bfeb71049d6ad286852e285f146682bf30d0296f5fbadfa"}, - {file = "websockets-13.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2aad13a200e5934f5a6767492fb07151e1de1d6079c003ab31e1823733ae79"}, - {file = "websockets-13.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:df01aea34b6e9e33572c35cd16bae5a47785e7d5c8cb2b54b2acdb9678315a17"}, - {file = "websockets-13.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e54affdeb21026329fb0744ad187cf812f7d3c2aa702a5edb562b325191fcab6"}, - {file = "websockets-13.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ef8aa8bdbac47f4968a5d66462a2a0935d044bf35c0e5a8af152d58516dbeb5"}, - {file = "websockets-13.1-cp39-cp39-win32.whl", hash = "sha256:deeb929efe52bed518f6eb2ddc00cc496366a14c726005726ad62c2dd9017a3c"}, - {file = "websockets-13.1-cp39-cp39-win_amd64.whl", hash = "sha256:7c65ffa900e7cc958cd088b9a9157a8141c991f8c53d11087e6fb7277a03f81d"}, {file = "websockets-13.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5dd6da9bec02735931fccec99d97c29f47cc61f644264eb995ad6c0c27667238"}, {file = "websockets-13.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:2510c09d8e8df777177ee3d40cd35450dc169a81e747455cc4197e63f7e7bfe5"}, {file = "websockets-13.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1c3cf67185543730888b20682fb186fc8d0fa6f07ccc3ef4390831ab4b388d9"}, {file = 
"websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcc03c8b72267e97b49149e4863d57c2d77f13fae12066622dc78fe322490fe6"}, {file = "websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004280a140f220c812e65f36944a9ca92d766b6cc4560be652a0a3883a79ed8a"}, {file = "websockets-13.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2620453c075abeb0daa949a292e19f56de518988e079c36478bacf9546ced23"}, - {file = "websockets-13.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:25c35bf84bf7c7369d247f0b8cfa157f989862c49104c5cf85cb5436a641d93e"}, - {file = "websockets-13.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:83f91d8a9bb404b8c2c41a707ac7f7f75b9442a0a876df295de27251a856ad09"}, - {file = "websockets-13.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a43cfdcddd07f4ca2b1afb459824dd3c6d53a51410636a2c7fc97b9a8cf4842"}, - {file = "websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48a2ef1381632a2f0cb4efeff34efa97901c9fbc118e01951ad7cfc10601a9bb"}, - {file = "websockets-13.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:459bf774c754c35dbb487360b12c5727adab887f1622b8aed5755880a21c4a20"}, - {file = "websockets-13.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:95858ca14a9f6fa8413d29e0a585b31b278388aa775b8a81fa24830123874678"}, {file = "websockets-13.1-py3-none-any.whl", hash = "sha256:a9a396a6ad26130cdae92ae10c36af09d9bfe6cafe69670fd3b6da9b07b4044f"}, {file = "websockets-13.1.tar.gz", hash = "sha256:a3b3366087c1bc0a2795111edcadddb8b3b59509d5db5d7ea3fdd69f954a8878"}, ] @@ -3119,16 +2754,6 @@ files = [ {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, - {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, - {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, {file = "wrapt-1.16.0-py3-none-any.whl", hash = 
"sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, ] @@ -3198,31 +2823,11 @@ files = [ {file = "xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637"}, {file = "xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43"}, {file = "xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b"}, - {file = "xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301"}, - {file = "xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = 
"sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606"}, - {file = "xxhash-3.5.0-cp39-cp39-win32.whl", hash = "sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4"}, - {file = "xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558"}, - {file = "xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e"}, {file = "xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c"}, {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986"}, {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6"}, {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b"}, {file = "xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240"}, {file = "xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f"}, ] @@ -3300,31 +2905,6 @@ files = [ {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5b937c216b6dee8b858c6afea958de03c5ff28406257d22b55c24962a2baf6fd"}, {file = "yarl-1.17.0-cp313-cp313-win32.whl", hash = "sha256:d0131b14cb545c1a7bd98f4565a3e9bdf25a1bd65c83fc156ee5d8a8499ec4a3"}, {file = "yarl-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:01c96efa4313c01329e88b7e9e9e1b2fc671580270ddefdd41129fa8d0db7696"}, - {file = "yarl-1.17.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0d44f67e193f0a7acdf552ecb4d1956a3a276c68e7952471add9f93093d1c30d"}, - {file = "yarl-1.17.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:16ea0aa5f890cdcb7ae700dffa0397ed6c280840f637cd07bffcbe4b8d68b985"}, - {file = "yarl-1.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cf5469dc7dcfa65edf5cc3a6add9f84c5529c6b556729b098e81a09a92e60e51"}, - {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e662bf2f6e90b73cf2095f844e2bc1fda39826472a2aa1959258c3f2a8500a2f"}, - {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8260e88f1446904ba20b558fa8ce5d0ab9102747238e82343e46d056d7304d7e"}, - {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:5dc16477a4a2c71e64c5d3d15d7ae3d3a6bb1e8b955288a9f73c60d2a391282f"}, - {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46027e326cecd55e5950184ec9d86c803f4f6fe4ba6af9944a0e537d643cdbe0"}, - {file = "yarl-1.17.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc95e46c92a2b6f22e70afe07e34dbc03a4acd07d820204a6938798b16f4014f"}, - {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:16ca76c7ac9515320cd09d6cc083d8d13d1803f6ebe212b06ea2505fd66ecff8"}, - {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:eb1a5b97388f2613f9305d78a3473cdf8d80c7034e554d8199d96dcf80c62ac4"}, - {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:41fd5498975418cdc34944060b8fbeec0d48b2741068077222564bea68daf5a6"}, - {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:146ca582ed04a5664ad04b0e0603934281eaab5c0115a5a46cce0b3c061a56a1"}, - {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:6abb8c06107dbec97481b2392dafc41aac091a5d162edf6ed7d624fe7da0587a"}, - {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4d14be4613dd4f96c25feb4bd8c0d8ce0f529ab0ae555a17df5789e69d8ec0c5"}, - {file = "yarl-1.17.0-cp39-cp39-win32.whl", hash = "sha256:174d6a6cad1068f7850702aad0c7b1bca03bcac199ca6026f84531335dfc2646"}, - {file = "yarl-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:6af417ca2c7349b101d3fd557ad96b4cd439fdb6ab0d288e3f64a068eea394d0"}, {file = "yarl-1.17.0-py3-none-any.whl", hash = "sha256:62dd42bb0e49423f4dd58836a04fcf09c80237836796025211bbe913f1524993"}, {file = "yarl-1.17.0.tar.gz", hash = "sha256:d3f13583f378930377e02002b4085a3d025b00402d5a80911726d43a67911cd9"}, ] - -[[package]] -name = "zipp" -version = "3.20.2" -summary = "" -files = [ - {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, - {file 
= "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, -] diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml index 098bfdd57..d9eae2ddc 100644 --- a/argilla-server/pyproject.toml +++ b/argilla-server/pyproject.toml @@ -3,7 +3,7 @@ name = "extralit-server" dynamic = ["version"] description = "Open-source tool for accurate & fast scientific literature data extraction with LLM and human-in-the-loop." readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache-2.0" } keywords = [ "literature-review", @@ -67,7 +67,8 @@ dependencies = [ # For file storage "minio>=7.2.7", # For document processing - "ocrmypdf>=16.10.4" + "ocrmypdf>=16.10.4", + "pdf2image>=1.17.0" ] [project.optional-dependencies] diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py index 6b28a8ebb..c3047be97 100644 --- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py +++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -19,8 +19,12 @@ import tempfile import time from io import BytesIO +from typing import List, Optional from uuid import uuid4 +from pydantic import Field +from pydantic_settings import BaseSettings + try: import ocrmypdf @@ -31,128 +35,202 @@ logger = logging.getLogger(__name__) -def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes: +class PDFPreprocessingSettings(BaseSettings): """ - Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation. - Works with bytes data and returns processed bytes, minimizing disk I/O. + PDF preprocessing settings that can be configured via environment variables. - Args: - file_data: PDF file data as bytes - filename: Original filename for logging purposes + All settings have the PREPROCESSING_ prefix. 
+ """ - Returns: - Processed PDF data as bytes (or original bytes if processing fails) + enabled: bool = Field( + default=True, description="Enable PDF preprocessing with OCRmyPDF. Set to False to disable all processing." + ) + + language: List[str] = Field( + default=["eng"], description="List of languages for OCR processing (e.g., ['eng', 'spa', 'fra'])" + ) + + rotate_pages: bool = Field(default=True, description="Auto-rotate pages with horizontal text") + + deskew: bool = Field(default=True, description="Fix skewed text") + + clean: bool = Field(default=True, description="Clean up artifacts") + + optimize: int = Field( + default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)" + ) + + pdf_renderer: str = Field(default="hocr", description="PDF renderer: 'auto', 'hocr', 'sandwich'") + + force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text") + + skip_text: bool = Field(default=False, description="Skip text-based operations (OCR only for images)") + + redo_ocr: bool = Field(default=False, description="Redo OCR on pages that already have OCR") + + progress_bar: bool = Field(default=False, description="Show progress bar during processing") + + quiet: bool = Field(default=True, description="Suppress OCRmyPDF output messages") + + class Config: + env_prefix = "PREPROCESSING_" + + +class PDFPreprocessor: """ - if not OCRMYPDF_AVAILABLE: - logger.warning("OCRmyPDF not available, skipping preprocessing") - return file_data + PDF preprocessor that uses OCRmyPDF for rotation, OCR, and optimization. - # Only process PDF files - if not filename.lower().endswith(".pdf"): - logger.debug(f"Skipping OCRmyPDF for non-PDF file: {filename}") - return file_data + Can be configured with environment variables using the PDFPreprocessingSettings. 
+ """ - try: - logger.info(f"Starting OCRmyPDF preprocessing for: {filename}") - start_time = time.time() + def __init__(self, settings: Optional[PDFPreprocessingSettings] = None): + """ + Initialize the PDF preprocessor. + + Args: + settings: Optional PDFPreprocessingSettings instance. If None, loads from environment. + """ + self.settings = settings or PDFPreprocessingSettings() + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + if not self.settings.enabled: + self.logger.info("PDF preprocessing is disabled via configuration") + elif not OCRMYPDF_AVAILABLE: + self.logger.warning("OCRmyPDF not available, PDF preprocessing will be skipped") + + def preprocess(self, file_data: bytes, filename: str) -> bytes: + """ + Preprocess PDF with OCRmyPDF using configured settings. + + Args: + file_data: PDF file data as bytes + filename: Original filename for logging purposes + + Returns: + Processed PDF data as bytes (or original bytes if processing fails/disabled) + """ + if not self.settings.enabled: + self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}") + return file_data + + if not OCRMYPDF_AVAILABLE: + self.logger.warning("OCRmyPDF not available, skipping preprocessing") + return file_data + + if not filename.lower().endswith(".pdf"): + return file_data + + try: + start_time = time.time() + self.logger.info(f"Starting OCRmyPDF preprocessing for: {filename}") + + try: + input_buffer = BytesIO(file_data) + output_buffer = BytesIO() + + ocrmypdf.ocr( + input_buffer, + output_buffer, + language=self.settings.language, + rotate_pages=self.settings.rotate_pages, + deskew=self.settings.deskew, + clean=self.settings.clean, + optimize=self.settings.optimize, + pdf_renderer=self.settings.pdf_renderer, + force_ocr=self.settings.force_ocr, + skip_text=self.settings.skip_text, + redo_ocr=self.settings.redo_ocr, + progress_bar=self.settings.progress_bar, + quiet=self.settings.quiet, + ) + + processed_data = output_buffer.getvalue() + 
output_buffer.close() + input_buffer.close() + + except Exception as buffer_error: + self.logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") + processed_data = self._preprocess_with_temp_files(file_data, filename) + + processing_time = time.time() - start_time + self.logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds") + + return processed_data + + except Exception as e: + self.logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}") + return file_data + + def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: + """ + Fallback implementation using unique temporary files to avoid concurrency issues. + """ + input_temp_file = None + output_temp_file = None - # Try using BytesIO objects first to minimize disk I/O try: - input_buffer = BytesIO(file_data) - output_buffer = BytesIO() + unique_id = str(uuid4()) + temp_dir = tempfile.gettempdir() + + input_temp_file = tempfile.NamedTemporaryFile( + suffix=".pdf", prefix=f"ocr_input_{unique_id}_", dir=temp_dir, delete=False + ) + input_temp_file.write(file_data) + input_temp_file.flush() + input_temp_file.close() + + output_temp_file = tempfile.NamedTemporaryFile( + suffix=".pdf", prefix=f"ocr_output_{unique_id}_", dir=temp_dir, delete=False + ) + output_temp_file.close() - # OCRmyPDF configuration for optimal processing ocrmypdf.ocr( - input_buffer, - output_buffer, - language=["eng"], # Can be configured for other languages - rotate_pages=True, # Auto-rotate pages with horizontal text - deskew=True, # Fix skewed text - clean=True, # Clean up artifacts - optimize=1, # Optimize output file size - pdf_renderer="hocr", # Use hOCR for better text positioning - force_ocr=False, # Only OCR pages that need it - skip_text=False, # Don't skip existing text - redo_ocr=False, # Don't redo existing OCR - progress_bar=False, - quiet=True, + input_temp_file.name, + output_temp_file.name, + language=self.settings.language, + 
rotate_pages=self.settings.rotate_pages, + deskew=self.settings.deskew, + clean=self.settings.clean, + optimize=self.settings.optimize, + pdf_renderer=self.settings.pdf_renderer, + force_ocr=self.settings.force_ocr, + skip_text=self.settings.skip_text, + redo_ocr=self.settings.redo_ocr, + progress_bar=self.settings.progress_bar, + quiet=self.settings.quiet, ) - # Get processed PDF data - processed_data = output_buffer.getvalue() - output_buffer.close() - input_buffer.close() + with open(output_temp_file.name, "rb") as f: + processed_data = f.read() - except Exception as buffer_error: - # Fallback to temporary files if BytesIO approach fails - logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") - processed_data = _preprocess_pdf_with_temp_files(file_data, filename) + return processed_data - processing_time = time.time() - start_time - logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds") + finally: + for temp_file in [input_temp_file, output_temp_file]: + if temp_file is not None: + try: + if hasattr(temp_file, "name"): + os.unlink(temp_file.name) + except OSError as e: + self.logger.warning(f"Failed to clean up temp file: {e}") - return processed_data - except Exception as e: - logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}") - return file_data +# Global preprocessor instance (can be configured via environment variables) +pdf_preprocessor = PDFPreprocessor() -def _preprocess_pdf_with_temp_files(file_data: bytes, filename: str) -> bytes: +def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes: """ - Fallback implementation using unique temporary files to avoid concurrency issues. + Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation. + + This function provides backward compatibility by using the global pdf_preprocessor instance. + For new code, consider using PDFPreprocessor directly for better configuration control. 
+ + Args: + file_data: PDF file data as bytes + filename: Original filename for logging purposes + + Returns: + Processed PDF data as bytes (or original bytes if processing fails) """ - input_temp_file = None - output_temp_file = None - - try: - # Generate unique identifiers to avoid filename collisions in concurrent jobs - unique_id = str(uuid4()) - temp_dir = tempfile.gettempdir() - - # Create input temp file with unique identifier - input_temp_file = tempfile.NamedTemporaryFile( - suffix=".pdf", prefix=f"ocr_input_{unique_id}_", dir=temp_dir, delete=False - ) - input_temp_file.write(file_data) - input_temp_file.flush() - input_temp_file.close() - - # Create output temp file with unique identifier - output_temp_file = tempfile.NamedTemporaryFile( - suffix=".pdf", prefix=f"ocr_output_{unique_id}_", dir=temp_dir, delete=False - ) - output_temp_file.close() - - # OCRmyPDF configuration for optimal processing - ocrmypdf.ocr( - input_temp_file.name, - output_temp_file.name, - language=["eng"], # Can be configured for other languages - rotate_pages=True, # Auto-rotate pages with horizontal text - deskew=True, # Fix skewed text - clean=True, # Clean up artifacts - optimize=1, # Optimize output file size - pdf_renderer="hocr", # Use hOCR for better text positioning - force_ocr=False, # Only OCR pages that need it - skip_text=False, # Don't skip existing text - redo_ocr=False, # Don't redo existing OCR - progress_bar=False, - quiet=True, - ) - - # Read processed PDF data - with open(output_temp_file.name, "rb") as f: - processed_data = f.read() - - return processed_data - - finally: - # Clean up temporary files - for temp_file in [input_temp_file, output_temp_file]: - if temp_file is not None: - try: - if hasattr(temp_file, "name"): - os.unlink(temp_file.name) - except OSError as e: - logger.warning(f"Failed to clean up temp file: {e}") + return pdf_preprocessor.preprocess(file_data, filename) diff --git a/argilla-server/src/argilla_server/jobs/document_jobs.py 
b/argilla-server/src/argilla_server/jobs/document_jobs.py index 8f3f99e90..42f7004db 100644 --- a/argilla-server/src/argilla_server/jobs/document_jobs.py +++ b/argilla-server/src/argilla_server/jobs/document_jobs.py @@ -131,7 +131,7 @@ async def upload_reference_documents_job( try: # Preprocess PDF files with OCRmyPDF for rotation and OCR - processed_file_data = preprocessing.preprocess_pdf_with_ocrmypdf( + processed_file_data = preprocessing.pdf_preprocessor.preprocess( file_data=file_data, filename=filename ) @@ -141,9 +141,9 @@ async def upload_reference_documents_job( document_id=file_document_create.id, # type: ignore file_data=processed_file_data, filename=filename, - # metadata=file_document_create.model_dump( - # include={"file_name": True, "pmid": True, "doi": True} - # ), + metadata=file_document_create.model_dump( + include={"file_name": True, "pmid": True, "doi": True} + ), ) if file_url: From b7b8a1b72929a308834fe44c04621c0dde10babf Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 4 Aug 2025 23:53:52 -0700 Subject: [PATCH 03/22] feat: add margin analysis to PDF preprocessing with opencv-python --- argilla-server/pdm.lock | 19 +- argilla-server/pyproject.toml | 3 +- .../contexts/document/preprocessing.py | 351 +++++++++++++++++- .../src/argilla_server/jobs/document_jobs.py | 13 +- 4 files changed, 364 insertions(+), 22 deletions(-) diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock index 8403ffc7e..731284e16 100644 --- a/argilla-server/pdm.lock +++ b/argilla-server/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "postgresql", "test"] strategy = [] lock_version = "4.5.0" -content_hash = "sha256:f20406357dc5b02a37c1da8689053074229d1204cb9bcb4fe8848b6d2835b1b4" +content_hash = "sha256:037bf9850aef2d48dd2d032bdac1c64e906123f5aab6cea46dff7d66d2035d37" [[metadata.targets]] requires_python = ">=3.10" @@ -1481,6 +1481,23 @@ files = [ {file = "ocrmypdf-16.10.4.tar.gz", hash = 
"sha256:de749ef5f554b63d57e68d032e7cba5500cbd5030835bf24f658f7b7a04f3dc1"}, ] +[[package]] +name = "opencv-python" +version = "4.11.0.86" +summary = "" +dependencies = [ + "numpy", +] +files = [ + {file = "opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4"}, + {file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a"}, + {file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66"}, + {file = "opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202"}, + {file = "opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d"}, + {file = "opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b"}, + {file = "opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec"}, +] + [[package]] name = "opensearch-py" version = "2.0.1" diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml index d9eae2ddc..abd8a1969 100644 --- a/argilla-server/pyproject.toml +++ b/argilla-server/pyproject.toml @@ -68,7 +68,8 @@ dependencies = [ "minio>=7.2.7", # For document processing "ocrmypdf>=16.10.4", - "pdf2image>=1.17.0" + "pdf2image>=1.17.0", + "opencv-python>=4.11.0.86" ] [project.optional-dependencies] diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py index c3047be97..a0ff0fd19 100644 --- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py +++ 
b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -18,13 +18,31 @@ import os import tempfile import time +from dataclasses import dataclass from io import BytesIO -from typing import List, Optional +from typing import Dict, List, Optional, Tuple from uuid import uuid4 +import numpy as np from pydantic import Field from pydantic_settings import BaseSettings +try: + pass + + CV2_AVAILABLE = True +except ImportError: + CV2_AVAILABLE = False + +try: + from pdf2image import convert_from_bytes + from PIL import ImageChops + from PIL.Image import Image as PILImage + + PDF2IMAGE_AVAILABLE = True +except ImportError: + PDF2IMAGE_AVAILABLE = False + try: import ocrmypdf @@ -35,6 +53,272 @@ logger = logging.getLogger(__name__) +@dataclass +class PDFProcessingResult: + """ + Result of PDF preprocessing containing both processed data and analysis metadata. + """ + + processed_data: bytes + metadata: Dict + + +class PDFAnalyzer: + """ + Analyzes PDF layout structure to detect margins, headers, footers, and other regions. + """ + + def __init__(self): + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: + """ + Analyze PDF layout to extract margin and region information. 
+ + Args: + pdf_data: PDF file data as bytes + filename: Filename for logging + + Returns: + Dictionary containing layout analysis metadata + """ + if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE): + self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis") + return {"analysis_available": False, "error": "Missing dependencies"} + + try: + # Convert PDF to images + images = convert_from_bytes(pdf_data, dpi=150) # Lower DPI for analysis + if not images: + return {"analysis_available": False, "error": "No pages found"} + + self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages") + + # Analyze layout + layout_data = self._analyze_page_layout(images) + + return { + "analysis_available": True, + "total_pages": len(images), + "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {}, + **layout_data, + } + + except Exception as e: + self.logger.error(f"PDF layout analysis failed for {filename}: {e}") + return {"analysis_available": False, "error": str(e)} + + def _analyze_page_layout(self, images: List[PILImage]) -> Dict: + """ + Analyze page layout by comparing pages to find common regions. 
+ """ + if len(images) < 2: + return self._analyze_single_page(images[0]) if images else {} + + # Use first page as reference, compare with others + reference_img = images[0].convert("RGB") + margin_data = [] + + for i in range(1, min(len(images), 5)): # Analyze up to 5 pages for efficiency + compare_img = images[i].convert("RGB") + page_margins = self._compare_pages_for_margins(reference_img, compare_img) + if page_margins: + margin_data.append(page_margins) + + # Aggregate margin data + if margin_data: + return self._aggregate_margin_data(margin_data, reference_img.size) + else: + return self._analyze_single_page(reference_img) + + def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]: + """ + Compare two pages to identify common regions (headers, footers, margins). + """ + try: + # Ensure same size + if reference.size != compare.size: + compare = compare.resize(reference.size) + + # Compute difference and create sameness mask + diff = ImageChops.difference(reference, compare) + sameness_mask = ImageChops.invert(diff.convert("L")) + + # Find horizontal bands (potential headers/footers) + horizontal_bands = self._find_horizontal_bands(sameness_mask) + + # Classify regions + regions = self._classify_regions(horizontal_bands, reference.size) + + return regions + + except Exception as e: + self.logger.debug(f"Page comparison failed: {e}") + return None + + def _find_horizontal_bands( + self, mask: PILImage, min_height: int = 15, min_ratio: float = 0.95 + ) -> List[Tuple[int, int]]: + """ + Find horizontal bands of similar content across pages. 
+ """ + mask_np = np.array(mask.convert("L")) + h, w = mask_np.shape + + # Calculate row-wise similarity + row_sums = np.sum(mask_np == 255, axis=1) / w + same_rows = row_sums >= min_ratio + + # Find contiguous bands + bands = [] + start = None + + for i, is_same in enumerate(same_rows): + if is_same and start is None: + start = i + elif not is_same and start is not None: + if i - start >= min_height: + bands.append((start, i)) + start = None + + # Handle band that extends to end + if start is not None and h - start >= min_height: + bands.append((start, h)) + + return bands + + def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict: + """ + Classify horizontal bands into headers, footers, and margins. + """ + width, height = page_size + regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}} + + for start_y, end_y in bands: + band_center = (start_y + end_y) / 2 + band_height = end_y - start_y + + # Classify based on position + if band_center < height * 0.25: # Top 25% + regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + elif band_center > height * 0.75: # Bottom 25% + regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + + # Estimate margins based on bands + regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size) + + return regions + + def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict: + """ + Estimate page margins based on detected bands. 
+ """ + width, height = page_size + margins = { + "top": 0, + "bottom": 0, + "left": 50, # Default estimates + "right": 50, + } + + # Calculate top margin from header bands + if regions["header_bands"]: + max_header_end = max(band["end_y"] for band in regions["header_bands"]) + margins["top"] = max_header_end + + # Calculate bottom margin from footer bands + if regions["footer_bands"]: + min_footer_start = min(band["start_y"] for band in regions["footer_bands"]) + margins["bottom"] = height - min_footer_start + + # Convert to relative percentages for consistency + return { + "top_px": margins["top"], + "bottom_px": margins["bottom"], + "left_px": margins["left"], + "right_px": margins["right"], + "top_percent": (margins["top"] / height) * 100, + "bottom_percent": (margins["bottom"] / height) * 100, + "left_percent": (margins["left"] / width) * 100, + "right_percent": (margins["right"] / width) * 100, + } + + def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict: + """ + Aggregate margin data from multiple page comparisons. 
+ """ + # Average the margin estimates + all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")] + + if not all_margins: + return self._analyze_single_page_size(page_size) + + # Calculate average margins + avg_margins = {} + for key in [ + "top_px", + "bottom_px", + "left_px", + "right_px", + "top_percent", + "bottom_percent", + "left_percent", + "right_percent", + ]: + values = [m.get(key, 0) for m in all_margins if key in m] + avg_margins[key] = sum(values) / len(values) if values else 0 + + # Collect all bands + all_header_bands = [] + all_footer_bands = [] + + for data in margin_data: + all_header_bands.extend(data.get("header_bands", [])) + all_footer_bands.extend(data.get("footer_bands", [])) + + return { + "layout_analysis": { + "header_bands": all_header_bands, + "footer_bands": all_footer_bands, + "estimated_margins": avg_margins, + "analysis_method": "multi_page_comparison", + } + } + + def _analyze_single_page(self, image: PILImage) -> Dict: + """ + Analyze a single page when comparison isn't possible. + """ + return self._analyze_single_page_size(image.size) + + def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict: + """ + Provide default margin estimates for single page analysis. + """ + width, height = page_size + + # Use common academic paper margins as defaults + default_margins = { + "top_px": int(height * 0.1), # 10% top margin + "bottom_px": int(height * 0.1), # 10% bottom margin + "left_px": int(width * 0.1), # 10% left margin + "right_px": int(width * 0.1), # 10% right margin + "top_percent": 10.0, + "bottom_percent": 10.0, + "left_percent": 10.0, + "right_percent": 10.0, + } + + return { + "layout_analysis": { + "header_bands": [], + "footer_bands": [], + "estimated_margins": default_margins, + "analysis_method": "default_estimates", + } + } + + class PDFPreprocessingSettings(BaseSettings): """ PDF preprocessing settings that can be configured via environment variables. 
@@ -79,6 +363,7 @@ class Config: class PDFPreprocessor: """ PDF preprocessor that uses OCRmyPDF for rotation, OCR, and optimization. + Also performs layout analysis to extract margin and structure information. Can be configured with environment variables using the PDFPreprocessingSettings. """ @@ -92,38 +377,64 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None): """ self.settings = settings or PDFPreprocessingSettings() self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + self.analyzer = PDFAnalyzer() if not self.settings.enabled: self.logger.info("PDF preprocessing is disabled via configuration") elif not OCRMYPDF_AVAILABLE: self.logger.warning("OCRmyPDF not available, PDF preprocessing will be skipped") - def preprocess(self, file_data: bytes, filename: str) -> bytes: + def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: """ - Preprocess PDF with OCRmyPDF using configured settings. + Preprocess PDF with OCRmyPDF and analyze layout structure. 
Args: file_data: PDF file data as bytes filename: Original filename for logging purposes Returns: - Processed PDF data as bytes (or original bytes if processing fails/disabled) + PDFProcessingResult containing processed data and layout analysis metadata """ + # Initialize metadata + metadata = { + "preprocessing_enabled": self.settings.enabled, + "ocrmypdf_available": OCRMYPDF_AVAILABLE, + "original_filename": filename, + "processing_timestamp": time.time(), + } + + # Handle non-PDF files or disabled preprocessing + if not filename.lower().endswith(".pdf"): + metadata["skipped_reason"] = "not_pdf" + return PDFProcessingResult(processed_data=file_data, metadata=metadata) + if not self.settings.enabled: self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}") - return file_data + metadata["skipped_reason"] = "preprocessing_disabled" + # Still run analysis on original data + layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) + metadata.update(layout_analysis) + return PDFProcessingResult(processed_data=file_data, metadata=metadata) if not OCRMYPDF_AVAILABLE: self.logger.warning("OCRmyPDF not available, skipping preprocessing") - return file_data - - if not filename.lower().endswith(".pdf"): - return file_data + metadata["skipped_reason"] = "ocrmypdf_unavailable" + # Still run analysis on original data + layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) + metadata.update(layout_analysis) + return PDFProcessingResult(processed_data=file_data, metadata=metadata) try: start_time = time.time() - self.logger.info(f"Starting OCRmyPDF preprocessing for: {filename}") + self.logger.info(f"Starting PDF preprocessing and analysis for: {filename}") + # Step 1: Analyze original PDF layout + self.logger.debug("Analyzing PDF layout structure...") + layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) + metadata.update(layout_analysis) + + # Step 2: OCR preprocessing + self.logger.debug("Starting OCRmyPDF 
processing...") try: input_buffer = BytesIO(file_data) output_buffer = BytesIO() @@ -148,18 +459,27 @@ def preprocess(self, file_data: bytes, filename: str) -> bytes: output_buffer.close() input_buffer.close() + metadata["ocr_method"] = "bytesio" + except Exception as buffer_error: self.logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") processed_data = self._preprocess_with_temp_files(file_data, filename) + metadata["ocr_method"] = "temp_files" + metadata["ocr_fallback_reason"] = str(buffer_error) processing_time = time.time() - start_time - self.logger.info(f"OCRmyPDF completed for {filename} in {processing_time:.2f} seconds") + metadata["processing_time_seconds"] = processing_time + metadata["processing_successful"] = True - return processed_data + self.logger.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds") + + return PDFProcessingResult(processed_data=processed_data, metadata=metadata) except Exception as e: - self.logger.error(f"OCRmyPDF preprocessing failed for {filename}: {e}") - return file_data + self.logger.error(f"PDF preprocessing failed for {filename}: {e}") + metadata["processing_successful"] = False + metadata["processing_error"] = str(e) + return PDFProcessingResult(processed_data=file_data, metadata=metadata) def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: """ @@ -233,4 +553,5 @@ def preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes: Returns: Processed PDF data as bytes (or original bytes if processing fails) """ - return pdf_preprocessor.preprocess(file_data, filename) + result = pdf_preprocessor.preprocess(file_data, filename) + return result.processed_data diff --git a/argilla-server/src/argilla_server/jobs/document_jobs.py b/argilla-server/src/argilla_server/jobs/document_jobs.py index 42f7004db..2c4a61cf1 100644 --- a/argilla-server/src/argilla_server/jobs/document_jobs.py +++ 
b/argilla-server/src/argilla_server/jobs/document_jobs.py @@ -130,10 +130,14 @@ async def upload_reference_documents_job( continue try: - # Preprocess PDF files with OCRmyPDF for rotation and OCR - processed_file_data = preprocessing.pdf_preprocessor.preprocess( + # Preprocess PDF files with OCRmyPDF for rotation and OCR, plus layout analysis + preprocessing_result = preprocessing.pdf_preprocessor.preprocess( file_data=file_data, filename=filename ) + processed_file_data = preprocessing_result.processed_data + + # Store preprocessing metadata in file metadata + file_metadata.update({"preprocessing": preprocessing_result.metadata}) file_url = files.put_document_file( client=client, @@ -141,9 +145,7 @@ async def upload_reference_documents_job( document_id=file_document_create.id, # type: ignore file_data=processed_file_data, filename=filename, - metadata=file_document_create.model_dump( - include={"file_name": True, "pmid": True, "doi": True} - ), + metadata=file_metadata, ) if file_url: @@ -158,6 +160,7 @@ async def upload_reference_documents_job( # Create document in database try: + file_document_create.metadata = file_metadata document = await imports.create_document(db, file_document_create) _LOGGER.info(f"Document created successfully for file {filename} with ID {document.id}") file_result.update({"success": True, "document_id": str(document.id), "status": "created"}) From 25a90981d8e9b61f0c2239306775b9b7fcd0a0ea Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Tue, 5 Aug 2025 00:05:12 -0700 Subject: [PATCH 04/22] feat: enable PDF preprocessing analysis with new configuration options --- argilla-server/.env.dev | 8 + .../contexts/document/analysis.py | 495 ++++++++++++++++++ .../contexts/document/preprocessing.py | 328 ++---------- 3 files changed, 535 insertions(+), 296 deletions(-) create mode 100644 argilla-server/src/argilla_server/contexts/document/analysis.py diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev index a1399f4f2..354fff256 100644 --- 
a/argilla-server/.env.dev +++ b/argilla-server/.env.dev @@ -20,3 +20,11 @@ ARGILLA_ELASTICSEARCH=http://localhost:9200 # Redis configuration ARGILLA_REDIS_URL=redis://localhost:6379/0 + +# PDF Preprocessing +PREPROCESSING_ENABLED=true +PREPROCESSING_ENABLE_ANALYSIS=true +PREPROCESSING_LANGUAGE='["eng"]' +PREPROCESSING_ROTATE_PAGES=true +PREPROCESSING_OPTIMIZE=1 +PREPROCESSING_QUIET=true \ No newline at end of file diff --git a/argilla-server/src/argilla_server/contexts/document/analysis.py b/argilla-server/src/argilla_server/contexts/document/analysis.py new file mode 100644 index 000000000..1f097cfe6 --- /dev/null +++ b/argilla-server/src/argilla_server/contexts/document/analysis.py @@ -0,0 +1,495 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +import numpy as np + +try: + import cv2 + + CV2_AVAILABLE = True +except ImportError: + CV2_AVAILABLE = False + +try: + from pdf2image import convert_from_bytes + from PIL import ImageChops, ImageDraw + from PIL.Image import Image as PILImage + + PDF2IMAGE_AVAILABLE = True +except ImportError: + PDF2IMAGE_AVAILABLE = False + +try: + pass + + OCRMYPDF_AVAILABLE = True +except ImportError: + OCRMYPDF_AVAILABLE = False + +logger = logging.getLogger(__name__) + + +@dataclass +class PDFProcessingResult: + """ + Result of PDF preprocessing containing both processed data and analysis metadata. + """ + + processed_data: bytes + metadata: Dict + + +def pil_to_cv(image: PILImage) -> np.ndarray: + """Convert PIL Image to OpenCV format.""" + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) + + +def classify_and_draw_layout_regions( + reference: PILImage, mask: PILImage, min_area: int = 5000, label: bool = True +) -> Tuple[PILImage, List[Dict]]: + """ + Classify and optionally draw layout regions using contour detection. 
+ + Returns: + Tuple of (annotated image, list of detected regions) + """ + if not CV2_AVAILABLE: + return reference, [] + + mask_np = np.array(mask.convert("L")) + h, w = mask_np.shape + + # Clean up the mask using morphological operations + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) + cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel) + + contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + img = reference.copy() if label else reference + regions = [] + + if label: + draw = ImageDraw.Draw(img) + + for cnt in contours: + x, y, rw, rh = cv2.boundingRect(cnt) + area = rw * rh + + if area < min_area: + continue + + cx, cy = x + rw // 2, y + rh // 2 + + # Classify region based on position + if cy < h * 0.25: + region = "header" + elif cy > h * 0.75: + region = "footer" + elif cx < w * 0.15: + region = "left_margin" + elif cx > w * 0.85: + region = "right_margin" + else: + region = "body" + + region_data = { + "type": region, + "x": x, + "y": y, + "width": rw, + "height": rh, + "area": area, + "center_x": cx, + "center_y": cy, + } + regions.append(region_data) + + if label: + draw.rectangle([x, y, x + rw, y + rh], outline="green", width=2) + draw.text((x, y - 10), region, fill="green") + + return img, regions + + +def find_horizontal_bands(mask: PILImage, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: + """Find horizontal bands of similar content across pages.""" + mask_np = np.array(mask.convert("L")) + h, w = mask_np.shape + + row_sums = np.sum(mask_np == 255, axis=1) / w # white = same + same_rows = row_sums >= min_ratio + + bands = [] + start = None + for i, val in enumerate(same_rows): + if val and start is None: + start = i + elif not val and start is not None: + if i - start >= min_height: + bands.append((start, i)) + start = None + if start is not None and h - start >= min_height: + bands.append((start, h)) + + return bands + + +class PDFAnalyzer: + """ + Analyzes PDF layout 
structure to detect margins, headers, footers, and other regions. + """ + + def __init__(self): + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: + """ + Analyze PDF layout to extract margin and region information. + + Args: + pdf_data: PDF file data as bytes + filename: Filename for logging + + Returns: + Dictionary containing layout analysis metadata + """ + if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE): + self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis") + return {"analysis_available": False, "error": "Missing dependencies"} + + try: + # Convert PDF to images + images = convert_from_bytes(pdf_data, dpi=150) # Lower DPI for analysis + if not images: + return {"analysis_available": False, "error": "No pages found"} + + self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages") + + # Analyze layout + layout_data = self._analyze_page_layout(images) + + return { + "analysis_available": True, + "total_pages": len(images), + "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {}, + **layout_data, + } + + except Exception as e: + self.logger.error(f"PDF layout analysis failed for {filename}: {e}") + return {"analysis_available": False, "error": str(e)} + + def _analyze_page_layout(self, images: List[PILImage]) -> Dict: + """ + Analyze page layout by comparing pages to find common regions. 
+ """ + if len(images) < 2: + return self._analyze_single_page(images[0]) if images else {} + + # Use first page as reference, compare with others + reference_img = images[0].convert("RGB") + margin_data = [] + + for i in range(1, min(len(images), 5)): # Analyze up to 5 pages for efficiency + compare_img = images[i].convert("RGB") + page_margins = self._compare_pages_for_margins(reference_img, compare_img) + if page_margins: + margin_data.append(page_margins) + + # Aggregate margin data + if margin_data: + return self._aggregate_margin_data(margin_data, reference_img.size) + else: + return self._analyze_single_page(reference_img) + + def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]: + """ + Compare two pages to identify common regions using advanced CV2 techniques. + """ + try: + # Ensure same size + if reference.size != compare.size: + self.logger.debug(f"Resizing page to match reference size") + compare = compare.resize(reference.size) + + # Step 1: Compute difference and invert so white = same + diff = ImageChops.difference(reference, compare) + sameness_mask = ImageChops.invert(diff.convert("L")) + + # Step 2: Threshold the mask (keep high-sameness pixels) + # Create a lookup table for thresholding + threshold = 30 + lut = [255 if i > threshold else 0 for i in range(256)] + sameness_mask.point(lut).convert("1") + + # Step 3: Find horizontal bands (potential headers/footers) + horizontal_bands = find_horizontal_bands(sameness_mask) + + # Step 4: Use contour-based region classification + annotated_img, detected_regions = classify_and_draw_layout_regions( + reference, sameness_mask, min_area=5000, label=False + ) + + # Step 5: Classify and aggregate results + regions = self._classify_regions_advanced(horizontal_bands, detected_regions, reference.size) + + return regions + + except Exception as e: + self.logger.debug(f"Page comparison failed: {e}") + return None + + def _classify_regions_advanced( + self, bands: 
List[Tuple[int, int]], detected_regions: List[Dict], page_size: Tuple[int, int] + ) -> Dict: + """ + Advanced region classification combining horizontal bands and contour detection. + """ + width, height = page_size + regions = { + "header_bands": [], + "footer_bands": [], + "detected_regions": detected_regions, + "estimated_margins": {}, + } + + # Process horizontal bands + for start_y, end_y in bands: + band_center = (start_y + end_y) / 2 + band_height = end_y - start_y + + # Classify based on position + if band_center < height * 0.25: # Top 25% + regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + elif band_center > height * 0.75: # Bottom 25% + regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + + # Estimate margins using both techniques + regions["estimated_margins"] = self._estimate_margins_advanced(regions, detected_regions, page_size) + + return regions + + def _estimate_margins_advanced( + self, regions: Dict, detected_regions: List[Dict], page_size: Tuple[int, int] + ) -> Dict: + """ + Advanced margin estimation using both band and contour information. 
+ """ + width, height = page_size + margins = { + "top": 0, + "bottom": 0, + "left": 50, # Default estimates + "right": 50, + } + + # Calculate top margin from header regions + header_sources = [] + if regions["header_bands"]: + header_sources.append(max(band["end_y"] for band in regions["header_bands"])) + + # Add header regions from contour detection + header_regions = [r for r in detected_regions if r["type"] == "header"] + if header_regions: + header_sources.append(max(r["y"] + r["height"] for r in header_regions)) + + if header_sources: + margins["top"] = max(header_sources) + + # Calculate bottom margin from footer regions + footer_sources = [] + if regions["footer_bands"]: + footer_sources.append(min(band["start_y"] for band in regions["footer_bands"])) + + # Add footer regions from contour detection + footer_regions = [r for r in detected_regions if r["type"] == "footer"] + if footer_regions: + footer_sources.append(min(r["y"] for r in footer_regions)) + + if footer_sources: + margins["bottom"] = height - min(footer_sources) + + # Calculate left/right margins from contour detection + left_regions = [r for r in detected_regions if r["type"] == "left_margin"] + if left_regions: + margins["left"] = max(r["x"] + r["width"] for r in left_regions) + + right_regions = [r for r in detected_regions if r["type"] == "right_margin"] + if right_regions: + margins["right"] = width - min(r["x"] for r in right_regions) + + # Convert to relative percentages for consistency + return { + "top_px": margins["top"], + "bottom_px": margins["bottom"], + "left_px": margins["left"], + "right_px": margins["right"], + "top_percent": (margins["top"] / height) * 100 if height > 0 else 0, + "bottom_percent": (margins["bottom"] / height) * 100 if height > 0 else 0, + "left_percent": (margins["left"] / width) * 100 if width > 0 else 0, + "right_percent": (margins["right"] / width) * 100 if width > 0 else 0, + } + + def _classify_regions(self, bands: List[Tuple[int, int]], page_size: 
Tuple[int, int]) -> Dict: + """ + Classify horizontal bands into headers, footers, and margins. + """ + width, height = page_size + regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}} + + for start_y, end_y in bands: + band_center = (start_y + end_y) / 2 + band_height = end_y - start_y + + # Classify based on position + if band_center < height * 0.25: # Top 25% + regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + elif band_center > height * 0.75: # Bottom 25% + regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + + # Estimate margins based on bands + regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size) + + return regions + + def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict: + """ + Estimate page margins based on detected bands. + """ + width, height = page_size + margins = { + "top": 0, + "bottom": 0, + "left": 50, # Default estimates + "right": 50, + } + + # Calculate top margin from header bands + if regions["header_bands"]: + max_header_end = max(band["end_y"] for band in regions["header_bands"]) + margins["top"] = max_header_end + + # Calculate bottom margin from footer bands + if regions["footer_bands"]: + min_footer_start = min(band["start_y"] for band in regions["footer_bands"]) + margins["bottom"] = height - min_footer_start + + # Convert to relative percentages for consistency + return { + "top_px": margins["top"], + "bottom_px": margins["bottom"], + "left_px": margins["left"], + "right_px": margins["right"], + "top_percent": (margins["top"] / height) * 100, + "bottom_percent": (margins["bottom"] / height) * 100, + "left_percent": (margins["left"] / width) * 100, + "right_percent": (margins["right"] / width) * 100, + } + + def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict: + """ + Aggregate margin data from multiple page comparisons. 
+ """ + # Average the margin estimates + all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")] + + if not all_margins: + return self._analyze_single_page_size(page_size) + + # Calculate average margins + avg_margins = {} + for key in [ + "top_px", + "bottom_px", + "left_px", + "right_px", + "top_percent", + "bottom_percent", + "left_percent", + "right_percent", + ]: + values = [m.get(key, 0) for m in all_margins if key in m] + avg_margins[key] = sum(values) / len(values) if values else 0 + + # Collect all bands and regions + all_header_bands = [] + all_footer_bands = [] + all_detected_regions = [] + + for data in margin_data: + all_header_bands.extend(data.get("header_bands", [])) + all_footer_bands.extend(data.get("footer_bands", [])) + all_detected_regions.extend(data.get("detected_regions", [])) + + # Aggregate detected regions by type + region_stats = {} + for region in all_detected_regions: + region_type = region["type"] + if region_type not in region_stats: + region_stats[region_type] = [] + region_stats[region_type].append(region) + + return { + "layout_analysis": { + "header_bands": all_header_bands, + "footer_bands": all_footer_bands, + "detected_regions": all_detected_regions, + "region_statistics": { + region_type: { + "count": len(regions), + "avg_area": sum(r["area"] for r in regions) / len(regions) if regions else 0, + "total_area": sum(r["area"] for r in regions), + } + for region_type, regions in region_stats.items() + }, + "estimated_margins": avg_margins, + "analysis_method": "multi_page_comparison_advanced", + } + } + + def _analyze_single_page(self, image: PILImage) -> Dict: + """ + Analyze a single page when comparison isn't possible. + """ + return self._analyze_single_page_size(image.size) + + def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict: + """ + Provide default margin estimates for single page analysis. 
+ """ + width, height = page_size + + # Use common academic paper margins as defaults + default_margins = { + "top_px": int(height * 0.1), # 10% top margin + "bottom_px": int(height * 0.1), # 10% bottom margin + "left_px": int(width * 0.1), # 10% left margin + "right_px": int(width * 0.1), # 10% right margin + "top_percent": 10.0, + "bottom_percent": 10.0, + "left_percent": 10.0, + "right_percent": 10.0, + } + + return { + "layout_analysis": { + "header_bands": [], + "footer_bands": [], + "estimated_margins": default_margins, + "analysis_method": "default_estimates", + } + } diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py index a0ff0fd19..5395a7fc3 100644 --- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py +++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -18,31 +18,13 @@ import os import tempfile import time -from dataclasses import dataclass from io import BytesIO -from typing import Dict, List, Optional, Tuple +from typing import List, Optional from uuid import uuid4 -import numpy as np from pydantic import Field from pydantic_settings import BaseSettings -try: - pass - - CV2_AVAILABLE = True -except ImportError: - CV2_AVAILABLE = False - -try: - from pdf2image import convert_from_bytes - from PIL import ImageChops - from PIL.Image import Image as PILImage - - PDF2IMAGE_AVAILABLE = True -except ImportError: - PDF2IMAGE_AVAILABLE = False - try: import ocrmypdf @@ -50,273 +32,12 @@ except ImportError: OCRMYPDF_AVAILABLE = False -logger = logging.getLogger(__name__) - - -@dataclass -class PDFProcessingResult: - """ - Result of PDF preprocessing containing both processed data and analysis metadata. - """ - - processed_data: bytes - metadata: Dict - - -class PDFAnalyzer: - """ - Analyzes PDF layout structure to detect margins, headers, footers, and other regions. 
- """ - - def __init__(self): - self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") - - def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: - """ - Analyze PDF layout to extract margin and region information. - - Args: - pdf_data: PDF file data as bytes - filename: Filename for logging - - Returns: - Dictionary containing layout analysis metadata - """ - if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE): - self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis") - return {"analysis_available": False, "error": "Missing dependencies"} - - try: - # Convert PDF to images - images = convert_from_bytes(pdf_data, dpi=150) # Lower DPI for analysis - if not images: - return {"analysis_available": False, "error": "No pages found"} - - self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages") - - # Analyze layout - layout_data = self._analyze_page_layout(images) - - return { - "analysis_available": True, - "total_pages": len(images), - "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {}, - **layout_data, - } - - except Exception as e: - self.logger.error(f"PDF layout analysis failed for {filename}: {e}") - return {"analysis_available": False, "error": str(e)} - - def _analyze_page_layout(self, images: List[PILImage]) -> Dict: - """ - Analyze page layout by comparing pages to find common regions. 
- """ - if len(images) < 2: - return self._analyze_single_page(images[0]) if images else {} - - # Use first page as reference, compare with others - reference_img = images[0].convert("RGB") - margin_data = [] - - for i in range(1, min(len(images), 5)): # Analyze up to 5 pages for efficiency - compare_img = images[i].convert("RGB") - page_margins = self._compare_pages_for_margins(reference_img, compare_img) - if page_margins: - margin_data.append(page_margins) - - # Aggregate margin data - if margin_data: - return self._aggregate_margin_data(margin_data, reference_img.size) - else: - return self._analyze_single_page(reference_img) - - def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]: - """ - Compare two pages to identify common regions (headers, footers, margins). - """ - try: - # Ensure same size - if reference.size != compare.size: - compare = compare.resize(reference.size) - - # Compute difference and create sameness mask - diff = ImageChops.difference(reference, compare) - sameness_mask = ImageChops.invert(diff.convert("L")) - - # Find horizontal bands (potential headers/footers) - horizontal_bands = self._find_horizontal_bands(sameness_mask) - - # Classify regions - regions = self._classify_regions(horizontal_bands, reference.size) - - return regions - - except Exception as e: - self.logger.debug(f"Page comparison failed: {e}") - return None - - def _find_horizontal_bands( - self, mask: PILImage, min_height: int = 15, min_ratio: float = 0.95 - ) -> List[Tuple[int, int]]: - """ - Find horizontal bands of similar content across pages. 
- """ - mask_np = np.array(mask.convert("L")) - h, w = mask_np.shape - - # Calculate row-wise similarity - row_sums = np.sum(mask_np == 255, axis=1) / w - same_rows = row_sums >= min_ratio - - # Find contiguous bands - bands = [] - start = None - - for i, is_same in enumerate(same_rows): - if is_same and start is None: - start = i - elif not is_same and start is not None: - if i - start >= min_height: - bands.append((start, i)) - start = None - - # Handle band that extends to end - if start is not None and h - start >= min_height: - bands.append((start, h)) - - return bands - - def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict: - """ - Classify horizontal bands into headers, footers, and margins. - """ - width, height = page_size - regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}} - - for start_y, end_y in bands: - band_center = (start_y + end_y) / 2 - band_height = end_y - start_y - - # Classify based on position - if band_center < height * 0.25: # Top 25% - regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) - elif band_center > height * 0.75: # Bottom 25% - regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) - - # Estimate margins based on bands - regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size) - - return regions - - def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict: - """ - Estimate page margins based on detected bands. 
- """ - width, height = page_size - margins = { - "top": 0, - "bottom": 0, - "left": 50, # Default estimates - "right": 50, - } - - # Calculate top margin from header bands - if regions["header_bands"]: - max_header_end = max(band["end_y"] for band in regions["header_bands"]) - margins["top"] = max_header_end - - # Calculate bottom margin from footer bands - if regions["footer_bands"]: - min_footer_start = min(band["start_y"] for band in regions["footer_bands"]) - margins["bottom"] = height - min_footer_start - - # Convert to relative percentages for consistency - return { - "top_px": margins["top"], - "bottom_px": margins["bottom"], - "left_px": margins["left"], - "right_px": margins["right"], - "top_percent": (margins["top"] / height) * 100, - "bottom_percent": (margins["bottom"] / height) * 100, - "left_percent": (margins["left"] / width) * 100, - "right_percent": (margins["right"] / width) * 100, - } - - def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict: - """ - Aggregate margin data from multiple page comparisons. 
- """ - # Average the margin estimates - all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")] - - if not all_margins: - return self._analyze_single_page_size(page_size) - - # Calculate average margins - avg_margins = {} - for key in [ - "top_px", - "bottom_px", - "left_px", - "right_px", - "top_percent", - "bottom_percent", - "left_percent", - "right_percent", - ]: - values = [m.get(key, 0) for m in all_margins if key in m] - avg_margins[key] = sum(values) / len(values) if values else 0 - - # Collect all bands - all_header_bands = [] - all_footer_bands = [] - - for data in margin_data: - all_header_bands.extend(data.get("header_bands", [])) - all_footer_bands.extend(data.get("footer_bands", [])) - - return { - "layout_analysis": { - "header_bands": all_header_bands, - "footer_bands": all_footer_bands, - "estimated_margins": avg_margins, - "analysis_method": "multi_page_comparison", - } - } - - def _analyze_single_page(self, image: PILImage) -> Dict: - """ - Analyze a single page when comparison isn't possible. - """ - return self._analyze_single_page_size(image.size) - - def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict: - """ - Provide default margin estimates for single page analysis. 
- """ - width, height = page_size - - # Use common academic paper margins as defaults - default_margins = { - "top_px": int(height * 0.1), # 10% top margin - "bottom_px": int(height * 0.1), # 10% bottom margin - "left_px": int(width * 0.1), # 10% left margin - "right_px": int(width * 0.1), # 10% right margin - "top_percent": 10.0, - "bottom_percent": 10.0, - "left_percent": 10.0, - "right_percent": 10.0, - } +try: + from argilla_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult - return { - "layout_analysis": { - "header_bands": [], - "footer_bands": [], - "estimated_margins": default_margins, - "analysis_method": "default_estimates", - } - } + ANALYSIS_AVAILABLE = True +except ImportError: + ANALYSIS_AVAILABLE = False class PDFPreprocessingSettings(BaseSettings): @@ -356,6 +77,9 @@ class PDFPreprocessingSettings(BaseSettings): quiet: bool = Field(default=True, description="Suppress OCRmyPDF output messages") + # Analysis settings + enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection") + class Config: env_prefix = "PREPROCESSING_" @@ -377,7 +101,14 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None): """ self.settings = settings or PDFPreprocessingSettings() self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") - self.analyzer = PDFAnalyzer() + + # Initialize analyzer if available and enabled + if self.settings.enable_analysis and ANALYSIS_AVAILABLE: + self.analyzer = PDFAnalyzer() + else: + self.analyzer = None + if self.settings.enable_analysis and not ANALYSIS_AVAILABLE: + self.logger.warning("PDF analysis is enabled but dependencies are not available") if not self.settings.enabled: self.logger.info("PDF preprocessing is disabled via configuration") @@ -411,27 +142,32 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: if not self.settings.enabled: self.logger.debug(f"PDF preprocessing disabled, skipping: 
{filename}") metadata["skipped_reason"] = "preprocessing_disabled" - # Still run analysis on original data - layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) - metadata.update(layout_analysis) + # Still run analysis on original data if enabled and available + if self.analyzer: + layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) + metadata.update(layout_analysis) return PDFProcessingResult(processed_data=file_data, metadata=metadata) if not OCRMYPDF_AVAILABLE: self.logger.warning("OCRmyPDF not available, skipping preprocessing") metadata["skipped_reason"] = "ocrmypdf_unavailable" - # Still run analysis on original data - layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) - metadata.update(layout_analysis) + # Still run analysis on original data if enabled and available + if self.analyzer: + layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) + metadata.update(layout_analysis) return PDFProcessingResult(processed_data=file_data, metadata=metadata) try: start_time = time.time() self.logger.info(f"Starting PDF preprocessing and analysis for: {filename}") - # Step 1: Analyze original PDF layout - self.logger.debug("Analyzing PDF layout structure...") - layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) - metadata.update(layout_analysis) + # Step 1: Analyze original PDF layout (if enabled and available) + if self.analyzer: + self.logger.debug("Analyzing PDF layout structure...") + layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) + metadata.update(layout_analysis) + else: + metadata.update({"analysis_available": False, "analysis_skipped": "disabled_or_unavailable"}) # Step 2: OCR preprocessing self.logger.debug("Starting OCRmyPDF processing...") From ab7f39b06590e13a0f2dd75b005d995fc0c1ac51 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Tue, 5 Aug 2025 10:02:00 -0700 Subject: [PATCH 05/22] feat: update PDF preprocessing settings and add new 
document analysis schemas --- argilla-server/.env.dev | 3 ++- .../api/schemas/v1/document/analysis.py | 14 ++++++++++++++ .../api/schemas/v1/{ => document}/segments.py | 0 .../contexts/document/preprocessing.py | 2 +- 4 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py rename argilla-server/src/argilla_server/api/schemas/v1/{ => document}/segments.py (100%) diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev index 354fff256..024d40e71 100644 --- a/argilla-server/.env.dev +++ b/argilla-server/.env.dev @@ -23,8 +23,9 @@ ARGILLA_REDIS_URL=redis://localhost:6379/0 # PDF Preprocessing PREPROCESSING_ENABLED=true -PREPROCESSING_ENABLE_ANALYSIS=true +PREPROCESSING_ENABLE_ANALYSIS=false PREPROCESSING_LANGUAGE='["eng"]' PREPROCESSING_ROTATE_PAGES=true PREPROCESSING_OPTIMIZE=1 +PREPROCESSING_CLEAN=false PREPROCESSING_QUIET=true \ No newline at end of file diff --git a/argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py b/argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py new file mode 100644 index 000000000..fb5dffc96 --- /dev/null +++ b/argilla-server/src/argilla_server/api/schemas/v1/document/analysis.py @@ -0,0 +1,14 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/argilla-server/src/argilla_server/api/schemas/v1/segments.py b/argilla-server/src/argilla_server/api/schemas/v1/document/segments.py similarity index 100% rename from argilla-server/src/argilla_server/api/schemas/v1/segments.py rename to argilla-server/src/argilla_server/api/schemas/v1/document/segments.py diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py index 5395a7fc3..9b72183f4 100644 --- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py +++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -59,7 +59,7 @@ class PDFPreprocessingSettings(BaseSettings): deskew: bool = Field(default=True, description="Fix skewed text") - clean: bool = Field(default=True, description="Clean up artifacts") + clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts") optimize: int = Field( default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)" From 1a67fbc521689f78e4de9c91412e81279ca02100 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Tue, 5 Aug 2025 10:40:06 -0700 Subject: [PATCH 06/22] feat: add new preprocessing options for Tesseract timeout and text skipping in PDF settings --- argilla-server/.env.dev | 2 + .../contexts/document/preprocessing.py | 120 +++++++----------- 2 files changed, 51 insertions(+), 71 deletions(-) diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev index 024d40e71..6c9ad1694 100644 --- a/argilla-server/.env.dev +++ b/argilla-server/.env.dev @@ -28,4 +28,6 @@ PREPROCESSING_LANGUAGE='["eng"]' PREPROCESSING_ROTATE_PAGES=true PREPROCESSING_OPTIMIZE=1 PREPROCESSING_CLEAN=false +PREPROCESSING_SKIP_TEXT=true +PREPROCESSING_TESSERACT_TIMEOUT=0 PREPROCESSING_QUIET=true \ No newline at end of file diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py 
b/argilla-server/src/argilla_server/contexts/document/preprocessing.py index 9b72183f4..085daf6e5 100644 --- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py +++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -39,6 +39,8 @@ except ImportError: ANALYSIS_AVAILABLE = False +_LOGGER = logging.getLogger(__name__) + class PDFPreprocessingSettings(BaseSettings): """ @@ -47,6 +49,9 @@ class PDFPreprocessingSettings(BaseSettings): All settings have the PREPROCESSING_ prefix. """ + class Config: + env_prefix = "PREPROCESSING_" + enabled: bool = Field( default=True, description="Enable PDF preprocessing with OCRmyPDF. Set to False to disable all processing." ) @@ -57,7 +62,7 @@ class PDFPreprocessingSettings(BaseSettings): rotate_pages: bool = Field(default=True, description="Auto-rotate pages with horizontal text") - deskew: bool = Field(default=True, description="Fix skewed text") + deskew: bool = Field(default=False, description="Fix skewed text") clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts") @@ -69,19 +74,44 @@ class PDFPreprocessingSettings(BaseSettings): force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text") - skip_text: bool = Field(default=False, description="Skip text-based operations (OCR only for images)") + tesseract_timeout: int = Field( + default=0, description="Timeout for Tesseract OCR processing in seconds (0 for no timeout)" + ) + + skip_text: bool = Field(default=True, description="Skip text-based operations (OCR only for images)") redo_ocr: bool = Field(default=False, description="Redo OCR on pages that already have OCR") progress_bar: bool = Field(default=False, description="Show progress bar during processing") - quiet: bool = Field(default=True, description="Suppress OCRmyPDF output messages") - - # Analysis settings enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin 
detection") - class Config: - env_prefix = "PREPROCESSING_" + output_type: str = Field( + default="pdf", + description="Output type for OCRmyPDF. Set to 'pdf' to skip PDF/A conversion.", + ) + + def get_ocrmypdf_args(self) -> dict: + """ + Get OCRmyPDF arguments as a dictionary for use with **kwargs. + + Returns: + Dictionary of OCRmyPDF arguments excluding input/output parameters. + """ + return { + "language": self.language, + "rotate_pages": self.rotate_pages, + "deskew": self.deskew, + "clean": self.clean, + "optimize": self.optimize, + "pdf_renderer": self.pdf_renderer, + "force_ocr": self.force_ocr, + "skip_text": self.skip_text, + "tesseract_timeout": self.tesseract_timeout, + "redo_ocr": self.redo_ocr, + "progress_bar": self.progress_bar, + "output_type": self.output_type, # skip PDF/A conversion + } class PDFPreprocessor: @@ -100,7 +130,6 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None): settings: Optional PDFPreprocessingSettings instance. If None, loads from environment. 
""" self.settings = settings or PDFPreprocessingSettings() - self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") # Initialize analyzer if available and enabled if self.settings.enable_analysis and ANALYSIS_AVAILABLE: @@ -108,12 +137,12 @@ def __init__(self, settings: Optional[PDFPreprocessingSettings] = None): else: self.analyzer = None if self.settings.enable_analysis and not ANALYSIS_AVAILABLE: - self.logger.warning("PDF analysis is enabled but dependencies are not available") + _LOGGER.warning("PDF analysis is enabled but dependencies are not available") if not self.settings.enabled: - self.logger.info("PDF preprocessing is disabled via configuration") + _LOGGER.info("PDF preprocessing is disabled via configuration") elif not OCRMYPDF_AVAILABLE: - self.logger.warning("OCRmyPDF not available, PDF preprocessing will be skipped") + _LOGGER.warning("OCRmyPDF not available, PDF preprocessing will be skipped") def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: """ @@ -126,31 +155,20 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: Returns: PDFProcessingResult containing processed data and layout analysis metadata """ - # Initialize metadata - metadata = { - "preprocessing_enabled": self.settings.enabled, - "ocrmypdf_available": OCRMYPDF_AVAILABLE, - "original_filename": filename, - "processing_timestamp": time.time(), - } + metadata = {} # Handle non-PDF files or disabled preprocessing if not filename.lower().endswith(".pdf"): - metadata["skipped_reason"] = "not_pdf" return PDFProcessingResult(processed_data=file_data, metadata=metadata) if not self.settings.enabled: - self.logger.debug(f"PDF preprocessing disabled, skipping: {filename}") - metadata["skipped_reason"] = "preprocessing_disabled" - # Still run analysis on original data if enabled and available if self.analyzer: layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) metadata.update(layout_analysis) return 
PDFProcessingResult(processed_data=file_data, metadata=metadata) if not OCRMYPDF_AVAILABLE: - self.logger.warning("OCRmyPDF not available, skipping preprocessing") - metadata["skipped_reason"] = "ocrmypdf_unavailable" + _LOGGER.warning("OCRmyPDF not available, skipping preprocessing") # Still run analysis on original data if enabled and available if self.analyzer: layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) @@ -159,62 +177,36 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: try: start_time = time.time() - self.logger.info(f"Starting PDF preprocessing and analysis for: {filename}") # Step 1: Analyze original PDF layout (if enabled and available) if self.analyzer: - self.logger.debug("Analyzing PDF layout structure...") layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) metadata.update(layout_analysis) - else: - metadata.update({"analysis_available": False, "analysis_skipped": "disabled_or_unavailable"}) # Step 2: OCR preprocessing - self.logger.debug("Starting OCRmyPDF processing...") try: input_buffer = BytesIO(file_data) output_buffer = BytesIO() - ocrmypdf.ocr( - input_buffer, - output_buffer, - language=self.settings.language, - rotate_pages=self.settings.rotate_pages, - deskew=self.settings.deskew, - clean=self.settings.clean, - optimize=self.settings.optimize, - pdf_renderer=self.settings.pdf_renderer, - force_ocr=self.settings.force_ocr, - skip_text=self.settings.skip_text, - redo_ocr=self.settings.redo_ocr, - progress_bar=self.settings.progress_bar, - quiet=self.settings.quiet, - ) + ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args()) processed_data = output_buffer.getvalue() output_buffer.close() input_buffer.close() - metadata["ocr_method"] = "bytesio" - except Exception as buffer_error: - self.logger.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") + _LOGGER.debug(f"BytesIO approach failed for {filename}, 
falling back to temp files: {buffer_error}") processed_data = self._preprocess_with_temp_files(file_data, filename) - metadata["ocr_method"] = "temp_files" - metadata["ocr_fallback_reason"] = str(buffer_error) processing_time = time.time() - start_time metadata["processing_time_seconds"] = processing_time - metadata["processing_successful"] = True - - self.logger.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds") + print(metadata) + _LOGGER.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds") return PDFProcessingResult(processed_data=processed_data, metadata=metadata) except Exception as e: - self.logger.error(f"PDF preprocessing failed for {filename}: {e}") - metadata["processing_successful"] = False - metadata["processing_error"] = str(e) + _LOGGER.error(f"PDF preprocessing failed for {filename}: {e}") return PDFProcessingResult(processed_data=file_data, metadata=metadata) def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: @@ -240,21 +232,7 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: ) output_temp_file.close() - ocrmypdf.ocr( - input_temp_file.name, - output_temp_file.name, - language=self.settings.language, - rotate_pages=self.settings.rotate_pages, - deskew=self.settings.deskew, - clean=self.settings.clean, - optimize=self.settings.optimize, - pdf_renderer=self.settings.pdf_renderer, - force_ocr=self.settings.force_ocr, - skip_text=self.settings.skip_text, - redo_ocr=self.settings.redo_ocr, - progress_bar=self.settings.progress_bar, - quiet=self.settings.quiet, - ) + ocrmypdf.ocr(input_temp_file.name, output_temp_file.name, **self.settings.get_ocrmypdf_args()) with open(output_temp_file.name, "rb") as f: processed_data = f.read() @@ -268,7 +246,7 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: if hasattr(temp_file, "name"): os.unlink(temp_file.name) except OSError as e: - 
self.logger.warning(f"Failed to clean up temp file: {e}") + _LOGGER.warning(f"Failed to clean up temp file: {e}") # Global preprocessor instance (can be configured via environment variables) From 3f84066f7870430a4a3a85e16d70a0077ad54da6 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Tue, 5 Aug 2025 10:50:01 -0700 Subject: [PATCH 07/22] feat: introduce rotate pages threshold in PDF preprocessing settings and update Tesseract timeout description --- argilla-server/.env.dev | 3 ++- .../argilla_server/contexts/document/preprocessing.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/argilla-server/.env.dev b/argilla-server/.env.dev index 6c9ad1694..2122a42cf 100644 --- a/argilla-server/.env.dev +++ b/argilla-server/.env.dev @@ -26,8 +26,9 @@ PREPROCESSING_ENABLED=true PREPROCESSING_ENABLE_ANALYSIS=false PREPROCESSING_LANGUAGE='["eng"]' PREPROCESSING_ROTATE_PAGES=true +PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 PREPROCESSING_OPTIMIZE=1 PREPROCESSING_CLEAN=false PREPROCESSING_SKIP_TEXT=true -PREPROCESSING_TESSERACT_TIMEOUT=0 +# PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR PREPROCESSING_QUIET=true \ No newline at end of file diff --git a/argilla-server/src/argilla_server/contexts/document/preprocessing.py b/argilla-server/src/argilla_server/contexts/document/preprocessing.py index 085daf6e5..88601eb24 100644 --- a/argilla-server/src/argilla_server/contexts/document/preprocessing.py +++ b/argilla-server/src/argilla_server/contexts/document/preprocessing.py @@ -62,6 +62,11 @@ class Config: rotate_pages: bool = Field(default=True, description="Auto-rotate pages with horizontal text") + rotate_pages_threshold: float = Field( + default=2.0, + description="Threshold for auto-rotation", + ) + deskew: bool = Field(default=False, description="Fix skewed text") clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts") @@ -75,7 +80,7 @@ class Config: force_ocr: bool = Field(default=False, 
description="Force OCR on all pages, even if they already have text") tesseract_timeout: int = Field( - default=0, description="Timeout for Tesseract OCR processing in seconds (0 for no timeout)" + default=0, description="Timeout for Tesseract OCR processing in seconds (0 to skip Tesseract OCR)" ) skip_text: bool = Field(default=True, description="Skip text-based operations (OCR only for images)") @@ -101,6 +106,7 @@ def get_ocrmypdf_args(self) -> dict: return { "language": self.language, "rotate_pages": self.rotate_pages, + "rotate_pages_threshold": self.rotate_pages_threshold, "deskew": self.deskew, "clean": self.clean, "optimize": self.optimize, @@ -110,7 +116,7 @@ def get_ocrmypdf_args(self) -> dict: "tesseract_timeout": self.tesseract_timeout, "redo_ocr": self.redo_ocr, "progress_bar": self.progress_bar, - "output_type": self.output_type, # skip PDF/A conversion + "output_type": self.output_type, } From b69e49e4dd3963db4e8a3ed267251d787d7bccc8 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Wed, 6 Aug 2025 16:16:50 -0700 Subject: [PATCH 08/22] merge conflicts --- extralit-server/.env.dev | 12 ++++++++++++ .../contexts/document/preprocessing.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev index 9617b610b..51ec548b0 100644 --- a/extralit-server/.env.dev +++ b/extralit-server/.env.dev @@ -20,3 +20,15 @@ EXTRALIT_ELASTICSEARCH=http://localhost:9200 # Redis configuration EXTRALIT_REDIS_URL=redis://localhost:6379/0 + +# PDF Preprocessing +PREPROCESSING_ENABLED=true +PREPROCESSING_ENABLE_ANALYSIS=false +PREPROCESSING_LANGUAGE='["eng"]' +PREPROCESSING_ROTATE_PAGES=true +PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 +PREPROCESSING_OPTIMIZE=1 +PREPROCESSING_CLEAN=false +PREPROCESSING_SKIP_TEXT=true +# PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR +PREPROCESSING_QUIET=true diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py 
b/extralit-server/src/extralit_server/contexts/document/preprocessing.py index 88601eb24..b23f03fe9 100644 --- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py +++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py @@ -33,7 +33,7 @@ OCRMYPDF_AVAILABLE = False try: - from argilla_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult + from extralit_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult ANALYSIS_AVAILABLE = True except ImportError: From 713c10749a61c7e462bc595ec49bc6ee6457d538 Mon Sep 17 00:00:00 2001 From: Priyankesh Date: Fri, 8 Aug 2025 21:07:39 +0530 Subject: [PATCH 09/22] initial local commit --- extralit-server/.env.dev | 2 +- extralit-server/pdm.lock | 168 ++++++++++++++++++++------------- extralit-server/pyproject.toml | 2 +- 3 files changed, 104 insertions(+), 68 deletions(-) diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev index 51ec548b0..234e8f829 100644 --- a/extralit-server/.env.dev +++ b/extralit-server/.env.dev @@ -1,7 +1,7 @@ OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS ALEMBIC_CONFIG=src/extralit_server/alembic.ini EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded -EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False +EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE}/.extralit/extralit-dev.db?check_same_thread=False HF_HUB_DISABLE_TELEMETRY=1 # S3 Configuration (skipped to use LocalFileStorage) diff --git a/extralit-server/pdm.lock b/extralit-server/pdm.lock index 731284e16..a52eb2764 100644 --- a/extralit-server/pdm.lock +++ b/extralit-server/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "postgresql", "test"] strategy = [] lock_version = "4.5.0" -content_hash = "sha256:037bf9850aef2d48dd2d032bdac1c64e906123f5aab6cea46dff7d66d2035d37" 
+content_hash = "sha256:b81b48f68a21fcdb9fe67c3d94a30419667da306d05bde66d807bcf4bb51858e" [[metadata.targets]] requires_python = ">=3.10" @@ -1682,69 +1682,105 @@ files = [ [[package]] name = "pillow" -version = "11.0.0" -summary = "" -files = [ - {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"}, - {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a65149d8ada1055029fcb665452b2814fe7d7082fcb0c5bed6db851cb69b2086"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88a58d8ac0cc0e7f3a014509f0455248a76629ca9b604eca7dc5927cc593c5e9"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c26845094b1af3c91852745ae78e3ea47abf3dbcd1cf962f16b9a5fbe3ee8488"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1a61b54f87ab5786b8479f81c4b11f4d61702830354520837f8cc791ebba0f5f"}, - {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:674629ff60030d144b7bca2b8330225a9b11c482ed408813924619c6f302fdbb"}, - {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:598b4e238f13276e0008299bd2482003f48158e2b11826862b1eb2ad7c768b97"}, - {file = "pillow-11.0.0-cp310-cp310-win32.whl", hash = "sha256:9a0f748eaa434a41fccf8e1ee7a3eed68af1b690e75328fd7a60af123c193b50"}, - {file = "pillow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a5629742881bcbc1f42e840af185fd4d83a5edeb96475a575f4da50d6ede337c"}, - {file = "pillow-11.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:ee217c198f2e41f184f3869f3e485557296d505b5195c513b2bfe0062dc537f1"}, - {file = "pillow-11.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = 
"sha256:1c1d72714f429a521d8d2d018badc42414c3077eb187a59579f28e4270b4b0fc"}, - {file = "pillow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:499c3a1b0d6fc8213519e193796eb1a86a1be4b1877d678b30f83fd979811d1a"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8b2351c85d855293a299038e1f89db92a2f35e8d2f783489c6f0b2b5f3fe8a3"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f4dba50cfa56f910241eb7f883c20f1e7b1d8f7d91c750cd0b318bad443f4d5"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:5ddbfd761ee00c12ee1be86c9c0683ecf5bb14c9772ddbd782085779a63dd55b"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:45c566eb10b8967d71bf1ab8e4a525e5a93519e29ea071459ce517f6b903d7fa"}, - {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b4fd7bd29610a83a8c9b564d457cf5bd92b4e11e79a4ee4716a63c959699b306"}, - {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cb929ca942d0ec4fac404cbf520ee6cac37bf35be479b970c4ffadf2b6a1cad9"}, - {file = "pillow-11.0.0-cp311-cp311-win32.whl", hash = "sha256:006bcdd307cc47ba43e924099a038cbf9591062e6c50e570819743f5607404f5"}, - {file = "pillow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:52a2d8323a465f84faaba5236567d212c3668f2ab53e1c74c15583cf507a0291"}, - {file = "pillow-11.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:16095692a253047fe3ec028e951fa4221a1f3ed3d80c397e83541a3037ff67c9"}, - {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"}, - {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"}, - {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"}, - {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"}, - {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"}, - {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"}, - {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"}, - {file = "pillow-11.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:bcd1fb5bb7b07f64c15618c89efcc2cfa3e95f0e3bcdbaf4642509de1942a699"}, - {file = "pillow-11.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0e038b0745997c7dcaae350d35859c9715c71e92ffb7e0f4a8e8a16732150f38"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ae08bd8ffc41aebf578c2af2f9d8749d91f448b3bfd41d7d9ff573d74f2a6b2"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d69bfd8ec3219ae71bcde1f942b728903cad25fafe3100ba2258b973bd2bc1b2"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = 
"sha256:61b887f9ddba63ddf62fd02a3ba7add935d053b6dd7d58998c630e6dbade8527"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:c6a660307ca9d4867caa8d9ca2c2658ab685de83792d1876274991adec7b93fa"}, - {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:73e3a0200cdda995c7e43dd47436c1548f87a30bb27fb871f352a22ab8dcf45f"}, - {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fba162b8872d30fea8c52b258a542c5dfd7b235fb5cb352240c8d63b414013eb"}, - {file = "pillow-11.0.0-cp313-cp313-win32.whl", hash = "sha256:f1b82c27e89fffc6da125d5eb0ca6e68017faf5efc078128cfaa42cf5cb38798"}, - {file = "pillow-11.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ba470552b48e5835f1d23ecb936bb7f71d206f9dfeee64245f30c3270b994de"}, - {file = "pillow-11.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:846e193e103b41e984ac921b335df59195356ce3f71dcfd155aa79c603873b84"}, - {file = "pillow-11.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4ad70c4214f67d7466bea6a08061eba35c01b1b89eaa098040a35272a8efb22b"}, - {file = "pillow-11.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ec0d5af64f2e3d64a165f490d96368bb5dea8b8f9ad04487f9ab60dc4bb6003"}, - {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c809a70e43c7977c4a42aefd62f0131823ebf7dd73556fa5d5950f5b354087e2"}, - {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4b60c9520f7207aaf2e1d94de026682fc227806c6e1f55bba7606d1c94dd623a"}, - {file = "pillow-11.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1e2688958a840c822279fda0086fec1fdab2f95bf2b717b66871c4ad9859d7e8"}, - {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"}, - {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"}, - {file = 
"pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a0d3b115009ebb8ac3d2ebec5c2982cc693da935f4ab7bb5c8ebe2f47d36f2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"}, - {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, +version = "11.3.0" +requires_python = ">=3.9" +summary = "Python Imaging Library (Fork)" +files = [ + {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"}, + {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7107195ddc914f656c7fc8e4a5e1c25f32e9236ea3ea860f257b0436011fddd0"}, + {file = 
"pillow-11.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc3e831b563b3114baac7ec2ee86819eb03caa1a2cef0b481a5675b59c4fe23b"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f182ebd2303acf8c380a54f615ec883322593320a9b00438eb842c1f37ae50"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4445fa62e15936a028672fd48c4c11a66d641d2c05726c7ec1f8ba6a572036ae"}, + {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:71f511f6b3b91dd543282477be45a033e4845a40278fa8dcdbfdb07109bf18f9"}, + {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040a5b691b0713e1f6cbe222e0f4f74cd233421e105850ae3b3c0ceda520f42e"}, + {file = "pillow-11.3.0-cp310-cp310-win32.whl", hash = "sha256:89bd777bc6624fe4115e9fac3352c79ed60f3bb18651420635f26e643e3dd1f6"}, + {file = "pillow-11.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:19d2ff547c75b8e3ff46f4d9ef969a06c30ab2d4263a9e287733aa8b2429ce8f"}, + {file = "pillow-11.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:819931d25e57b513242859ce1876c58c59dc31587847bf74cfe06b2e0cb22d2f"}, + {file = "pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722"}, + {file = "pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f"}, + {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e"}, + {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94"}, + {file = "pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0"}, + {file = "pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac"}, + {file = "pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd"}, + {file = "pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4"}, + {file = "pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024"}, + {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809"}, + {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d"}, + {file = "pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149"}, + {file = "pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d"}, + {file = "pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542"}, + {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd"}, + {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8"}, + {file = "pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f"}, + {file = "pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c"}, + {file = "pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8"}, + {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2"}, + {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b"}, + {file = "pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3"}, + {file = "pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51"}, + {file = "pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580"}, + {file = "pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e"}, + {file = "pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59"}, + {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe"}, + {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c"}, + {file = "pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788"}, + {file = "pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31"}, + {file = "pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e"}, + {file = "pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12"}, + {file = "pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77"}, + {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874"}, + {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a"}, + {file = "pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214"}, + {file = "pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = 
"sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635"}, + {file = "pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6"}, + {file = "pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae"}, + {file = "pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477"}, + {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50"}, + {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b"}, + {file = "pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12"}, + {file = "pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db"}, + {file = "pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d000f46e2917c705e9fb93a3606ee4a819d1e3aa7a9b442f6444f07e77cf5e25"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:527b37216b6ac3a12d7838dc3bd75208ec57c1c6d11ef01902266a5a0c14fc27"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be5463ac478b623b9dd3937afd7fb7ab3d79dd290a28e2b6df292dc75063eb8a"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8dc70ca24c110503e16918a658b869019126ecfe03109b754c402daff12b3d9f"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361"}, + {file = 
"pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8"}, + {file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"}, ] [[package]] @@ -2399,7 +2435,7 @@ name = "sqlalchemy" version = "2.0.36" summary = "" dependencies = [ - "greenlet; python_full_version < \"3.13\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")", + "greenlet; (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_full_version < \"3.13\"", "typing-extensions", ] files = [ @@ -2571,7 +2607,7 @@ dependencies = [ "python-dotenv", "pyyaml", "uvicorn==0.32.0", - "uvloop; platform_python_implementation != \"PyPy\" and (sys_platform != \"cygwin\" and sys_platform != \"win32\")", + "uvloop; (sys_platform != \"cygwin\" and sys_platform != \"win32\") and platform_python_implementation != \"PyPy\"", "watchfiles", "websockets", ] diff --git a/extralit-server/pyproject.toml b/extralit-server/pyproject.toml index 5e175aa62..1883a509e 100644 --- a/extralit-server/pyproject.toml +++ b/extralit-server/pyproject.toml @@ -60,7 +60,7 @@ dependencies = [ "standardwebhooks>=1.0.0", # For HF dataset import "datasets >= 3.0.1", - "pillow >= 10.4.0", + "pillow>=11.3.0", # For Telemetry "huggingface-hub>=0.26.2", "Jinja2>=3.1.4", # Used by huggingface-hub to render dataset card templates From 4940b2ee3394c40043e29ac90a2638809187196f Mon Sep 17 
00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 11:54:45 -0700 Subject: [PATCH 10/22] Modified database URL in .env.dev for better compatibility with user profiles --- .gitignore | 3 +++ extralit-server/.env.dev | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 996925b96..7a49b1481 100644 --- a/.gitignore +++ b/.gitignore @@ -155,3 +155,6 @@ src/**/server/static/ # App generated files extralit-server/src/extralit_server/static extralit/site + +# Development files +*.db \ No newline at end of file diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev index 234e8f829..56815c0bf 100644 --- a/extralit-server/.env.dev +++ b/extralit-server/.env.dev @@ -1,7 +1,7 @@ OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS ALEMBIC_CONFIG=src/extralit_server/alembic.ini EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded -EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE}/.extralit/extralit-dev.db?check_same_thread=False +EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE:-${HOME}}/.extralit/extralit-dev.db?check_same_thread=False HF_HUB_DISABLE_TELEMETRY=1 # S3 Configuration (skipped to use LocalFileStorage) From 0e5937744eb5d8d71535fd9f2803fc3b154310bf Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 12:19:12 -0700 Subject: [PATCH 11/22] optimize ocrmypdf params by updating `optimization` level to 0, disable fast web view optimization, large image skipping, and set number of `jobs` to 0 --- .../contexts/document/preprocessing.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py index b23f03fe9..9445c9b32 100644 --- 
a/extralit-server/src/extralit_server/contexts/document/preprocessing.py +++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py @@ -72,7 +72,7 @@ class Config: clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts") optimize: int = Field( - default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)" + default=0, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)" ) pdf_renderer: str = Field(default="hocr", description="PDF renderer: 'auto', 'hocr', 'sandwich'") @@ -96,6 +96,21 @@ class Config: description="Output type for OCRmyPDF. Set to 'pdf' to skip PDF/A conversion.", ) + fast_web_view: int = Field( + default=999999, + description="Fast web view optimization. Set to 999999 to disable fast web view optimization.", + ) + + skip_big: bool = Field( + default=True, + description="Skip large images if some pages have large images.", + ) + + jobs: int = Field( + default=1, + description="Number of worker processes to use for OCR. Set to 1 for Docker containers with limited CPU to avoid oversubscription.", + ) + def get_ocrmypdf_args(self) -> dict: """ Get OCRmyPDF arguments as a dictionary for use with **kwargs. @@ -117,6 +132,9 @@ def get_ocrmypdf_args(self) -> dict: "redo_ocr": self.redo_ocr, "progress_bar": self.progress_bar, "output_type": self.output_type, + "fast_web_view": self.fast_web_view, + "skip_big": self.skip_big, + "jobs": self.jobs, } From 804ed861967af9b8a2d48aa06b8a7dfc39e19c52 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 13:45:36 -0700 Subject: [PATCH 12/22] Update dependencies and optimize imports for lazy loading - Added `lazy-loader` package to manage heavy dependencies like `cv2`, `pdf2image`, and `ocrmypdf` for improved performance. - Updated `pdm.lock` to reflect the new package addition and modified existing dependencies. 
- Cleaned up import statements in `analysis.py` and `preprocessing.py` to utilize lazy loading, ensuring these libraries are only loaded when needed. - Removed unnecessary try-except blocks for dependency availability checks, as all required packages are now included in the application. --- extralit-server/.env.dev | 3 +- extralit-server/pdm.lock | 21 ++++++-- extralit-server/pyproject.toml | 6 ++- .../contexts/document/analysis.py | 51 ++++++++----------- .../contexts/document/preprocessing.py | 15 +++--- .../src/extralit_server/contexts/imports.py | 1 - 6 files changed, 51 insertions(+), 46 deletions(-) diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev index 56815c0bf..8a3093239 100644 --- a/extralit-server/.env.dev +++ b/extralit-server/.env.dev @@ -1,7 +1,7 @@ OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS ALEMBIC_CONFIG=src/extralit_server/alembic.ini EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded -EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${USERPROFILE:-${HOME}}/.extralit/extralit-dev.db?check_same_thread=False +EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False HF_HUB_DISABLE_TELEMETRY=1 # S3 Configuration (skipped to use LocalFileStorage) @@ -27,7 +27,6 @@ PREPROCESSING_ENABLE_ANALYSIS=false PREPROCESSING_LANGUAGE='["eng"]' PREPROCESSING_ROTATE_PAGES=true PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 -PREPROCESSING_OPTIMIZE=1 PREPROCESSING_CLEAN=false PREPROCESSING_SKIP_TEXT=true # PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR diff --git a/extralit-server/pdm.lock b/extralit-server/pdm.lock index a52eb2764..0de71f798 100644 --- a/extralit-server/pdm.lock +++ b/extralit-server/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "postgresql", "test"] strategy = [] lock_version = "4.5.0" -content_hash = 
"sha256:b81b48f68a21fcdb9fe67c3d94a30419667da306d05bde66d807bcf4bb51858e" +content_hash = "sha256:02264c865d9bc17964c0abbd121810947822b6ff067ac8fcd0cbcb1ef46de191" [[metadata.targets]] requires_python = ">=3.10" @@ -1149,6 +1149,18 @@ files = [ {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] +[[package]] +name = "lazy-loader" +version = "0.4" +summary = "" +dependencies = [ + "packaging", +] +files = [ + {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, + {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, +] + [[package]] name = "lxml" version = "6.0.0" @@ -1683,8 +1695,7 @@ files = [ [[package]] name = "pillow" version = "11.3.0" -requires_python = ">=3.9" -summary = "Python Imaging Library (Fork)" +summary = "" files = [ {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"}, {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"}, @@ -2435,7 +2446,7 @@ name = "sqlalchemy" version = "2.0.36" summary = "" dependencies = [ - "greenlet; (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_full_version < \"3.13\"", + "greenlet; python_full_version < \"3.13\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")", "typing-extensions", ] files = [ @@ -2607,7 +2618,7 @@ dependencies = [ "python-dotenv", "pyyaml", 
"uvicorn==0.32.0", - "uvloop; (sys_platform != \"cygwin\" and sys_platform != \"win32\") and platform_python_implementation != \"PyPy\"", + "uvloop; platform_python_implementation != \"PyPy\" and (sys_platform != \"cygwin\" and sys_platform != \"win32\")", "watchfiles", "websockets", ] diff --git a/extralit-server/pyproject.toml b/extralit-server/pyproject.toml index 1883a509e..58ba847d7 100644 --- a/extralit-server/pyproject.toml +++ b/extralit-server/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "social-auth-core ~= 4.5.0", # Background processing "rq ~= 1.16.2", + "lazy-loader>=0.4", # Info status "psutil ~= 5.8, <5.10", # For logging, tracebacks, printing, progressbars @@ -69,7 +70,7 @@ dependencies = [ # For document processing "ocrmypdf>=16.10.4", "pdf2image>=1.17.0", - "opencv-python>=4.11.0.86" + "opencv-python>=4.11.0.86", ] [project.optional-dependencies] @@ -186,8 +187,9 @@ worker = { cmd = "python -m extralit_server worker" } server-dev.composite = [ "migrate", "cli database users create_default", - "server", + "server-and-worker", ] +server-and-worker = { shell = "pdm run server & pdm run worker & wait" } test = { cmd = "pytest --verbosity=1 --disable-warnings", env_file = ".env.test" } test-cov = { cmd = "pytest tests --cov=extralit_server --cov-report=term --cov-report=xml --verbosity=0 --disable-warnings", env_file = ".env.test" } diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index 1f097cfe6..9d97b4fd5 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -14,31 +14,24 @@ import logging from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import numpy as np -try: - import cv2 +import lazy_loader as lazy - CV2_AVAILABLE = True -except ImportError: - CV2_AVAILABLE = 
False +# These are only loaded when actually used in Redis workers +cv2 = lazy.load("cv2") +pdf2image = lazy.load("pdf2image") +PIL_ImageChops = lazy.load("PIL.ImageChops") +PIL_ImageDraw = lazy.load("PIL.ImageDraw") +PIL_Image = lazy.load("PIL.Image") -try: - from pdf2image import convert_from_bytes - from PIL import ImageChops, ImageDraw - from PIL.Image import Image as PILImage +# Since dependencies are packaged together, they're always available +CV2_AVAILABLE = True +PDF2IMAGE_AVAILABLE = True - PDF2IMAGE_AVAILABLE = True -except ImportError: - PDF2IMAGE_AVAILABLE = False - -try: - pass - - OCRMYPDF_AVAILABLE = True -except ImportError: - OCRMYPDF_AVAILABLE = False +# For type hints - use Any to avoid import issues at module load time +PILImage = Any # Will be PIL.Image.Image when loaded logger = logging.getLogger(__name__) @@ -55,7 +48,7 @@ class PDFProcessingResult: def pil_to_cv(image: PILImage) -> np.ndarray: """Convert PIL Image to OpenCV format.""" - return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore def classify_and_draw_layout_regions( @@ -74,19 +67,19 @@ def classify_and_draw_layout_regions( h, w = mask_np.shape # Clean up the mask using morphological operations - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) - cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) # type: ignore + cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel) # type: ignore - contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # type: ignore img = reference.copy() if label else reference regions = [] if label: - draw = ImageDraw.Draw(img) + draw = PIL_ImageDraw.Draw(img) # type: ignore for cnt in contours: - x, y, rw, rh = cv2.boundingRect(cnt) + x, y, rw, rh = cv2.boundingRect(cnt) # type: ignore 
area = rw * rh if area < min_area: @@ -173,7 +166,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: try: # Convert PDF to images - images = convert_from_bytes(pdf_data, dpi=150) # Lower DPI for analysis + images = pdf2image.convert_from_bytes(pdf_data, dpi=150) # type: ignore # Lower DPI for analysis if not images: return {"analysis_available": False, "error": "No pages found"} @@ -227,8 +220,8 @@ def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> compare = compare.resize(reference.size) # Step 1: Compute difference and invert so white = same - diff = ImageChops.difference(reference, compare) - sameness_mask = ImageChops.invert(diff.convert("L")) + diff = PIL_ImageChops.difference(reference, compare) # type: ignore + sameness_mask = PIL_ImageChops.invert(diff.convert("L")) # type: ignore # Step 2: Threshold the mask (keep high-sameness pixels) # Create a lookup table for thresholding diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py index 9445c9b32..7e3dae3c4 100644 --- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py +++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py @@ -22,15 +22,16 @@ from typing import List, Optional from uuid import uuid4 +import lazy_loader as lazy from pydantic import Field from pydantic_settings import BaseSettings -try: - import ocrmypdf +# Lazy load OCRmyPDF to avoid loading it in the main FastAPI process +# Only loaded when actually used in Redis workers +ocrmypdf = lazy.load("ocrmypdf") - OCRMYPDF_AVAILABLE = True -except ImportError: - OCRMYPDF_AVAILABLE = False +# Since OCRmyPDF is packaged with the application, it's always available +OCRMYPDF_AVAILABLE = True try: from extralit_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult @@ -212,7 +213,7 @@ def preprocess(self, file_data: bytes, filename: str) -> 
PDFProcessingResult: input_buffer = BytesIO(file_data) output_buffer = BytesIO() - ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args()) + ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args()) # type: ignore processed_data = output_buffer.getvalue() output_buffer.close() @@ -256,7 +257,7 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: ) output_temp_file.close() - ocrmypdf.ocr(input_temp_file.name, output_temp_file.name, **self.settings.get_ocrmypdf_args()) + ocrmypdf.ocr(input_temp_file.name, output_temp_file.name, **self.settings.get_ocrmypdf_args()) # type: ignore with open(output_temp_file.name, "rb") as f: processed_data = f.read() diff --git a/extralit-server/src/extralit_server/contexts/imports.py b/extralit-server/src/extralit_server/contexts/imports.py index 6520d8acc..a5d892c1b 100644 --- a/extralit-server/src/extralit_server/contexts/imports.py +++ b/extralit-server/src/extralit_server/contexts/imports.py @@ -390,7 +390,6 @@ async def process_bulk_upload( reference_data=doc.document_create.model_dump(), file_data_list=[], user_id=user_id, - job_timeout=None, # No timeout for large uploads ) # Store job ID mapped to reference key for frontend tracking From 30c970a0ac7d764f8e9218bde4b6fe64bd25364a Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 15:33:38 -0700 Subject: [PATCH 13/22] Enable PDF analysis and update preprocessing settings - Updated `.env.dev` to enable PDF analysis and set quiet mode to false. - Introduced `PDFMetadata` and `PDFProcessingResponse` models for structured metadata handling in `analysis.py`. - Refactored `PDFPreprocessor` to utilize the new models and improve error handling during preprocessing. - Adjusted type hints in `analysis.py` and `preprocessing.py` for better clarity and consistency. - Updated job processing to reflect changes in the preprocessing method. 
--- extralit-server/.env.dev | 4 +- .../api/schemas/v1/document/preprocessing.py | 29 ++++ .../document/__init__.py} | 0 .../contexts/document/analysis.py | 55 ++---- .../contexts/document/preprocessing.py | 158 ++++++++---------- .../src/extralit_server/jobs/document_jobs.py | 2 +- 6 files changed, 114 insertions(+), 134 deletions(-) create mode 100644 extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py rename extralit-server/src/extralit_server/{api/schemas/v1/document/analysis.py => contexts/document/__init__.py} (100%) diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev index 8a3093239..afbd12a3e 100644 --- a/extralit-server/.env.dev +++ b/extralit-server/.env.dev @@ -23,11 +23,11 @@ EXTRALIT_REDIS_URL=redis://localhost:6379/0 # PDF Preprocessing PREPROCESSING_ENABLED=true -PREPROCESSING_ENABLE_ANALYSIS=false +PREPROCESSING_ENABLE_ANALYSIS=true PREPROCESSING_LANGUAGE='["eng"]' PREPROCESSING_ROTATE_PAGES=true PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 PREPROCESSING_CLEAN=false PREPROCESSING_SKIP_TEXT=true # PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR -PREPROCESSING_QUIET=true +PREPROCESSING_QUIET=false diff --git a/extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py b/extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py new file mode 100644 index 000000000..5448b90ca --- /dev/null +++ b/extralit-server/src/extralit_server/api/schemas/v1/document/preprocessing.py @@ -0,0 +1,29 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional +from pydantic import BaseModel + + +class PDFMetadata(BaseModel): + """ + Metadata for PDF processing results. + """ + + filename: str + processing_time: float + page_count: Optional[int] = None + language_detected: Optional[List[str]] = None + processing_settings: Optional[Dict] = None + analysis_results: Optional[Dict] = None diff --git a/extralit-server/src/extralit_server/api/schemas/v1/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/__init__.py similarity index 100% rename from extralit-server/src/extralit_server/api/schemas/v1/document/analysis.py rename to extralit-server/src/extralit_server/contexts/document/__init__.py diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index 9d97b4fd5..209ac517a 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -13,55 +13,36 @@ # limitations under the License. 
import logging -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple + +from typing import Dict, List, Optional, Tuple, TYPE_CHECKING import numpy as np import lazy_loader as lazy -# These are only loaded when actually used in Redis workers cv2 = lazy.load("cv2") pdf2image = lazy.load("pdf2image") -PIL_ImageChops = lazy.load("PIL.ImageChops") -PIL_ImageDraw = lazy.load("PIL.ImageDraw") -PIL_Image = lazy.load("PIL.Image") - -# Since dependencies are packaged together, they're always available -CV2_AVAILABLE = True -PDF2IMAGE_AVAILABLE = True +PIL = lazy.load("PIL") -# For type hints - use Any to avoid import issues at module load time -PILImage = Any # Will be PIL.Image.Image when loaded +if TYPE_CHECKING: + from PIL.Image import Image logger = logging.getLogger(__name__) -@dataclass -class PDFProcessingResult: - """ - Result of PDF preprocessing containing both processed data and analysis metadata. - """ - - processed_data: bytes - metadata: Dict - - -def pil_to_cv(image: PILImage) -> np.ndarray: +def pil_to_cv(image: Image) -> np.ndarray: """Convert PIL Image to OpenCV format.""" return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore def classify_and_draw_layout_regions( - reference: PILImage, mask: PILImage, min_area: int = 5000, label: bool = True -) -> Tuple[PILImage, List[Dict]]: + reference: Image, mask: Image, min_area: int = 5000, label: bool = True +) -> Tuple[Image, List[Dict]]: """ Classify and optionally draw layout regions using contour detection. 
Returns: Tuple of (annotated image, list of detected regions) """ - if not CV2_AVAILABLE: - return reference, [] mask_np = np.array(mask.convert("L")) h, w = mask_np.shape @@ -76,7 +57,7 @@ def classify_and_draw_layout_regions( regions = [] if label: - draw = PIL_ImageDraw.Draw(img) # type: ignore + draw = PIL.ImageDraw.Draw(img) # type: ignore for cnt in contours: x, y, rw, rh = cv2.boundingRect(cnt) # type: ignore @@ -118,7 +99,7 @@ def classify_and_draw_layout_regions( return img, regions -def find_horizontal_bands(mask: PILImage, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: +def find_horizontal_bands(mask: Image, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: """Find horizontal bands of similar content across pages.""" mask_np = np.array(mask.convert("L")) h, w = mask_np.shape @@ -160,13 +141,9 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: Returns: Dictionary containing layout analysis metadata """ - if not (PDF2IMAGE_AVAILABLE and CV2_AVAILABLE): - self.logger.warning("PDF analysis requires pdf2image and cv2, skipping layout analysis") - return {"analysis_available": False, "error": "Missing dependencies"} try: - # Convert PDF to images - images = pdf2image.convert_from_bytes(pdf_data, dpi=150) # type: ignore # Lower DPI for analysis + images = pdf2image.convert_from_bytes(pdf_data, dpi=150) # type: ignore if not images: return {"analysis_available": False, "error": "No pages found"} @@ -186,7 +163,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: self.logger.error(f"PDF layout analysis failed for {filename}: {e}") return {"analysis_available": False, "error": str(e)} - def _analyze_page_layout(self, images: List[PILImage]) -> Dict: + def _analyze_page_layout(self, images: List[Image]) -> Dict: """ Analyze page layout by comparing pages to find common regions. 
""" @@ -209,7 +186,7 @@ def _analyze_page_layout(self, images: List[PILImage]) -> Dict: else: return self._analyze_single_page(reference_img) - def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> Optional[Dict]: + def _compare_pages_for_margins(self, reference: Image, compare: Image) -> Optional[Dict]: """ Compare two pages to identify common regions using advanced CV2 techniques. """ @@ -220,8 +197,8 @@ def _compare_pages_for_margins(self, reference: PILImage, compare: PILImage) -> compare = compare.resize(reference.size) # Step 1: Compute difference and invert so white = same - diff = PIL_ImageChops.difference(reference, compare) # type: ignore - sameness_mask = PIL_ImageChops.invert(diff.convert("L")) # type: ignore + diff = PIL.ImageChops.difference(reference, compare) # type: ignore + sameness_mask = PIL.ImageChops.invert(diff.convert("L")) # type: ignore # Step 2: Threshold the mask (keep high-sameness pixels) # Create a lookup table for thresholding @@ -454,7 +431,7 @@ def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, } } - def _analyze_single_page(self, image: PILImage) -> Dict: + def _analyze_single_page(self, image: Image) -> Dict: """ Analyze a single page when comparison isn't possible. 
""" diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py index 7e3dae3c4..b2a22f02b 100644 --- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py +++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py @@ -19,28 +19,30 @@ import tempfile import time from io import BytesIO -from typing import List, Optional +from typing import List +from dataclasses import dataclass from uuid import uuid4 import lazy_loader as lazy from pydantic import Field from pydantic_settings import BaseSettings +from extralit_server.api.schemas.v1.document.preprocessing import PDFMetadata +from extralit_server.contexts.document.analysis import PDFAnalyzer + -# Lazy load OCRmyPDF to avoid loading it in the main FastAPI process -# Only loaded when actually used in Redis workers ocrmypdf = lazy.load("ocrmypdf") -# Since OCRmyPDF is packaged with the application, it's always available -OCRMYPDF_AVAILABLE = True +_LOGGER = logging.getLogger(__name__) -try: - from extralit_server.contexts.document.analysis import PDFAnalyzer, PDFProcessingResult - ANALYSIS_AVAILABLE = True -except ImportError: - ANALYSIS_AVAILABLE = False +@dataclass +class PDFProcessingResponse: + """ + Result of PDF preprocessing containing both processed data and analysis metadata. + """ -_LOGGER = logging.getLogger(__name__) + processed_data: bytes + metadata: PDFMetadata class PDFPreprocessingSettings(BaseSettings): @@ -57,6 +59,8 @@ class Config: default=True, description="Enable PDF preprocessing with OCRmyPDF. Set to False to disable all processing." 
) + enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection") + language: List[str] = Field( default=["eng"], description="List of languages for OCR processing (e.g., ['eng', 'spa', 'fra'])" ) @@ -73,24 +77,22 @@ class Config: clean: bool = Field(default=True, description="Use `unpaper` to clean up artifacts") optimize: int = Field( - default=0, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)" + default=1, description="Optimize output file size (0=none, 1=lossless, 2=lossy, 3=aggressive)" ) pdf_renderer: str = Field(default="hocr", description="PDF renderer: 'auto', 'hocr', 'sandwich'") force_ocr: bool = Field(default=False, description="Force OCR on all pages, even if they already have text") - tesseract_timeout: int = Field( - default=0, description="Timeout for Tesseract OCR processing in seconds (0 to skip Tesseract OCR)" - ) - skip_text: bool = Field(default=True, description="Skip text-based operations (OCR only for images)") redo_ocr: bool = Field(default=False, description="Redo OCR on pages that already have OCR") - progress_bar: bool = Field(default=False, description="Show progress bar during processing") + tesseract_timeout: int = Field( + default=0, description="Timeout for Tesseract OCR processing in seconds (0 to skip Tesseract OCR)" + ) - enable_analysis: bool = Field(default=True, description="Enable PDF layout analysis and margin detection") + progress_bar: bool = Field(default=False, description="Show progress bar during processing") output_type: str = Field( default="pdf", @@ -102,9 +104,9 @@ class Config: description="Fast web view optimization. 
Set to 999999 to disable fast web view optimization.", ) - skip_big: bool = Field( - default=True, - description="Skip large images if some pages have large images.", + skip_big: float = Field( + default=100.0, + description="Image size threshold in MB to skip OCR processing.", ) jobs: int = Field( @@ -139,6 +141,9 @@ def get_ocrmypdf_args(self) -> dict: } +settings = PDFPreprocessingSettings() + + class PDFPreprocessor: """ PDF preprocessor that uses OCRmyPDF for rotation, OCR, and optimization. @@ -147,29 +152,21 @@ class PDFPreprocessor: Can be configured with environment variables using the PDFPreprocessingSettings. """ - def __init__(self, settings: Optional[PDFPreprocessingSettings] = None): + def __init__(self, settings: PDFPreprocessingSettings = settings): """ Initialize the PDF preprocessor. Args: settings: Optional PDFPreprocessingSettings instance. If None, loads from environment. """ - self.settings = settings or PDFPreprocessingSettings() + self.settings = settings - # Initialize analyzer if available and enabled - if self.settings.enable_analysis and ANALYSIS_AVAILABLE: + if self.settings.enable_analysis: self.analyzer = PDFAnalyzer() else: self.analyzer = None - if self.settings.enable_analysis and not ANALYSIS_AVAILABLE: - _LOGGER.warning("PDF analysis is enabled but dependencies are not available") - if not self.settings.enabled: - _LOGGER.info("PDF preprocessing is disabled via configuration") - elif not OCRMYPDF_AVAILABLE: - _LOGGER.warning("OCRmyPDF not available, PDF preprocessing will be skipped") - - def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: + def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResponse: """ Preprocess PDF with OCRmyPDF and analyze layout structure. 
@@ -180,59 +177,55 @@ def preprocess(self, file_data: bytes, filename: str) -> PDFProcessingResult: Returns: PDFProcessingResult containing processed data and layout analysis metadata """ - metadata = {} + # Initialize metadata variables + analysis_results = None + processing_time = 0.0 + processed_data = file_data - # Handle non-PDF files or disabled preprocessing + # Handle non-PDF files if not filename.lower().endswith(".pdf"): - return PDFProcessingResult(processed_data=file_data, metadata=metadata) + pass # Use default values - if not self.settings.enabled: + # Handle disabled preprocessing + elif not self.settings.enabled: if self.analyzer: - layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) - metadata.update(layout_analysis) - return PDFProcessingResult(processed_data=file_data, metadata=metadata) + analysis_results = self.analyzer.analyze_pdf_layout(file_data, filename) - if not OCRMYPDF_AVAILABLE: - _LOGGER.warning("OCRmyPDF not available, skipping preprocessing") - # Still run analysis on original data if enabled and available - if self.analyzer: - layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) - metadata.update(layout_analysis) - return PDFProcessingResult(processed_data=file_data, metadata=metadata) + # Handle PDF processing + else: + try: + start_time = time.time() - try: - start_time = time.time() + # Step 1: Analyze original PDF layout (if enabled) + if self.analyzer: + analysis_results = self.analyzer.analyze_pdf_layout(file_data, filename) - # Step 1: Analyze original PDF layout (if enabled and available) - if self.analyzer: - layout_analysis = self.analyzer.analyze_pdf_layout(file_data, filename) - metadata.update(layout_analysis) + # Step 2: OCR preprocessing + try: + input_buffer = BytesIO(file_data) + output_buffer = BytesIO() - # Step 2: OCR preprocessing - try: - input_buffer = BytesIO(file_data) - output_buffer = BytesIO() + ocrmypdf.ocr(input_buffer, output_buffer, 
**self.settings.get_ocrmypdf_args()) # type: ignore - ocrmypdf.ocr(input_buffer, output_buffer, **self.settings.get_ocrmypdf_args()) # type: ignore + processed_data = output_buffer.getvalue() + output_buffer.close() + input_buffer.close() - processed_data = output_buffer.getvalue() - output_buffer.close() - input_buffer.close() + except Exception as buffer_error: + _LOGGER.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") + processed_data = self._preprocess_with_temp_files(file_data, filename) - except Exception as buffer_error: - _LOGGER.debug(f"BytesIO approach failed for {filename}, falling back to temp files: {buffer_error}") - processed_data = self._preprocess_with_temp_files(file_data, filename) + processing_time = time.time() - start_time + print(filename, analysis_results) - processing_time = time.time() - start_time - metadata["processing_time_seconds"] = processing_time - print(metadata) - _LOGGER.info(f"PDF preprocessing completed for {filename} in {processing_time:.2f} seconds") + except Exception: + # Use default values on error + pass - return PDFProcessingResult(processed_data=processed_data, metadata=metadata) + # Single PDFMetadata initialization for all code paths + metadata = PDFMetadata(filename=filename, processing_time=processing_time, analysis_results=analysis_results) - except Exception as e: - _LOGGER.error(f"PDF preprocessing failed for {filename}: {e}") - return PDFProcessingResult(processed_data=file_data, metadata=metadata) + return PDFProcessingResponse(processed_data=processed_data, metadata=metadata) def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: """ @@ -274,23 +267,4 @@ def _preprocess_with_temp_files(self, file_data: bytes, filename: str) -> bytes: _LOGGER.warning(f"Failed to clean up temp file: {e}") -# Global preprocessor instance (can be configured via environment variables) -pdf_preprocessor = PDFPreprocessor() - - -def 
preprocess_pdf_with_ocrmypdf(file_data: bytes, filename: str) -> bytes: - """ - Preprocess PDF with OCRmyPDF to add OCR layer and fix orientation. - - This function provides backward compatibility by using the global pdf_preprocessor instance. - For new code, consider using PDFPreprocessor directly for better configuration control. - - Args: - file_data: PDF file data as bytes - filename: Original filename for logging purposes - - Returns: - Processed PDF data as bytes (or original bytes if processing fails) - """ - result = pdf_preprocessor.preprocess(file_data, filename) - return result.processed_data +preprocessor = PDFPreprocessor() diff --git a/extralit-server/src/extralit_server/jobs/document_jobs.py b/extralit-server/src/extralit_server/jobs/document_jobs.py index db8c433ce..d6c2788c3 100644 --- a/extralit-server/src/extralit_server/jobs/document_jobs.py +++ b/extralit-server/src/extralit_server/jobs/document_jobs.py @@ -132,7 +132,7 @@ async def upload_reference_documents_job( try: # Preprocess PDF files with OCRmyPDF for rotation and OCR, plus layout analysis - preprocessing_result = preprocessing.pdf_preprocessor.preprocess( + preprocessing_result = preprocessing.preprocessor.preprocess( file_data=file_data, filename=filename ) processed_file_data = preprocessing_result.processed_data From 1eacf585a4baee2402414b0a5d599c07dcdf14f3 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 15:36:30 -0700 Subject: [PATCH 14/22] fix typechecking --- .../extralit_server/contexts/document/analysis.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index 209ac517a..c488ae91e 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -29,14 +29,14 @@ logger = logging.getLogger(__name__) -def 
pil_to_cv(image: Image) -> np.ndarray: +def pil_to_cv(image: "Image") -> np.ndarray: """Convert PIL Image to OpenCV format.""" return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore def classify_and_draw_layout_regions( - reference: Image, mask: Image, min_area: int = 5000, label: bool = True -) -> Tuple[Image, List[Dict]]: + reference: "Image", mask: "Image", min_area: int = 5000, label: bool = True +) -> Tuple["Image", List[Dict]]: """ Classify and optionally draw layout regions using contour detection. @@ -99,7 +99,7 @@ def classify_and_draw_layout_regions( return img, regions -def find_horizontal_bands(mask: Image, min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: +def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: """Find horizontal bands of similar content across pages.""" mask_np = np.array(mask.convert("L")) h, w = mask_np.shape @@ -163,7 +163,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: self.logger.error(f"PDF layout analysis failed for {filename}: {e}") return {"analysis_available": False, "error": str(e)} - def _analyze_page_layout(self, images: List[Image]) -> Dict: + def _analyze_page_layout(self, images: List["Image"]) -> Dict: """ Analyze page layout by comparing pages to find common regions. """ @@ -186,7 +186,7 @@ def _analyze_page_layout(self, images: List[Image]) -> Dict: else: return self._analyze_single_page(reference_img) - def _compare_pages_for_margins(self, reference: Image, compare: Image) -> Optional[Dict]: + def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Optional[Dict]: """ Compare two pages to identify common regions using advanced CV2 techniques. 
""" @@ -431,7 +431,7 @@ def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, } } - def _analyze_single_page(self, image: Image) -> Dict: + def _analyze_single_page(self, image: "Image") -> Dict: """ Analyze a single page when comparison isn't possible. """ From 8d0704c89cb2d3f4f7bb38ca463802837c926fd0 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 16:35:21 -0700 Subject: [PATCH 15/22] Refactor image handling and lazy load dependencies - Updated image handling in `_media.py`, `_hub.py`, and `_datasets.py` to utilize lazy loading for `PIL` and `datasets` modules. - Adjusted type hints and checks to ensure compatibility with lazy-loaded imports. - Enhanced error handling and type checking for image processing functions. - Modified `DataframeData` schema to use an alias for schema definition. --- .../extralit_server/api/schemas/v1/imports.py | 2 +- .../src/extralit_server/cli/__init__.py | 34 +++++++++++++------ .../src/extralit_server/cli/__main__.py | 16 +-------- .../src/extralit_server/contexts/hub.py | 29 ++++++++++------ extralit/src/extralit/_helpers/_media.py | 21 +++++++----- extralit/src/extralit/datasets/_io/_hub.py | 20 ++++++----- .../src/extralit/records/_dataset_records.py | 9 ++--- extralit/src/extralit/records/_io/__init__.py | 1 - .../src/extralit/records/_io/_datasets.py | 33 ++++++++++-------- 9 files changed, 91 insertions(+), 74 deletions(-) diff --git a/extralit-server/src/extralit_server/api/schemas/v1/imports.py b/extralit-server/src/extralit_server/api/schemas/v1/imports.py index 654fff606..837a004cf 100644 --- a/extralit-server/src/extralit_server/api/schemas/v1/imports.py +++ b/extralit-server/src/extralit_server/api/schemas/v1/imports.py @@ -68,7 +68,7 @@ class DataframeSchema(BaseModel): class DataframeData(BaseModel): """Tabular dataframe representation for generalized import support.""" - schema: DataframeSchema = Field(..., description="Schema definition with fields and primary key") + schema_: 
DataframeSchema = Field(..., alias="schema", description="Schema definition with fields and primary key") data: List[Dict[str, Any]] = Field(..., description="List of data rows as dictionaries") diff --git a/extralit-server/src/extralit_server/cli/__init__.py b/extralit-server/src/extralit_server/cli/__init__.py index b0ad568f3..0e75fb520 100644 --- a/extralit-server/src/extralit_server/cli/__init__.py +++ b/extralit-server/src/extralit_server/cli/__init__.py @@ -1,18 +1,30 @@ -# Copyright 2021-present, the Recognai S.L. team. +# Copyright 2024-present, Extralit Labs, Inc. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-from .__main__ import app +import typer + +from .database import app as database_app +from .search_engine import app as search_engine_app +from .start import start +from .worker import worker + +app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True) + +app.add_typer(database_app, name="database") +app.add_typer(search_engine_app, name="search-engine") +app.command(name="worker", help="Starts rq workers")(worker) +app.command(name="start", help="Starts the Extralit server")(start) if __name__ == "__main__": app() diff --git a/extralit-server/src/extralit_server/cli/__main__.py b/extralit-server/src/extralit_server/cli/__main__.py index 3c98db185..7d7ff85aa 100644 --- a/extralit-server/src/extralit_server/cli/__main__.py +++ b/extralit-server/src/extralit_server/cli/__main__.py @@ -12,21 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import typer - -from .database import app as database_app -from .search_engine import app as search_engine_app -from .start import start -from .worker import worker - -app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True) - - -app.add_typer(database_app, name="database") -app.add_typer(search_engine_app, name="search-engine") -app.command(name="worker", help="Starts rq workers")(worker) -app.command(name="start", help="Starts the Extralit server")(start) - +from extralit_server.cli import app if __name__ == "__main__": app() diff --git a/extralit-server/src/extralit_server/contexts/hub.py b/extralit-server/src/extralit_server/contexts/hub.py index 8f8ceb981..9d8fbf1a9 100644 --- a/extralit-server/src/extralit_server/contexts/hub.py +++ b/extralit-server/src/extralit_server/contexts/hub.py @@ -19,16 +19,23 @@ from uuid import uuid4 from pathlib import Path -from typing import Any, Optional, List +from typing import Any, Optional, List, TYPE_CHECKING from typing_extensions import Self from tempfile import 
TemporaryDirectory -from PIL import Image +import lazy_loader as lazy + +datasets = lazy.load("datasets") +PIL = lazy.load("PIL") + from sqlalchemy.orm import selectinload from sqlalchemy.ext.asyncio import AsyncSession from pydantic import BaseModel from huggingface_hub import HfApi, DatasetCard, DatasetCardData -from datasets import Dataset as HFDataset, NamedSplit, load_dataset, features + +if TYPE_CHECKING: + from PIL import Image + from datasets import Dataset as HFDataset from extralit_server.contexts import info from extralit_server.database import get_sync_db @@ -62,7 +69,7 @@ class HubDataset: def __init__(self, name: str, subset: str, split: str, mapping: HubDatasetMapping): - self.dataset: HFDataset = load_dataset(path=name, name=subset, split=split, streaming=True) # type: ignore + self.dataset: "HFDataset" = datasets.load_dataset(path=name, name=subset, split=split, streaming=True) # type: ignore self.split = split self.mapping = mapping self.mapping_feature_names = mapping.sources @@ -121,14 +128,14 @@ def _batch_index_to_row(self, batch: dict, index: int) -> dict: return row def _cast_feature_value(self, feature: Any, value: Any) -> Any: - if isinstance(feature, features.ClassLabel): + if isinstance(feature, datasets.features.ClassLabel): # type: ignore if value == FEATURE_CLASS_LABEL_NO_LABEL: return None else: return feature.int2str(value) - elif isinstance(feature, features.Sequence): + elif isinstance(feature, datasets.features.Sequence): # type: ignore return [self._cast_feature_value(feature.feature, v) for v in value] - elif isinstance(feature, features.Image) and isinstance(value, Image.Image): + elif isinstance(feature, datasets.features.Image) and isinstance(value, PIL.Image.Image): # type: ignore return pil_image_to_data_url(value) else: return value @@ -231,7 +238,9 @@ def __init__(self, dataset: Dataset): self.cache_version = uuid4() def export_to(self, name: str, subset: str, split: str, private: bool, token: str) -> None: - hf_dataset: 
HFDataset = HFDataset.from_generator(self._rows_generator, split=NamedSplit(split)) # type: ignore + hf_dataset: "HFDataset" = datasets.Dataset.from_generator( + self._rows_generator, split=datasets.NamedSplit(split) + ) # type: ignore hf_dataset.push_to_hub( repo_id=name, config_name=subset, @@ -285,7 +294,7 @@ def _row_fields(self, record: Record) -> dict: feature_value = record.fields.get(field.name) if field.is_image and feature_value is not None and feature_value.startswith("data:"): - row_fields[feature_name] = Image.open(io.BytesIO(data_url_to_bytes(feature_value))) + row_fields[feature_name] = PIL.Image.open(io.BytesIO(data_url_to_bytes(feature_value))) # type: ignore else: row_fields[feature_name] = feature_value @@ -474,7 +483,7 @@ def _create_readme_file(self, directory: str, repo_id: str) -> None: card.save(os.path.join(directory, "README.md")) -def pil_image_to_data_url(image: Image.Image): +def pil_image_to_data_url(image: "Image"): buffer = io.BytesIO() image_format = image.format or DATA_URL_DEFAULT_IMAGE_FORMAT diff --git a/extralit/src/extralit/_helpers/_media.py b/extralit/src/extralit/_helpers/_media.py index 01a7edc02..09c0ceb72 100644 --- a/extralit/src/extralit/_helpers/_media.py +++ b/extralit/src/extralit/_helpers/_media.py @@ -16,9 +16,14 @@ import io import warnings from pathlib import Path -from typing import Union, Optional +from typing import Union, Optional, TYPE_CHECKING -from PIL import Image +import lazy_loader as lazy + +PIL = lazy.load("PIL") + +if TYPE_CHECKING: + from PIL.Image import Image def pil_to_data_uri(image_object: Optional["Image"]) -> Optional[str]: @@ -30,7 +35,7 @@ def pil_to_data_uri(image_object: Optional["Image"]) -> Optional[str]: """ if image_object is None: return None - if not isinstance(image_object, Image.Image): + if not isinstance(image_object, PIL.Image.Image): # type: ignore raise ValueError("The image_object must be a PIL Image object.") image_format = image_object.format @@ -82,28 +87,28 @@ def 
cast_image(image: Union["Image", str, Path]) -> str: return filepath_to_data_uri(image) elif isinstance(image, Path): return filepath_to_data_uri(image) - elif isinstance(image, Image.Image): + elif isinstance(image, PIL.Image.Image): # type: ignore return pil_to_data_uri(image) else: raise ValueError("The image must be a data URI string, a file path, or a PIL Image object.") -def uncast_image(image: str) -> "Image": +def uncast_image(image: Union[str, "Image"]) -> "Image": """Convert a base64 data URI string to a PIL image.""" - if isinstance(image, Image.Image): + if isinstance(image, PIL.Image.Image): # type: ignore return image elif not isinstance(image, str): raise ValueError("The image must be a data URI string.") elif image.startswith("data:image"): try: image_data = base64.b64decode(image.split(",")[1]) - image = Image.open(io.BytesIO(image_data)) + image = PIL.Image.open(io.BytesIO(image_data)) # type: ignore except Exception as e: raise ValueError("An error occurred while converting the data URI to a PIL image.") from e return image elif image.startswith("http"): return image elif Path(image).exists(): - return Image.open(image) + return PIL.Image.open(image) # type: ignore else: raise ValueError("The image must be a data URI string.") diff --git a/extralit/src/extralit/datasets/_io/_hub.py b/extralit/src/extralit/datasets/_io/_hub.py index 15851ae4d..00a5ded4c 100644 --- a/extralit/src/extralit/datasets/_io/_hub.py +++ b/extralit/src/extralit/datasets/_io/_hub.py @@ -19,10 +19,8 @@ from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, Optional, Type, Union, Literal from uuid import UUID +import lazy_loader as lazy -from datasets import DatasetDict -from datasets.data_files import EmptyDatasetError -from PIL import Image from extralit._exceptions import ImportDatasetError from extralit._exceptions._api import UnprocessableEntityError @@ -34,6 +32,9 @@ from extralit.records._mapping import IngestedRecordMapper from 
extralit.responses import Response +datasets = lazy.load("datasets") +PIL = lazy.load("PIL") + if TYPE_CHECKING: from datasets import Dataset as HFDataset @@ -143,7 +144,8 @@ def from_hub( A `Dataset` loaded from the Hugging Face Hub. """ from extralit.settings import Settings - from datasets import load_dataset + + # load_dataset is accessed via lazy loaded datasets module from huggingface_hub import snapshot_download settings = settings or "ui" @@ -194,15 +196,15 @@ def from_hub( if with_records: try: - hf_dataset = load_dataset( + hf_dataset = datasets.load_dataset( # type: ignore path=repo_id, split=split, name=subset, **kwargs, - ) # type: ignore + ) hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, split=split, **kwargs) cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset) - except EmptyDatasetError: + except datasets.data_files.EmptyDatasetError: # type: ignore warnings.warn( message="Trying to load a dataset `with_records=True` but dataset does not contain any records.", category=UserWarning, @@ -298,7 +300,7 @@ def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **k HFDataset: The single dataset. 
""" - if isinstance(hf_dataset, DatasetDict) and split is None: + if isinstance(hf_dataset, datasets.DatasetDict) and split is None: # type: ignore split = next(iter(hf_dataset.keys())) if len(hf_dataset.keys()) > 1: warnings.warn( @@ -326,7 +328,7 @@ def _get_sample_hf_record(hf_dataset: "HFDataset") -> Dict: json.dumps(value) sample_huggingface_record[key] = value except TypeError: - if isinstance(value, Image.Image): + if isinstance(value, PIL.Image.Image): # type: ignore sample_huggingface_record[key] = pil_to_data_uri(value) else: sample_huggingface_record[key] = "Record value is not serializable" diff --git a/extralit/src/extralit/records/_dataset_records.py b/extralit/src/extralit/records/_dataset_records.py index 4c9b3fe15..1bfae04f6 100644 --- a/extralit/src/extralit/records/_dataset_records.py +++ b/extralit/src/extralit/records/_dataset_records.py @@ -24,13 +24,14 @@ from extralit._models import RecordModel from extralit._exceptions import RecordsIngestionError from extralit.client import Extralit -from extralit.records._io import GenericIO, HFDataset, HFDatasetsIO, JsonIO +from extralit.records._io import GenericIO, HFDatasetsIO, JsonIO from extralit.records._mapping import IngestedRecordMapper from extralit.records._resource import Record from extralit.records._search import Query if TYPE_CHECKING: from extralit.datasets import Dataset + from datasets import Dataset as HFDataset class RecordErrorHandling(Enum): @@ -246,7 +247,7 @@ def __repr__(self) -> str: def log( self, - records: Union[List[dict], List[Record], HFDataset], + records: Union[List[dict], List[Record], "HFDataset"], mapping: Optional[Dict[str, Union[str, Sequence[str]]]] = None, user_id: Optional[UUID] = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -403,7 +404,7 @@ def from_json(self, path: Union[Path, str]) -> List[Record]: records = JsonIO._records_from_json(path=path) return self.log(records=records) - def to_datasets(self) -> HFDataset: + def to_datasets(self) -> "HFDataset": """ 
Export the records to a HFDataset. @@ -420,7 +421,7 @@ def to_datasets(self) -> HFDataset: def _ingest_records( self, - records: Union[List[Dict[str, Any]], List[Record], HFDataset], + records: Union[List[Dict[str, Any]], List[Record], "HFDataset"], mapping: Optional[Dict[str, Union[str, Sequence[str]]]] = None, user_id: Optional[UUID] = None, on_error: RecordErrorHandling = RecordErrorHandling.RAISE, diff --git a/extralit/src/extralit/records/_io/__init__.py b/extralit/src/extralit/records/_io/__init__.py index 206b7a71d..398ce4a30 100644 --- a/extralit/src/extralit/records/_io/__init__.py +++ b/extralit/src/extralit/records/_io/__init__.py @@ -15,4 +15,3 @@ from extralit.records._io._datasets import HFDatasetsIO # noqa: F401 from extralit.records._io._generic import GenericIO # noqa: F401 from extralit.records._io._json import JsonIO # noqa: F401 -from extralit.records._io._datasets import HFDataset # noqa: F401 diff --git a/extralit/src/extralit/records/_io/_datasets.py b/extralit/src/extralit/records/_io/_datasets.py index 9de0afddb..2432ca65e 100644 --- a/extralit/src/extralit/records/_io/_datasets.py +++ b/extralit/src/extralit/records/_io/_datasets.py @@ -14,14 +14,17 @@ import warnings from typing import TYPE_CHECKING, Any, Dict, List, Union, Optional, Tuple - -from datasets import Dataset as HFDataset, Sequence -from datasets import Image, ClassLabel, Value +import lazy_loader as lazy from extralit._helpers._media import pil_to_data_uri, uncast_image from extralit.records._io._generic import GenericIO +datasets = lazy.load("datasets") + + if TYPE_CHECKING: + from datasets import Dataset as HFDataset, ClassLabel + from extralit.records import Record from extralit.datasets import Dataset from extralit.records._mapping import IngestedRecordMapper @@ -41,7 +44,7 @@ def _cast_images_as_urls(hf_dataset: "HFDataset", columns: List[str]) -> "HFData for column in columns: # make an updated features object with the new column type features = 
hf_dataset.features.copy() - features[column] = Value("string") + features[column] = datasets.Value("string") # type: ignore # cast the column in batches hf_dataset = hf_dataset.map( function=lambda batch: {column: [pil_to_data_uri(sample) for sample in batch]}, @@ -55,7 +58,7 @@ def _cast_images_as_urls(hf_dataset: "HFDataset", columns: List[str]) -> "HFData return hf_dataset -def _int2class_name(feature: ClassLabel, value: int) -> Optional[str]: +def _int2class_name(feature: "ClassLabel", value: int) -> Optional[str]: try: return feature.int2str(value) except Exception as ex: @@ -73,7 +76,7 @@ def map2str_list(x: dict, column_name: str, features: dict): for column in columns: features = hf_dataset.features.copy() - features[column] = Sequence(Value("string")) + features[column] = datasets.Sequence(datasets.Value("string")) # type: ignore hf_dataset = hf_dataset.map( map2str_list, fn_kwargs={"column_name": column, "features": hf_dataset.features}, @@ -103,7 +106,7 @@ def label_column2str(x: dict, column: str, features: dict) -> Dict[str, Union[st for column in columns: features = hf_dataset.features.copy() - features[column] = Value("string") + features[column] = datasets.Value("string") # type: ignore hf_dataset = hf_dataset.map( label_column2str, fn_kwargs={"column": column, "features": hf_dataset.features}, features=features ) @@ -126,7 +129,7 @@ def _uncast_uris_as_images(hf_dataset: "HFDataset", columns: List[str]) -> "HFDa for column in columns: features = hf_dataset.features.copy() - features[column] = Image() + features[column] = datasets.Image() # type: ignore casted_hf_dataset = hf_dataset.map( function=lambda batch: {column: [uncast_image(sample) for sample in batch]}, with_indices=False, @@ -162,7 +165,7 @@ def _uncast_label_questions_as_classlabels(hf_dataset: "HFDataset", columns: Lis continue values = list(hf_dataset.unique(column)) features = hf_dataset.features.copy() - features[column] = ClassLabel(names=values) + features[column] = 
datasets.ClassLabel(names=values) # type: ignore hf_dataset = hf_dataset.map( function=lambda batch: {column: [values.index(sample) for sample in batch]}, with_indices=False, @@ -191,10 +194,10 @@ def _is_hf_dataset(dataset: Any) -> bool: Returns: bool: True if the object is a Hugging Face dataset, False otherwise. """ - return isinstance(dataset, HFDataset) + return isinstance(dataset, datasets.Dataset) # type: ignore @staticmethod - def to_datasets(records: List[Union["Record", Tuple["Record", float]]], dataset: "Dataset") -> HFDataset: + def to_datasets(records: List[Union["Record", Tuple["Record", float]]], dataset: "Dataset") -> "HFDataset": """ Export the records to a Hugging Face dataset. @@ -202,7 +205,7 @@ def to_datasets(records: List[Union["Record", Tuple["Record", float]]], dataset: The dataset containing the records. """ record_dicts = GenericIO.to_dict(records, flatten=True) - hf_dataset = HFDataset.from_dict(record_dicts) + hf_dataset = datasets.Dataset.from_dict(record_dicts) # type: ignore hf_dataset = HFDatasetsIO._uncast_argilla_attributes_to_datasets(hf_dataset, dataset.schema) return hf_dataset @@ -277,11 +280,11 @@ def to_argilla(hf_dataset: "HFDataset", mapper: "IngestedRecordMapper") -> "HFDa class_label_sequence_columns = [] for name, feature in hf_dataset.features.items(): - if isinstance(feature, Image): + if isinstance(feature, datasets.Image): # type: ignore image_columns.append(name) - elif isinstance(feature, ClassLabel): + elif isinstance(feature, datasets.ClassLabel): # type: ignore class_label_columns.append(name) - elif isinstance(feature, Sequence) and isinstance(feature.feature, ClassLabel): + elif isinstance(feature, datasets.Sequence) and isinstance(feature.feature, datasets.ClassLabel): # type: ignore class_label_sequence_columns.append(name) if image_columns: From 484efd82ab07ce4a261a17f8050101ac80a24a28 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 16:41:05 -0700 Subject: [PATCH 16/22] add lazy-loader --- 
extralit/pdm.lock | 14 +++++++++++++- extralit/pyproject.toml | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/extralit/pdm.lock b/extralit/pdm.lock index 37cb4cf1a..fef71b678 100644 --- a/extralit/pdm.lock +++ b/extralit/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = [] lock_version = "4.5.0" -content_hash = "sha256:70ce069646b7109d8dc4217449d7053c16b14700891f5b9902207ac35ba7943c" +content_hash = "sha256:4eed977ec0ae819f547f71e5131d350c7686fa2a8399a998cb62994db78d442c" [[metadata.targets]] requires_python = ">=3.9.2,<3.14" @@ -2130,6 +2130,18 @@ files = [ {file = "language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec"}, ] +[[package]] +name = "lazy-loader" +version = "0.4" +summary = "" +dependencies = [ + "packaging", +] +files = [ + {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, + {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, +] + [[package]] name = "llama-cloud" version = "0.1.23" diff --git a/extralit/pyproject.toml b/extralit/pyproject.toml index 328c37fb4..e2865f785 100644 --- a/extralit/pyproject.toml +++ b/extralit/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "pillow>=9.5.0", "standardwebhooks>=1.0.0", "typer>=0.9.0", + "lazy-loader>=0.4", # for environment variables "python-dotenv~=1.1.0", @@ -49,6 +50,7 @@ dependencies = [ "fastparquet >= 2023.10.0; python_version < '3.13'", "fastparquet >= 2024.4.0; python_version >= '3.13'", "tiktoken ~= 0.9.0", + "bibtexparser>=1.4.3", # for llama-index "llama-index ~= 0.10.68", @@ -61,7 +63,6 @@ dependencies = [ # for weaviate vector db "weaviate-client >= 4", "llama-index-vector-stores-weaviate ~= 1.0.0", - "bibtexparser>=1.4.3", ] nlp = [ "textdescriptives", From 669d989ae85f35a42cfbeeb6e6d80314a09cfd38 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 
18:01:24 -0700 Subject: [PATCH 17/22] Refactor document upload jobs and logging - Renamed `upload_reference_documents_job` to `upload_and_preprocess_documents_job` for clarity in functionality. - Updated references in `imports.py`, `analysis.py`, and test files to reflect the new job name. - Improved logging consistency by standardizing logger usage across the `PDFAnalyzer` class. --- .../contexts/document/analysis.py | 17 +++++------------ .../src/extralit_server/contexts/imports.py | 6 +++--- .../src/extralit_server/jobs/document_jobs.py | 4 ++-- .../tests/unit/jobs/test_document_jobs.py | 8 ++++---- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index c488ae91e..e470a11fd 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: from PIL.Image import Image -logger = logging.getLogger(__name__) +_LOGGER = logging.getLogger(__name__) def pil_to_cv(image: "Image") -> np.ndarray: @@ -123,13 +123,6 @@ def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float class PDFAnalyzer: - """ - Analyzes PDF layout structure to detect margins, headers, footers, and other regions. - """ - - def __init__(self): - self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") - def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: """ Analyze PDF layout to extract margin and region information. 
@@ -147,7 +140,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: if not images: return {"analysis_available": False, "error": "No pages found"} - self.logger.info(f"Analyzing layout for {filename} with {len(images)} pages") + _LOGGER.info(f"Analyzing layout for {filename} with {len(images)} pages") # Analyze layout layout_data = self._analyze_page_layout(images) @@ -160,7 +153,7 @@ def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: } except Exception as e: - self.logger.error(f"PDF layout analysis failed for {filename}: {e}") + _LOGGER.error(f"PDF layout analysis failed for {filename}: {e}") return {"analysis_available": False, "error": str(e)} def _analyze_page_layout(self, images: List["Image"]) -> Dict: @@ -193,7 +186,7 @@ def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Op try: # Ensure same size if reference.size != compare.size: - self.logger.debug(f"Resizing page to match reference size") + _LOGGER.debug(f"Resizing page to match reference size") compare = compare.resize(reference.size) # Step 1: Compute difference and invert so white = same @@ -220,7 +213,7 @@ def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Op return regions except Exception as e: - self.logger.debug(f"Page comparison failed: {e}") + _LOGGER.debug(f"Page comparison failed: {e}") return None def _classify_regions_advanced( diff --git a/extralit-server/src/extralit_server/contexts/imports.py b/extralit-server/src/extralit_server/contexts/imports.py index a5d892c1b..21a2f1ae9 100644 --- a/extralit-server/src/extralit_server/contexts/imports.py +++ b/extralit-server/src/extralit_server/contexts/imports.py @@ -37,7 +37,7 @@ ImportHistoryCreate, ImportHistoryCreateResponse, ) -from extralit_server.jobs.document_jobs import upload_reference_documents_job +from extralit_server.jobs.document_jobs import upload_and_preprocess_documents_job _LOGGER = logging.getLogger(__name__) @@ -385,7 +385,7 @@ 
async def process_bulk_upload( if not doc.associated_files: # Create a reference-based job for documents without files job = DEFAULT_QUEUE.enqueue( - upload_reference_documents_job, + upload_and_preprocess_documents_job, reference=reference, reference_data=doc.document_create.model_dump(), file_data_list=[], @@ -435,7 +435,7 @@ async def process_bulk_upload( # Create a reference-based job for multiple files job = DEFAULT_QUEUE.enqueue( - upload_reference_documents_job, + upload_and_preprocess_documents_job, reference=reference, reference_data=doc.document_create.model_dump(), file_data_list=file_data_list, diff --git a/extralit-server/src/extralit_server/jobs/document_jobs.py b/extralit-server/src/extralit_server/jobs/document_jobs.py index d6c2788c3..6a248c000 100644 --- a/extralit-server/src/extralit_server/jobs/document_jobs.py +++ b/extralit-server/src/extralit_server/jobs/document_jobs.py @@ -33,7 +33,7 @@ @job(DEFAULT_QUEUE, timeout=JOB_TIMEOUT_DISABLED, retry=Retry(max=3, interval=[10, 30, 60])) -async def upload_reference_documents_job( +async def upload_and_preprocess_documents_job( reference: str, reference_data: Dict[str, Any], file_data_list: List[Tuple[str, bytes]], # List of (filename, file_data) tuples @@ -183,7 +183,7 @@ async def upload_reference_documents_job( results["success"] = results["failed_files"] == 0 except Exception as e: - error_msg = f"Error in upload_reference_documents_job for reference {reference}: {str(e)}" + error_msg = f"Error uploading documents for reference {reference}: {str(e)}" _LOGGER.error(error_msg) results["success"] = False results["errors"].append(str(e)) diff --git a/extralit-server/tests/unit/jobs/test_document_jobs.py b/extralit-server/tests/unit/jobs/test_document_jobs.py index a47cb17e6..ca70ca4ac 100644 --- a/extralit-server/tests/unit/jobs/test_document_jobs.py +++ b/extralit-server/tests/unit/jobs/test_document_jobs.py @@ -16,7 +16,7 @@ from unittest.mock import patch, MagicMock from uuid import uuid4 -from 
extralit_server.jobs.document_jobs import upload_reference_documents_job +from extralit_server.jobs.document_jobs import upload_and_preprocess_documents_job from tests.factories import WorkspaceFactory, UserFactory @@ -70,7 +70,7 @@ async def test_upload_reference_documents_job_success(self, mock_imports, mock_d mock_model_dump.return_value = {"file_name": "test.pdf", "pmid": None, "doi": "10.1234/test.doi"} # Execute job - result = await upload_reference_documents_job(reference, document_data, file_data_list, user.id) + result = await upload_and_preprocess_documents_job(reference, document_data, file_data_list, user.id) # Debug: print the actual result print(f"DEBUG: result = {result}") @@ -107,7 +107,7 @@ async def test_upload_reference_documents_job_workspace_not_found(self): # Use non-existent workspace ID - the job will handle the lookup internally # Execute job - result = await upload_reference_documents_job(reference, document_data, file_data_list, user.id) + result = await upload_and_preprocess_documents_job(reference, document_data, file_data_list, user.id) # Verify result assert result["success"] is False @@ -165,7 +165,7 @@ async def test_upload_reference_documents_job_partial_failure(self, mock_imports mock_model_dump.return_value = {"file_name": "test.pdf", "pmid": None, "doi": "10.1234/test.doi"} # Execute job - result = await upload_reference_documents_job(reference, document_data, file_data_list, user.id) + result = await upload_and_preprocess_documents_job(reference, document_data, file_data_list, user.id) # Debug: print the actual result print(f"DEBUG: result = {result}") From fcd8fe2d4710a04de404f2aba60331fa267ba486 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Fri, 8 Aug 2025 22:27:06 -0700 Subject: [PATCH 18/22] Implement PDF text layer detection using OCRmyPDF - Introduced `PDFTextLayerDetector` class to analyze PDF files for existing text layers. 
- Added methods for detecting text layers, checking OCR requirements, and retrieving pages needing OCR. - Refactored existing code to improve clarity and functionality, including the use of dataclasses for structured results. - Enhanced error handling for encrypted and invalid PDF files. - Updated module documentation to reflect new functionality. --- .../contexts/document/analysis.py | 612 ++++++------------ .../contexts/document/margin.py | 458 +++++++++++++ .../contexts/embeddings/__init__.py | 14 + .../extralit_server/contexts/ocr/__init__.py | 14 + .../src/extralit_server/jobs/ocr_jobs.py | 14 + 5 files changed, 712 insertions(+), 400 deletions(-) create mode 100644 extralit-server/src/extralit_server/contexts/document/margin.py create mode 100644 extralit-server/src/extralit_server/contexts/embeddings/__init__.py create mode 100644 extralit-server/src/extralit_server/contexts/ocr/__init__.py create mode 100644 extralit-server/src/extralit_server/jobs/ocr_jobs.py diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index e470a11fd..cc1ecdbf7 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -12,447 +12,259 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging - -from typing import Dict, List, Optional, Tuple, TYPE_CHECKING -import numpy as np +""" +PDF text layer detection using OCRmyPDF internal functions. -import lazy_loader as lazy +This module provides functionality to detect whether a PDF already has an OCR text layer +by leveraging OCRmyPDF's internal PdfInfo and PageInfo classes. 
+""" -cv2 = lazy.load("cv2") -pdf2image = lazy.load("pdf2image") -PIL = lazy.load("PIL") - -if TYPE_CHECKING: - from PIL.Image import Image +import logging +from dataclasses import dataclass +from io import BytesIO +from pathlib import Path +from typing import List, Optional, Union +from concurrent.futures import ThreadPoolExecutor + +try: + from ocrmypdf.pdfinfo.info import PageInfo + from ocrmypdf.exceptions import EncryptedPdfError, InputFileError + from ocrmypdf._pipeline import get_pdfinfo + from ocrmypdf._concurrent import Executor +except ImportError as e: + raise ImportError( + "OCRmyPDF is required for PDF text layer detection. " "Please install it with: pip install ocrmypdf" + ) from e _LOGGER = logging.getLogger(__name__) +DEFAULT_EXECUTOR = ThreadPoolExecutor(max_workers=1) -def pil_to_cv(image: "Image") -> np.ndarray: - """Convert PIL Image to OpenCV format.""" - return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore +@dataclass +class PageTextInfo: + """Information about text content on a specific PDF page.""" -def classify_and_draw_layout_regions( - reference: "Image", mask: "Image", min_area: int = 5000, label: bool = True -) -> Tuple["Image", List[Dict]]: - """ - Classify and optionally draw layout regions using contour detection. 
- - Returns: - Tuple of (annotated image, list of detected regions) - """ + page_number: int + has_text: bool + has_images: bool + has_corrupt_text: bool = False + width_pixels: Optional[int] = None + height_pixels: Optional[int] = None + text_extraction_confidence: Optional[float] = None + needs_ocr: bool = True - mask_np = np.array(mask.convert("L")) - h, w = mask_np.shape - # Clean up the mask using morphological operations - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) # type: ignore - cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel) # type: ignore +@dataclass +class PDFTextAnalysisResult: + """Result of PDF text layer analysis.""" - contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # type: ignore + total_pages: int + has_text_layer: bool + pages_with_text: int + pages_with_images: int + pages_needing_ocr: int + is_encrypted: bool + analysis_error: Optional[str] = None + pages: List[PageTextInfo] = [] - img = reference.copy() if label else reference - regions = [] - if label: - draw = PIL.ImageDraw.Draw(img) # type: ignore - - for cnt in contours: - x, y, rw, rh = cv2.boundingRect(cnt) # type: ignore - area = rw * rh +class PDFTextLayerDetector: + """ + Detector for PDF text layers using OCRmyPDF internal functions. - if area < min_area: - continue + This class uses OCRmyPDF's PdfInfo to analyze PDF pages and determine + which pages already have text content and which would require OCR processing. + """ - cx, cy = x + rw // 2, y + rh // 2 + def __init__(self, executor: Optional["Executor"] = None): + """ + Initialize the PDF text layer detector. 
- # Classify region based on position - if cy < h * 0.25: - region = "header" - elif cy > h * 0.75: - region = "footer" - elif cx < w * 0.15: - region = "left_margin" - elif cx > w * 0.85: - region = "right_margin" - else: - region = "body" - - region_data = { - "type": region, - "x": x, - "y": y, - "width": rw, - "height": rh, - "area": area, - "center_x": cx, - "center_y": cy, - } - regions.append(region_data) - - if label: - draw.rectangle([x, y, x + rw, y + rh], outline="green", width=2) - draw.text((x, y - 10), region, fill="green") - - return img, regions - - -def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: - """Find horizontal bands of similar content across pages.""" - mask_np = np.array(mask.convert("L")) - h, w = mask_np.shape - - row_sums = np.sum(mask_np == 255, axis=1) / w # white = same - same_rows = row_sums >= min_ratio - - bands = [] - start = None - for i, val in enumerate(same_rows): - if val and start is None: - start = i - elif not val and start is not None: - if i - start >= min_height: - bands.append((start, i)) - start = None - if start is not None and h - start >= min_height: - bands.append((start, h)) - - return bands - - -class PDFAnalyzer: - def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict: + Args: + executor: Optional executor for concurrent processing. Defaults to ThreadPoolExecutor. """ - Analyze PDF layout to extract margin and region information. + self.executor = executor or DEFAULT_EXECUTOR + + def detect_text_layer( + self, + pdf_data: Union[bytes, str, Path], + filename: str, + detailed_analysis: bool = True, + check_pages: Optional[range] = None, + ) -> PDFTextAnalysisResult: + """ + Detect if a PDF has an OCR text layer. 
Args: - pdf_data: PDF file data as bytes - filename: Filename for logging + pdf_data: PDF data as bytes, file path string, or Path object + filename: Filename for logging and identification (required) + detailed_analysis: Whether to perform detailed page-by-page analysis + check_pages: Optional range of pages to check (None = check all pages) Returns: - Dictionary containing layout analysis metadata + PDFTextAnalysisResult containing text layer analysis information """ + # Handle different input types + if isinstance(pdf_data, bytes): + # Use BytesIO for bytes input - OCRmyPDF can work with file-like objects + input_file = BytesIO(pdf_data) + else: + # Handle string or Path input + input_path = Path(pdf_data) + if filename is None: + filename = input_path.name + input_file = input_path try: - images = pdf2image.convert_from_bytes(pdf_data, dpi=150) # type: ignore - if not images: - return {"analysis_available": False, "error": "No pages found"} - - _LOGGER.info(f"Analyzing layout for {filename} with {len(images)} pages") + # Use OCRmyPDF's get_pdfinfo function to analyze the PDF + pdf_info = get_pdfinfo( + input_file, + executor=self.executor, # type: ignore + detailed_analysis=detailed_analysis, + progbar=False, + check_pages=check_pages, + ) - # Analyze layout - layout_data = self._analyze_page_layout(images) + # Analyze pages + pages_info = [] + pages_with_text = 0 + pages_with_images = 0 + pages_needing_ocr = 0 + + for page_num, page_info in enumerate(pdf_info.pages): + if page_info is None: + continue + + # Create PageTextInfo from OCRmyPDF's PageInfo + page_text_info = PageTextInfo( + page_number=page_num + 1, # 1-based page numbering + has_text=page_info.has_text, + has_images=bool(page_info.images), + has_corrupt_text=getattr(page_info, "has_corrupt_text", False), + width_pixels=getattr(page_info, "width_pixels", None), + height_pixels=getattr(page_info, "height_pixels", None), + needs_ocr=self._determine_ocr_requirement(page_info), + ) + + 
pages_info.append(page_text_info) + + if page_text_info.has_text: + pages_with_text += 1 + if page_text_info.has_images: + pages_with_images += 1 + if page_text_info.needs_ocr: + pages_needing_ocr += 1 + + # Determine overall text layer status + has_text_layer = pages_with_text > 0 + + result = PDFTextAnalysisResult( + total_pages=len(pdf_info.pages), + has_text_layer=has_text_layer, + pages_with_text=pages_with_text, + pages_with_images=pages_with_images, + pages_needing_ocr=pages_needing_ocr, + is_encrypted=False, + pages=pages_info, + ) - return { - "analysis_available": True, - "total_pages": len(images), - "page_dimensions": {"width": images[0].size[0], "height": images[0].size[1]} if images else {}, - **layout_data, - } + _LOGGER.info( + f"PDF text analysis for {filename}: " + f"{pages_with_text}/{len(pdf_info.pages)} pages have text, " + f"{pages_needing_ocr} pages need OCR" + ) - except Exception as e: - _LOGGER.error(f"PDF layout analysis failed for {filename}: {e}") - return {"analysis_available": False, "error": str(e)} + return result + + except EncryptedPdfError: + _LOGGER.warning(f"PDF {filename} is encrypted") + return PDFTextAnalysisResult( + total_pages=0, + has_text_layer=False, + pages_with_text=0, + pages_with_images=0, + pages_needing_ocr=0, + is_encrypted=True, + analysis_error="PDF is encrypted", + ) - def _analyze_page_layout(self, images: List["Image"]) -> Dict: - """ - Analyze page layout by comparing pages to find common regions. 
- """ - if len(images) < 2: - return self._analyze_single_page(images[0]) if images else {} - - # Use first page as reference, compare with others - reference_img = images[0].convert("RGB") - margin_data = [] - - for i in range(1, min(len(images), 5)): # Analyze up to 5 pages for efficiency - compare_img = images[i].convert("RGB") - page_margins = self._compare_pages_for_margins(reference_img, compare_img) - if page_margins: - margin_data.append(page_margins) - - # Aggregate margin data - if margin_data: - return self._aggregate_margin_data(margin_data, reference_img.size) - else: - return self._analyze_single_page(reference_img) + except InputFileError as e: + _LOGGER.error(f"Invalid PDF file {filename}: {e}") + return PDFTextAnalysisResult( + total_pages=0, + has_text_layer=False, + pages_with_text=0, + pages_with_images=0, + pages_needing_ocr=0, + is_encrypted=False, + analysis_error=f"Invalid PDF file: {e}", + ) - def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Optional[Dict]: - """ - Compare two pages to identify common regions using advanced CV2 techniques. 
- """ - try: - # Ensure same size - if reference.size != compare.size: - _LOGGER.debug(f"Resizing page to match reference size") - compare = compare.resize(reference.size) - - # Step 1: Compute difference and invert so white = same - diff = PIL.ImageChops.difference(reference, compare) # type: ignore - sameness_mask = PIL.ImageChops.invert(diff.convert("L")) # type: ignore - - # Step 2: Threshold the mask (keep high-sameness pixels) - # Create a lookup table for thresholding - threshold = 30 - lut = [255 if i > threshold else 0 for i in range(256)] - sameness_mask.point(lut).convert("1") - - # Step 3: Find horizontal bands (potential headers/footers) - horizontal_bands = find_horizontal_bands(sameness_mask) - - # Step 4: Use contour-based region classification - annotated_img, detected_regions = classify_and_draw_layout_regions( - reference, sameness_mask, min_area=5000, label=False + except Exception as e: + _LOGGER.error(f"PDF text analysis failed for {filename}: {e}") + return PDFTextAnalysisResult( + total_pages=0, + has_text_layer=False, + pages_with_text=0, + pages_with_images=0, + pages_needing_ocr=0, + is_encrypted=False, + analysis_error=str(e), ) - # Step 5: Classify and aggregate results - regions = self._classify_regions_advanced(horizontal_bands, detected_regions, reference.size) + def _determine_ocr_requirement(self, page_info: PageInfo) -> bool: + """ + Determine if a page requires OCR processing based on OCRmyPDF logic. - return regions + This mirrors the logic from OCRmyPDF's is_ocr_required function but + simplified for detection purposes. - except Exception as e: - _LOGGER.debug(f"Page comparison failed: {e}") - return None + Args: + page_info: PageInfo object from OCRmyPDF - def _classify_regions_advanced( - self, bands: List[Tuple[int, int]], detected_regions: List[Dict], page_size: Tuple[int, int] - ) -> Dict: - """ - Advanced region classification combining horizontal bands and contour detection. 
- """ - width, height = page_size - regions = { - "header_bands": [], - "footer_bands": [], - "detected_regions": detected_regions, - "estimated_margins": {}, - } - - # Process horizontal bands - for start_y, end_y in bands: - band_center = (start_y + end_y) / 2 - band_height = end_y - start_y - - # Classify based on position - if band_center < height * 0.25: # Top 25% - regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) - elif band_center > height * 0.75: # Bottom 25% - regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) - - # Estimate margins using both techniques - regions["estimated_margins"] = self._estimate_margins_advanced(regions, detected_regions, page_size) - - return regions - - def _estimate_margins_advanced( - self, regions: Dict, detected_regions: List[Dict], page_size: Tuple[int, int] - ) -> Dict: - """ - Advanced margin estimation using both band and contour information. - """ - width, height = page_size - margins = { - "top": 0, - "bottom": 0, - "left": 50, # Default estimates - "right": 50, - } - - # Calculate top margin from header regions - header_sources = [] - if regions["header_bands"]: - header_sources.append(max(band["end_y"] for band in regions["header_bands"])) - - # Add header regions from contour detection - header_regions = [r for r in detected_regions if r["type"] == "header"] - if header_regions: - header_sources.append(max(r["y"] + r["height"] for r in header_regions)) - - if header_sources: - margins["top"] = max(header_sources) - - # Calculate bottom margin from footer regions - footer_sources = [] - if regions["footer_bands"]: - footer_sources.append(min(band["start_y"] for band in regions["footer_bands"])) - - # Add footer regions from contour detection - footer_regions = [r for r in detected_regions if r["type"] == "footer"] - if footer_regions: - footer_sources.append(min(r["y"] for r in footer_regions)) - - if footer_sources: - margins["bottom"] 
= height - min(footer_sources) - - # Calculate left/right margins from contour detection - left_regions = [r for r in detected_regions if r["type"] == "left_margin"] - if left_regions: - margins["left"] = max(r["x"] + r["width"] for r in left_regions) - - right_regions = [r for r in detected_regions if r["type"] == "right_margin"] - if right_regions: - margins["right"] = width - min(r["x"] for r in right_regions) - - # Convert to relative percentages for consistency - return { - "top_px": margins["top"], - "bottom_px": margins["bottom"], - "left_px": margins["left"], - "right_px": margins["right"], - "top_percent": (margins["top"] / height) * 100 if height > 0 else 0, - "bottom_percent": (margins["bottom"] / height) * 100 if height > 0 else 0, - "left_percent": (margins["left"] / width) * 100 if width > 0 else 0, - "right_percent": (margins["right"] / width) * 100 if width > 0 else 0, - } - - def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict: - """ - Classify horizontal bands into headers, footers, and margins. 
+ Returns: + True if the page needs OCR, False otherwise """ - width, height = page_size - regions = {"header_bands": [], "footer_bands": [], "estimated_margins": {}} + # If page has text, it typically doesn't need OCR (unless forcing) + if page_info.has_text: + return False - for start_y, end_y in bands: - band_center = (start_y + end_y) / 2 - band_height = end_y - start_y + # If page has images, it likely needs OCR + if page_info.images: + return True - # Classify based on position - if band_center < height * 0.25: # Top 25% - regions["header_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) - elif band_center > height * 0.75: # Bottom 25% - regions["footer_bands"].append({"start_y": start_y, "end_y": end_y, "height": band_height}) + # If page has no text and no images, it might be vector art + # For detection purposes, we'll assume it doesn't need OCR + return False - # Estimate margins based on bands - regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size) + def has_text_layer(self, pdf_data: Union[bytes, str, Path], filename: str) -> bool: + """ + Simple boolean check if PDF has any text layer. - return regions + Args: + pdf_data: PDF data as bytes, file path string, or Path object + filename: Filename for logging (required) - def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict: - """ - Estimate page margins based on detected bands. 
- """ - width, height = page_size - margins = { - "top": 0, - "bottom": 0, - "left": 50, # Default estimates - "right": 50, - } - - # Calculate top margin from header bands - if regions["header_bands"]: - max_header_end = max(band["end_y"] for band in regions["header_bands"]) - margins["top"] = max_header_end - - # Calculate bottom margin from footer bands - if regions["footer_bands"]: - min_footer_start = min(band["start_y"] for band in regions["footer_bands"]) - margins["bottom"] = height - min_footer_start - - # Convert to relative percentages for consistency - return { - "top_px": margins["top"], - "bottom_px": margins["bottom"], - "left_px": margins["left"], - "right_px": margins["right"], - "top_percent": (margins["top"] / height) * 100, - "bottom_percent": (margins["bottom"] / height) * 100, - "left_percent": (margins["left"] / width) * 100, - "right_percent": (margins["right"] / width) * 100, - } - - def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict: - """ - Aggregate margin data from multiple page comparisons. 
- """ - # Average the margin estimates - all_margins = [data.get("estimated_margins", {}) for data in margin_data if data.get("estimated_margins")] - - if not all_margins: - return self._analyze_single_page_size(page_size) - - # Calculate average margins - avg_margins = {} - for key in [ - "top_px", - "bottom_px", - "left_px", - "right_px", - "top_percent", - "bottom_percent", - "left_percent", - "right_percent", - ]: - values = [m.get(key, 0) for m in all_margins if key in m] - avg_margins[key] = sum(values) / len(values) if values else 0 - - # Collect all bands and regions - all_header_bands = [] - all_footer_bands = [] - all_detected_regions = [] - - for data in margin_data: - all_header_bands.extend(data.get("header_bands", [])) - all_footer_bands.extend(data.get("footer_bands", [])) - all_detected_regions.extend(data.get("detected_regions", [])) - - # Aggregate detected regions by type - region_stats = {} - for region in all_detected_regions: - region_type = region["type"] - if region_type not in region_stats: - region_stats[region_type] = [] - region_stats[region_type].append(region) - - return { - "layout_analysis": { - "header_bands": all_header_bands, - "footer_bands": all_footer_bands, - "detected_regions": all_detected_regions, - "region_statistics": { - region_type: { - "count": len(regions), - "avg_area": sum(r["area"] for r in regions) / len(regions) if regions else 0, - "total_area": sum(r["area"] for r in regions), - } - for region_type, regions in region_stats.items() - }, - "estimated_margins": avg_margins, - "analysis_method": "multi_page_comparison_advanced", - } - } - - def _analyze_single_page(self, image: "Image") -> Dict: - """ - Analyze a single page when comparison isn't possible. 
+ Returns: + True if PDF has any text content, False otherwise """ - return self._analyze_single_page_size(image.size) + result = self.detect_text_layer(pdf_data, filename, detailed_analysis=False) + return result.has_text_layer and result.analysis_error is None - def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict: + def get_pages_needing_ocr(self, pdf_data: Union[bytes, str, Path], filename: str) -> List[int]: """ - Provide default margin estimates for single page analysis. + Get list of page numbers that need OCR processing. + + Args: + pdf_data: PDF data as bytes, file path string, or Path object + filename: Filename for logging (required) + + Returns: + List of 1-based page numbers that need OCR """ - width, height = page_size - - # Use common academic paper margins as defaults - default_margins = { - "top_px": int(height * 0.1), # 10% top margin - "bottom_px": int(height * 0.1), # 10% bottom margin - "left_px": int(width * 0.1), # 10% left margin - "right_px": int(width * 0.1), # 10% right margin - "top_percent": 10.0, - "bottom_percent": 10.0, - "left_percent": 10.0, - "right_percent": 10.0, - } - - return { - "layout_analysis": { - "header_bands": [], - "footer_bands": [], - "estimated_margins": default_margins, - "analysis_method": "default_estimates", - } - } + result = self.detect_text_layer(pdf_data, filename, detailed_analysis=True) + if result.analysis_error: + return [] + + return [page.page_number for page in result.pages if page.needs_ocr] diff --git a/extralit-server/src/extralit_server/contexts/document/margin.py b/extralit-server/src/extralit_server/contexts/document/margin.py new file mode 100644 index 000000000..e470a11fd --- /dev/null +++ b/extralit-server/src/extralit_server/contexts/document/margin.py @@ -0,0 +1,458 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +from typing import Dict, List, Optional, Tuple, TYPE_CHECKING +import numpy as np + +import lazy_loader as lazy + +cv2 = lazy.load("cv2") +pdf2image = lazy.load("pdf2image") +PIL = lazy.load("PIL") + +if TYPE_CHECKING: + from PIL.Image import Image + +_LOGGER = logging.getLogger(__name__) + + +def pil_to_cv(image: "Image") -> np.ndarray: + """Convert PIL Image to OpenCV format.""" + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore + + +def classify_and_draw_layout_regions( + reference: "Image", mask: "Image", min_area: int = 5000, label: bool = True +) -> Tuple["Image", List[Dict]]: + """ + Classify and optionally draw layout regions using contour detection. 
+ + Returns: + Tuple of (annotated image, list of detected regions) + """ + + mask_np = np.array(mask.convert("L")) + h, w = mask_np.shape + + # Clean up the mask using morphological operations + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) # type: ignore + cleaned = cv2.morphologyEx(mask_np, cv2.MORPH_CLOSE, kernel) # type: ignore + + contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # type: ignore + + img = reference.copy() if label else reference + regions = [] + + if label: + draw = PIL.ImageDraw.Draw(img) # type: ignore + + for cnt in contours: + x, y, rw, rh = cv2.boundingRect(cnt) # type: ignore + area = rw * rh + + if area < min_area: + continue + + cx, cy = x + rw // 2, y + rh // 2 + + # Classify region based on position + if cy < h * 0.25: + region = "header" + elif cy > h * 0.75: + region = "footer" + elif cx < w * 0.15: + region = "left_margin" + elif cx > w * 0.85: + region = "right_margin" + else: + region = "body" + + region_data = { + "type": region, + "x": x, + "y": y, + "width": rw, + "height": rh, + "area": area, + "center_x": cx, + "center_y": cy, + } + regions.append(region_data) + + if label: + draw.rectangle([x, y, x + rw, y + rh], outline="green", width=2) + draw.text((x, y - 10), region, fill="green") + + return img, regions + + +def find_horizontal_bands(mask: "Image", min_height: int = 15, min_ratio: float = 0.95) -> List[Tuple[int, int]]: + """Find horizontal bands of similar content across pages.""" + mask_np = np.array(mask.convert("L")) + h, w = mask_np.shape + + row_sums = np.sum(mask_np == 255, axis=1) / w # white = same + same_rows = row_sums >= min_ratio + + bands = [] + start = None + for i, val in enumerate(same_rows): + if val and start is None: + start = i + elif not val and start is not None: + if i - start >= min_height: + bands.append((start, i)) + start = None + if start is not None and h - start >= min_height: + bands.append((start, h)) + + return bands + + +class 
class PDFAnalyzer:
    """Estimate page margins and recurring header/footer regions of a PDF.

    The PDF is rasterised with pdf2image and the first page is compared with the
    following pages. Pixels that stay the same between pages are used to locate
    horizontal bands and contour regions, from which margins are estimated.
    When no comparison is possible, fixed 10% margins are reported instead.
    """

    # The first page is the reference; at most this many pages (reference
    # included) are looked at, to bound the cost on long documents.
    _MAX_PAGES_TO_COMPARE = 5

    # Keys of a per-comparison margin dict that are averaged across comparisons.
    _MARGIN_KEYS = (
        "top_px",
        "bottom_px",
        "left_px",
        "right_px",
        "top_percent",
        "bottom_percent",
        "left_percent",
        "right_percent",
    )

    def analyze_pdf_layout(self, pdf_data: bytes, filename: str) -> Dict:
        """
        Analyze PDF layout to extract margin and region information.

        Args:
            pdf_data: PDF file data as bytes
            filename: Filename, used only in log messages

        Returns:
            Dictionary containing layout analysis metadata. On success it holds
            ``analysis_available=True``, ``total_pages``, ``page_dimensions`` and
            the keys produced by the page-layout analysis. On failure (no pages,
            or any exception) it holds ``analysis_available=False`` and ``error``.
        """
        try:
            images = pdf2image.convert_from_bytes(pdf_data, dpi=150)  # type: ignore
            if not images:
                return {"analysis_available": False, "error": "No pages found"}

            _LOGGER.info(f"Analyzing layout for {filename} with {len(images)} pages")

            layout_data = self._analyze_page_layout(images)
            page_width, page_height = images[0].size

            return {
                "analysis_available": True,
                "total_pages": len(images),
                "page_dimensions": {"width": page_width, "height": page_height},
                **layout_data,
            }

        except Exception as e:
            # Best-effort boundary: layout analysis must never abort the caller,
            # so every failure is logged and reported in the result instead.
            _LOGGER.error(f"PDF layout analysis failed for {filename}: {e}")
            return {"analysis_available": False, "error": str(e)}

    def _analyze_page_layout(self, images: List["Image"]) -> Dict:
        """
        Analyze page layout by comparing pages to find common regions.

        Returns ``{}`` for an empty list, default estimates for a single page or
        when every comparison fails, and aggregated results otherwise.
        """
        if len(images) < 2:
            return self._analyze_single_page(images[0]) if images else {}

        # Use first page as reference, compare with the pages that follow it
        reference_img = images[0].convert("RGB")
        margin_data = []

        for page in images[1 : self._MAX_PAGES_TO_COMPARE]:
            page_margins = self._compare_pages_for_margins(reference_img, page.convert("RGB"))
            if page_margins:
                margin_data.append(page_margins)

        if margin_data:
            return self._aggregate_margin_data(margin_data, reference_img.size)
        return self._analyze_single_page(reference_img)

    def _compare_pages_for_margins(self, reference: "Image", compare: "Image") -> Optional[Dict]:
        """
        Compare two pages to identify regions that are common to both.

        Returns:
            The region dictionary built by ``_classify_regions_advanced``, or
            ``None`` when the comparison raised an exception.
        """
        try:
            # Ensure same size
            if reference.size != compare.size:
                _LOGGER.debug("Resizing page to match reference size")
                compare = compare.resize(reference.size)

            # Step 1: Compute difference and invert so white = same
            diff = PIL.ImageChops.difference(reference, compare)  # type: ignore
            sameness_mask = PIL.ImageChops.invert(diff.convert("L"))  # type: ignore

            # Step 2: Threshold the mask through a lookup table.
            # Image.point returns a new image, so the result has to be kept:
            # both consumers below rely on a strictly 0/255 mask (the band finder
            # counts pixels equal to 255, the contour finder treats any non-zero
            # pixel as foreground).
            threshold = 30
            lut = [255 if i > threshold else 0 for i in range(256)]
            sameness_mask = sameness_mask.point(lut)

            # Step 3: Find horizontal bands (potential headers/footers)
            horizontal_bands = find_horizontal_bands(sameness_mask)

            # Step 4: Use contour-based region classification (no drawing needed)
            _, detected_regions = classify_and_draw_layout_regions(
                reference, sameness_mask, min_area=5000, label=False
            )

            # Step 5: Classify and aggregate results
            return self._classify_regions_advanced(horizontal_bands, detected_regions, reference.size)

        except Exception as e:
            _LOGGER.debug(f"Page comparison failed: {e}")
            return None

    @staticmethod
    def _split_bands(bands: List[Tuple[int, int]], height: int) -> Tuple[List[Dict], List[Dict]]:
        """Split bands into header (centre in top 25%) and footer (centre in bottom 25%) lists."""
        header_bands: List[Dict] = []
        footer_bands: List[Dict] = []

        for start_y, end_y in bands:
            band = {"start_y": start_y, "end_y": end_y, "height": end_y - start_y}
            band_center = (start_y + end_y) / 2

            if band_center < height * 0.25:  # Top 25%
                header_bands.append(band)
            elif band_center > height * 0.75:  # Bottom 25%
                footer_bands.append(band)
            # Bands centred in the middle half of the page are ignored.

        return header_bands, footer_bands

    @staticmethod
    def _margin_report(margins: Dict, page_size: Tuple[int, int]) -> Dict:
        """Return margins in pixels plus percentages of the page size (0 for a zero-sized page)."""
        width, height = page_size

        def percent(value, total):
            return (value / total) * 100 if total > 0 else 0

        return {
            "top_px": margins["top"],
            "bottom_px": margins["bottom"],
            "left_px": margins["left"],
            "right_px": margins["right"],
            "top_percent": percent(margins["top"], height),
            "bottom_percent": percent(margins["bottom"], height),
            "left_percent": percent(margins["left"], width),
            "right_percent": percent(margins["right"], width),
        }

    def _classify_regions_advanced(
        self, bands: List[Tuple[int, int]], detected_regions: List[Dict], page_size: Tuple[int, int]
    ) -> Dict:
        """
        Region classification combining horizontal bands and contour detection.

        Returns a dict with ``header_bands``, ``footer_bands``, the unchanged
        ``detected_regions`` and ``estimated_margins``.
        """
        _, height = page_size
        header_bands, footer_bands = self._split_bands(bands, height)

        regions = {
            "header_bands": header_bands,
            "footer_bands": footer_bands,
            "detected_regions": detected_regions,
            "estimated_margins": {},
        }

        # Estimate margins using both techniques
        regions["estimated_margins"] = self._estimate_margins_advanced(regions, detected_regions, page_size)

        return regions

    def _estimate_margins_advanced(
        self, regions: Dict, detected_regions: List[Dict], page_size: Tuple[int, int]
    ) -> Dict:
        """
        Margin estimation using both band and contour information.

        Top/bottom default to 0 and left/right to 50 px when nothing is detected.
        """
        width, height = page_size
        margins = {
            "top": 0,
            "bottom": 0,
            "left": 50,  # Default estimates
            "right": 50,
        }

        def of_type(region_type):
            return [r for r in detected_regions if r["type"] == region_type]

        # Top margin: lowest bottom edge among header bands and header contours
        header_sources = []
        if regions["header_bands"]:
            header_sources.append(max(band["end_y"] for band in regions["header_bands"]))
        header_regions = of_type("header")
        if header_regions:
            header_sources.append(max(r["y"] + r["height"] for r in header_regions))
        if header_sources:
            margins["top"] = max(header_sources)

        # Bottom margin: distance from the highest footer top edge to the page bottom
        footer_sources = []
        if regions["footer_bands"]:
            footer_sources.append(min(band["start_y"] for band in regions["footer_bands"]))
        footer_regions = of_type("footer")
        if footer_regions:
            footer_sources.append(min(r["y"] for r in footer_regions))
        if footer_sources:
            margins["bottom"] = height - min(footer_sources)

        # Left/right margins come from contour detection only
        left_regions = of_type("left_margin")
        if left_regions:
            margins["left"] = max(r["x"] + r["width"] for r in left_regions)

        right_regions = of_type("right_margin")
        if right_regions:
            margins["right"] = width - min(r["x"] for r in right_regions)

        return self._margin_report(margins, page_size)

    def _classify_regions(self, bands: List[Tuple[int, int]], page_size: Tuple[int, int]) -> Dict:
        """
        Classify horizontal bands into headers and footers and estimate margins from them.
        """
        _, height = page_size
        header_bands, footer_bands = self._split_bands(bands, height)
        regions = {"header_bands": header_bands, "footer_bands": footer_bands, "estimated_margins": {}}

        # Estimate margins based on bands
        regions["estimated_margins"] = self._estimate_margins_from_bands(regions, page_size)

        return regions

    def _estimate_margins_from_bands(self, regions: Dict, page_size: Tuple[int, int]) -> Dict:
        """
        Estimate page margins based on detected bands only.

        Percentages are reported as 0 for a zero-sized page instead of raising
        ``ZeroDivisionError``, matching ``_estimate_margins_advanced``.
        """
        _, height = page_size
        margins = {
            "top": 0,
            "bottom": 0,
            "left": 50,  # Default estimates
            "right": 50,
        }

        if regions["header_bands"]:
            margins["top"] = max(band["end_y"] for band in regions["header_bands"])

        if regions["footer_bands"]:
            margins["bottom"] = height - min(band["start_y"] for band in regions["footer_bands"])

        return self._margin_report(margins, page_size)

    def _aggregate_margin_data(self, margin_data: List[Dict], page_size: Tuple[int, int]) -> Dict:
        """
        Aggregate margin data from multiple page comparisons.

        Margins are averaged key by key; bands and detected regions are
        concatenated; per-type region counts and areas are summarised. Falls back
        to the default estimates when no comparison carries margin data.
        """
        all_margins = [data["estimated_margins"] for data in margin_data if data.get("estimated_margins")]

        if not all_margins:
            return self._analyze_single_page_size(page_size)

        # Average each margin figure over the comparisons that report it
        avg_margins = {}
        for key in self._MARGIN_KEYS:
            values = [m[key] for m in all_margins if key in m]
            avg_margins[key] = sum(values) / len(values) if values else 0

        # Collect all bands and regions
        all_header_bands: List[Dict] = []
        all_footer_bands: List[Dict] = []
        all_detected_regions: List[Dict] = []

        for data in margin_data:
            all_header_bands.extend(data.get("header_bands", []))
            all_footer_bands.extend(data.get("footer_bands", []))
            all_detected_regions.extend(data.get("detected_regions", []))

        # Group detected regions by type
        regions_by_type: Dict = {}
        for region in all_detected_regions:
            regions_by_type.setdefault(region["type"], []).append(region)

        region_statistics = {}
        for region_type, members in regions_by_type.items():
            total_area = sum(r["area"] for r in members)
            region_statistics[region_type] = {
                "count": len(members),
                "avg_area": total_area / len(members),
                "total_area": total_area,
            }

        return {
            "layout_analysis": {
                "header_bands": all_header_bands,
                "footer_bands": all_footer_bands,
                "detected_regions": all_detected_regions,
                "region_statistics": region_statistics,
                "estimated_margins": avg_margins,
                "analysis_method": "multi_page_comparison_advanced",
            }
        }

    def _analyze_single_page(self, image: "Image") -> Dict:
        """
        Analyze a single page when comparison isn't possible.
        """
        return self._analyze_single_page_size(image.size)

    def _analyze_single_page_size(self, page_size: Tuple[int, int]) -> Dict:
        """
        Provide default margin estimates (10% of the page on every side).
        """
        width, height = page_size

        default_margins = {
            "top_px": int(height * 0.1),  # 10% top margin
            "bottom_px": int(height * 0.1),  # 10% bottom margin
            "left_px": int(width * 0.1),  # 10% left margin
            "right_px": int(width * 0.1),  # 10% right margin
            "top_percent": 10.0,
            "bottom_percent": 10.0,
            "left_percent": 10.0,
            "right_percent": 10.0,
        }

        return {
            "layout_analysis": {
                "header_bands": [],
                "footer_bands": [],
                "estimated_margins": default_margins,
                "analysis_method": "default_estimates",
            }
        }
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/extralit-server/src/extralit_server/jobs/ocr_jobs.py b/extralit-server/src/extralit_server/jobs/ocr_jobs.py new file mode 100644 index 000000000..fb5dffc96 --- /dev/null +++ b/extralit-server/src/extralit_server/jobs/ocr_jobs.py @@ -0,0 +1,14 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + From 960a4009ec50fd36cec1241e774ea40ba0f498c7 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Sat, 9 Aug 2025 16:34:40 -0700 Subject: [PATCH 19/22] fix opengl issues and import errors - Updated `PDFTextAnalysisResult` to use `field(default_factory=list)` for better default list handling. - Enhanced OpenCV loading in `margin.py` to set CPU-only mode and added error handling for loading failures. - Adjusted imports in `preprocessing.py` to correctly reference `PDFAnalyzer` from the margin module. 
--- .../extralit_server/contexts/document/analysis.py | 4 ++-- .../extralit_server/contexts/document/margin.py | 14 +++++++++++++- .../contexts/document/preprocessing.py | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index cc1ecdbf7..fa43e85dc 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -20,7 +20,7 @@ """ import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from io import BytesIO from pathlib import Path from typing import List, Optional, Union @@ -66,7 +66,7 @@ class PDFTextAnalysisResult: pages_needing_ocr: int is_encrypted: bool analysis_error: Optional[str] = None - pages: List[PageTextInfo] = [] + pages: List[PageTextInfo] = field(default_factory=list) class PDFTextLayerDetector: diff --git a/extralit-server/src/extralit_server/contexts/document/margin.py b/extralit-server/src/extralit_server/contexts/document/margin.py index e470a11fd..dbcfb87a9 100644 --- a/extralit-server/src/extralit_server/contexts/document/margin.py +++ b/extralit-server/src/extralit_server/contexts/document/margin.py @@ -13,13 +13,25 @@ # limitations under the License. 
import logging +import os from typing import Dict, List, Optional, Tuple, TYPE_CHECKING import numpy as np import lazy_loader as lazy -cv2 = lazy.load("cv2") +os.environ["OPENCV_VIDEOIO_PRIORITY_MSMF"] = "0" +os.environ["OPENCV_VIDEOIO_PRIORITY_INTEL_MFX"] = "0" + +try: + cv2 = lazy.load("cv2") + # Set OpenCV to use CPU-only mode to avoid OpenGL issues + cv2.setUseOptimized(False) # type: ignore + cv2.setNumThreads(1) # type: ignore +except Exception as e: + _LOGGER = logging.getLogger(__name__) + _LOGGER.warning(f"OpenCV not available or failed to load: {e}") + pdf2image = lazy.load("pdf2image") PIL = lazy.load("PIL") diff --git a/extralit-server/src/extralit_server/contexts/document/preprocessing.py b/extralit-server/src/extralit_server/contexts/document/preprocessing.py index b2a22f02b..b467eed3e 100644 --- a/extralit-server/src/extralit_server/contexts/document/preprocessing.py +++ b/extralit-server/src/extralit_server/contexts/document/preprocessing.py @@ -26,8 +26,8 @@ import lazy_loader as lazy from pydantic import Field from pydantic_settings import BaseSettings +from extralit_server.contexts.document.margin import PDFAnalyzer from extralit_server.api.schemas.v1.document.preprocessing import PDFMetadata -from extralit_server.contexts.document.analysis import PDFAnalyzer ocrmypdf = lazy.load("ocrmypdf") From a8d9ef05e9d9d9e85c2e8e5c5101247a4af8710f Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Sun, 10 Aug 2025 10:09:26 -0700 Subject: [PATCH 20/22] chore: EXTRALIT_DATABASE_URL to use a relative path in .env.dev --- extralit-server/.env.dev | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extralit-server/.env.dev b/extralit-server/.env.dev index afbd12a3e..fd993b7d0 100644 --- a/extralit-server/.env.dev +++ b/extralit-server/.env.dev @@ -1,7 +1,7 @@ OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS ALEMBIC_CONFIG=src/extralit_server/alembic.ini 
EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded -EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False +EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False HF_HUB_DISABLE_TELEMETRY=1 # S3 Configuration (skipped to use LocalFileStorage) @@ -24,10 +24,9 @@ EXTRALIT_REDIS_URL=redis://localhost:6379/0 # PDF Preprocessing PREPROCESSING_ENABLED=true PREPROCESSING_ENABLE_ANALYSIS=true -PREPROCESSING_LANGUAGE='["eng"]' PREPROCESSING_ROTATE_PAGES=true PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 PREPROCESSING_CLEAN=false PREPROCESSING_SKIP_TEXT=true -# PREPROCESSING_TESSERACT_TIMEOUT=0 # Uncomment to disable Tesseract OCR +PREPROCESSING_TESSERACT_TIMEOUT=0 PREPROCESSING_QUIET=false From 290b51917debc755957302b708f8e7fbdd335ed5 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Sun, 10 Aug 2025 12:27:57 -0700 Subject: [PATCH 21/22] Refactor PDF text layer detection and analysis - Replaced the `PDFTextLayerDetector` class with `PDFOCRLayerDetector` to streamline OCR text layer detection using `pdfminer`. - Introduced methods for checking font resources and analyzing character quality in PDFs. - Removed unused `figures.py` and `tables.py` files to clean up the codebase. - Enhanced error handling and logging for better debugging and user feedback. 
--- .../contexts/document/analysis.py | 380 +++++++----------- .../contexts/{document => ocr}/figures.py | 0 .../contexts/{document => ocr}/tables.py | 0 .../src/extralit_server/contexts/ocr/text.py | 14 + 4 files changed, 162 insertions(+), 232 deletions(-) rename extralit-server/src/extralit_server/contexts/{document => ocr}/figures.py (100%) rename extralit-server/src/extralit_server/contexts/{document => ocr}/tables.py (100%) create mode 100644 extralit-server/src/extralit_server/contexts/ocr/text.py diff --git a/extralit-server/src/extralit_server/contexts/document/analysis.py b/extralit-server/src/extralit_server/contexts/document/analysis.py index fa43e85dc..5b3008db8 100644 --- a/extralit-server/src/extralit_server/contexts/document/analysis.py +++ b/extralit-server/src/extralit_server/contexts/document/analysis.py @@ -12,259 +12,175 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -PDF text layer detection using OCRmyPDF internal functions. +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import LAParams, LTTextBox, LTChar +from typing import Dict, List +from io import BytesIO -This module provides functionality to detect whether a PDF already has an OCR text layer -by leveraging OCRmyPDF's internal PdfInfo and PageInfo classes. -""" -import logging -from dataclasses import dataclass, field -from io import BytesIO -from pathlib import Path -from typing import List, Optional, Union -from concurrent.futures import ThreadPoolExecutor - -try: - from ocrmypdf.pdfinfo.info import PageInfo - from ocrmypdf.exceptions import EncryptedPdfError, InputFileError - from ocrmypdf._pipeline import get_pdfinfo - from ocrmypdf._concurrent import Executor -except ImportError as e: - raise ImportError( - "OCRmyPDF is required for PDF text layer detection. 
class PDFOCRLayerDetector:
    """Heuristic checks for a searchable text layer in a PDF, built on pdfminer.

    Two independent checks are offered: a per-page scan for font resources
    (``has_ocr_text_layer``) and a per-character scan that scores how clean the
    extracted text looks (``analyze_character_quality``).
    """

    def __init__(self):
        # One resource manager / layout device / interpreter is created up front
        # and reused for every page processed by analyze_character_quality.
        self.resource_manager = PDFResourceManager()
        self.laparams = LAParams()
        self.device = PDFPageAggregator(self.resource_manager, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(self.resource_manager, self.device)

    def has_ocr_text_layer(self, pdf_bytes: bytes, threshold: float = 0.5, verbose=False) -> bool:
        """
        Detect a text layer by checking each page for font resources.

        Args:
            pdf_bytes: PDF file content as bytes
            threshold: Fraction of pages that must be exceeded by the pages
                declaring font resources (default 0.5, i.e. more than half)
            verbose: When true, print the page count and the per-page details

        Returns:
            bool: True if strictly more than ``threshold`` of the pages declare
            font resources; False otherwise, including for a PDF with no pages.
        """
        page_info = self._check_font_resources_per_page(pdf_bytes)

        if not page_info:
            return False

        if verbose:
            print(f"Total pages: {len(page_info)}")
            print(page_info)

        pages_with_fonts = sum(1 for page in page_info if page.get("has_fonts", False))
        return pages_with_fonts > (len(page_info) * threshold)

    def _check_font_resources_per_page(self, pdf_bytes: bytes) -> List[Dict]:
        """
        Describe the resources declared by each page.

        Returns one dict per page with ``page_number`` (1-based), ``has_fonts``,
        ``font_count``, ``has_images`` and ``resource_types``.
        """
        page_info = []

        pdf_stream = BytesIO(pdf_bytes)
        for page_number, page in enumerate(PDFPage.get_pages(pdf_stream), start=1):
            page_data = {
                "page_number": page_number,
                "has_fonts": False,
                "font_count": 0,
                "has_images": False,
                "resource_types": [],
            }

            resources = getattr(page, "resources", None)
            if resources:
                if "Font" in resources:
                    page_data["has_fonts"] = True
                    try:
                        page_data["font_count"] = len(resources["Font"])  # type: ignore
                    except (TypeError, AttributeError):
                        # The font entry has no length; count it as one font.
                        page_data["font_count"] = 1

                # Flagged whenever any XObject resource is declared on the page.
                if "XObject" in resources:
                    page_data["has_images"] = True

                page_data["resource_types"] = list(resources.keys())

            page_info.append(page_data)

        return page_info

    def analyze_character_quality(self, pdf_bytes: bytes) -> Dict:
        """
        Scan every laid-out character and summarise how trustworthy the text looks.

        Returns a dict with ``total_chars``, ``font_variations`` (set of font
        names seen), ``suspicious_patterns`` and ``ocr_artifacts`` (counts),
        ``avg_char_size``, ``size_variations`` and ``ocr_quality_score`` (0..1).
        """
        char_stats = {
            "total_chars": 0,
            "font_variations": set(),
            "suspicious_patterns": 0,
            "ocr_artifacts": 0,
            "avg_char_size": 0,
            "size_variations": [],
        }
        size_total = 0.0
        distinct_sizes = set()

        pdf_stream = BytesIO(pdf_bytes)
        for page in PDFPage.get_pages(pdf_stream):
            self.interpreter.process_page(page)
            layout = self.device.get_result()

            for element in layout:
                if not isinstance(element, LTTextBox):
                    continue
                for line in element:
                    for char in line:
                        if not isinstance(char, LTChar):
                            continue

                        char_stats["total_chars"] += 1
                        char_stats["font_variations"].add(char.fontname)
                        size_total += char.size
                        distinct_sizes.add(round(char.size, 2))

                        if self._is_ocr_artifact(char):
                            char_stats["ocr_artifacts"] += 1

                        if self._is_suspicious_char(char):
                            char_stats["suspicious_patterns"] += 1

        if char_stats["total_chars"]:
            char_stats["avg_char_size"] = size_total / char_stats["total_chars"]
        # NOTE(review): "size_variations" is filled with the distinct glyph sizes
        # (rounded to 2 decimals, ascending) - confirm this is the intended meaning.
        char_stats["size_variations"] = sorted(distinct_sizes)

        char_stats["ocr_quality_score"] = self._calculate_quality_score(char_stats)

        return char_stats

    def _is_ocr_artifact(self, char: "LTChar") -> bool:
        """Return True for glyphs from a font named like an OCR layer, or replacement-style characters."""
        font_name = char.fontname.lower()
        if "hidden" in font_name or "ocr" in font_name:
            return True

        char_text = char.get_text()
        if len(char_text) == 1:
            # Look for replacement characters or code points above U+FFFF
            if ord(char_text) > 65535 or char_text in ["�", "□", "▯"]:
                return True

        return False

    def _is_suspicious_char(self, char: "LTChar") -> bool:
        """Return True for a single non-alphanumeric, non-whitelisted character or a glyph smaller than 1.0."""
        char_text = char.get_text()

        # Single character that's not alphanumeric or common punctuation
        if len(char_text) == 1 and not (char_text.isalnum() or char_text in ".,!?;: "):
            return True

        # Very small font size (might indicate hidden text)
        if char.size < 1.0:
            return True

        return False

    def _calculate_quality_score(self, char_stats: Dict) -> float:
        """Return a 0..1 score: 1.0 minus half the artifact ratio minus 0.3 times the suspicious ratio."""
        total_chars = char_stats["total_chars"]
        if total_chars == 0:
            return 0.0

        score = 1.0

        # Penalize OCR artifacts
        score -= (char_stats["ocr_artifacts"] / total_chars) * 0.5

        # Penalize suspicious patterns
        score -= (char_stats["suspicious_patterns"] / total_chars) * 0.3

        return max(0.0, min(1.0, score))
b/extralit-server/src/extralit_server/contexts/ocr/figures.py similarity index 100% rename from extralit-server/src/extralit_server/contexts/document/figures.py rename to extralit-server/src/extralit_server/contexts/ocr/figures.py diff --git a/extralit-server/src/extralit_server/contexts/document/tables.py b/extralit-server/src/extralit_server/contexts/ocr/tables.py similarity index 100% rename from extralit-server/src/extralit_server/contexts/document/tables.py rename to extralit-server/src/extralit_server/contexts/ocr/tables.py diff --git a/extralit-server/src/extralit_server/contexts/ocr/text.py b/extralit-server/src/extralit_server/contexts/ocr/text.py new file mode 100644 index 000000000..fb5dffc96 --- /dev/null +++ b/extralit-server/src/extralit_server/contexts/ocr/text.py @@ -0,0 +1,14 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ From edb2b0cd3b828322c1ab1c49524286f3fa018b14 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 11 Aug 2025 18:31:41 -0700 Subject: [PATCH 22/22] chore: lazy import bibtexparser --- extralit/src/extralit/cli/documents/import_bib.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extralit/src/extralit/cli/documents/import_bib.py b/extralit/src/extralit/cli/documents/import_bib.py index 3f597e791..8084d48eb 100644 --- a/extralit/src/extralit/cli/documents/import_bib.py +++ b/extralit/src/extralit/cli/documents/import_bib.py @@ -38,16 +38,19 @@ import pandas as pd import typer -import bibtexparser from rich.console import Console from rich.progress import Progress, SpinnerColumn, TextColumn from rich.table import Table +import lazy_loader as lazy from extralit.workspaces._resource import Workspace from extralit.client import Extralit from extralit.cli.rich import get_themed_panel +bibtexparser = lazy.load("bibtexparser") + + def _clean_bibtex_field(value: str) -> str: """Clean BibTeX field by removing braces and extra whitespace.""" if not value: