From 6eae383af71cd12d772ed8595475826e5ab6c108 Mon Sep 17 00:00:00 2001 From: Ali Tavallaie Date: Mon, 12 May 2025 21:44:24 +0330 Subject: [PATCH 1/6] move to TOML and UV due to main repo #39 and this fork #1 and PEP 518 --- pyproject.toml | 18 + uv.lock | 1163 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1181 insertions(+) create mode 100644 pyproject.toml create mode 100644 uv.lock diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4c995d9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[project] +name = "moss" +version = "0.1.0" +requires-python = ">=3.10" +dependencies = [ + "alembic>=1.15.2", + "celery>=5.3.6", + "concurrent-log-handler>=0.9.23", + "fastapi>=0.115.12", + "networkx>=3.0", + "psycopg-binary>=3.2.8", + "python-dotenv>=1.1.0", + "python-louvain>=0.16", + "redis>=5.0.4", + "requests>=2.32.3", + "sqlalchemy>=2.0.40", + "uvicorn[standard]>=0.34.2", +] diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..83de40e --- /dev/null +++ b/uv.lock @@ -0,0 +1,1163 @@ +version = 1 +revision = 2 +requires-python = ">=3.10" + +[[package]] +name = "alembic" +version = "1.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mako" }, + { name = "sqlalchemy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e6/57/e314c31b261d1e8a5a5f1908065b4ff98270a778ce7579bd4254477209a7/alembic-1.15.2.tar.gz", hash = "sha256:1c72391bbdeffccfe317eefba686cb9a3c078005478885413b95c3b26c57a8a7", size = 1925573, upload-time = "2025-03-28T13:52:00.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/18/d89a443ed1ab9bcda16264716f809c663866d4ca8de218aa78fd50b38ead/alembic-1.15.2-py3-none-any.whl", hash = "sha256:2e76bd916d547f6900ec4bb5a90aeac1485d2c92536923d0b138c02b126edc53", size = 231911, upload-time = "2025-03-28T13:52:02.218Z" }, +] + +[[package]] +name = "amqp" +version = "5.3.1" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "vine" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/fc/ec94a357dfc6683d8c86f8b4cfa5416a4c36b28052ec8260c77aca96a443/amqp-5.3.1.tar.gz", hash = "sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432", size = 129013, upload-time = "2024-11-12T19:55:44.051Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/99/fc813cd978842c26c82534010ea849eee9ab3a13ea2b74e95cb9c99e747b/amqp-5.3.1-py3-none-any.whl", hash = "sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2", size = 50944, upload-time = "2024-11-12T19:55:41.782Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" }, +] + +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = "2024-11-06T16:41:39.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, +] + +[[package]] +name = "billiard" +version = "4.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/58/1546c970afcd2a2428b1bfafecf2371d8951cc34b46701bea73f4280989e/billiard-4.2.1.tar.gz", hash = "sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f", size = 155031, upload-time = "2024-09-21T13:40:22.491Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/30/da/43b15f28fe5f9e027b41c539abc5469052e9d48fd75f8ff094ba2a0ae767/billiard-4.2.1-py3-none-any.whl", hash = "sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb", size = 86766, upload-time = "2024-09-21T13:40:20.188Z" }, +] + +[[package]] +name = "celery" +version = "5.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "billiard" }, + { name = "click" }, + { name = "click-didyoumean" }, + { name = "click-plugins" }, + { name = "click-repl" }, + { name = "kombu" }, + { name = "python-dateutil" 
}, + { name = "vine" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bf/03/5d9c6c449248958f1a5870e633a29d7419ff3724c452a98ffd22688a1a6a/celery-5.5.2.tar.gz", hash = "sha256:4d6930f354f9d29295425d7a37261245c74a32807c45d764bedc286afd0e724e", size = 1666892, upload-time = "2025-04-25T20:10:04.695Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/94/8e825ac1cf59d45d20c4345d4461e6b5263ae475f708d047c3dad0ac6401/celery-5.5.2-py3-none-any.whl", hash = "sha256:54425a067afdc88b57cd8d94ed4af2ffaf13ab8c7680041ac2c4ac44357bdf4c", size = 438626, upload-time = "2025-04-25T20:10:01.383Z" }, +] + +[[package]] +name = "certifi" +version = "2025.4.26" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/9e/c05b3920a3b7d20d3d3310465f50348e5b3694f4f88c6daf736eef3024c4/certifi-2025.4.26.tar.gz", hash = "sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6", size = 160705, upload-time = "2025-04-26T02:12:29.51Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3", size = 159618, upload-time = "2025-04-26T02:12:27.662Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload-time = "2025-05-02T08:34:42.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/28/9901804da60055b406e1a1c5ba7aac1276fb77f1dde635aabfc7fd84b8ab/charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941", size = 201818, upload-time = "2025-05-02T08:31:46.725Z" }, + { url = "https://files.pythonhosted.org/packages/d9/9b/892a8c8af9110935e5adcbb06d9c6fe741b6bb02608c6513983048ba1a18/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd", size = 144649, upload-time = "2025-05-02T08:31:48.889Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a5/4179abd063ff6414223575e008593861d62abfc22455b5d1a44995b7c101/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6", size = 155045, upload-time = "2025-05-02T08:31:50.757Z" }, + { url = "https://files.pythonhosted.org/packages/3b/95/bc08c7dfeddd26b4be8c8287b9bb055716f31077c8b0ea1cd09553794665/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d", size = 147356, upload-time = "2025-05-02T08:31:52.634Z" }, + { url = "https://files.pythonhosted.org/packages/a8/2d/7a5b635aa65284bf3eab7653e8b4151ab420ecbae918d3e359d1947b4d61/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86", size = 149471, upload-time = "2025-05-02T08:31:56.207Z" }, + { url = "https://files.pythonhosted.org/packages/ae/38/51fc6ac74251fd331a8cfdb7ec57beba8c23fd5493f1050f71c87ef77ed0/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c", size = 151317, upload-time = "2025-05-02T08:31:57.613Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/17/edee1e32215ee6e9e46c3e482645b46575a44a2d72c7dfd49e49f60ce6bf/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0", size = 146368, upload-time = "2025-05-02T08:31:59.468Z" }, + { url = "https://files.pythonhosted.org/packages/26/2c/ea3e66f2b5f21fd00b2825c94cafb8c326ea6240cd80a91eb09e4a285830/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef", size = 154491, upload-time = "2025-05-02T08:32:01.219Z" }, + { url = "https://files.pythonhosted.org/packages/52/47/7be7fa972422ad062e909fd62460d45c3ef4c141805b7078dbab15904ff7/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6", size = 157695, upload-time = "2025-05-02T08:32:03.045Z" }, + { url = "https://files.pythonhosted.org/packages/2f/42/9f02c194da282b2b340f28e5fb60762de1151387a36842a92b533685c61e/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366", size = 154849, upload-time = "2025-05-02T08:32:04.651Z" }, + { url = "https://files.pythonhosted.org/packages/67/44/89cacd6628f31fb0b63201a618049be4be2a7435a31b55b5eb1c3674547a/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db", size = 150091, upload-time = "2025-05-02T08:32:06.719Z" }, + { url = "https://files.pythonhosted.org/packages/1f/79/4b8da9f712bc079c0f16b6d67b099b0b8d808c2292c937f267d816ec5ecc/charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a", size = 98445, upload-time = "2025-05-02T08:32:08.66Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/d7/96970afb4fb66497a40761cdf7bd4f6fca0fc7bafde3a84f836c1f57a926/charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509", size = 105782, upload-time = "2025-05-02T08:32:10.46Z" }, + { url = "https://files.pythonhosted.org/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794, upload-time = "2025-05-02T08:32:11.945Z" }, + { url = "https://files.pythonhosted.org/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846, upload-time = "2025-05-02T08:32:13.946Z" }, + { url = "https://files.pythonhosted.org/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350, upload-time = "2025-05-02T08:32:15.873Z" }, + { url = "https://files.pythonhosted.org/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657, upload-time = "2025-05-02T08:32:17.283Z" }, + { url = "https://files.pythonhosted.org/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260, upload-time = 
"2025-05-02T08:32:18.807Z" }, + { url = "https://files.pythonhosted.org/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164, upload-time = "2025-05-02T08:32:20.333Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571, upload-time = "2025-05-02T08:32:21.86Z" }, + { url = "https://files.pythonhosted.org/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952, upload-time = "2025-05-02T08:32:23.434Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959, upload-time = "2025-05-02T08:32:24.993Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030, upload-time = "2025-05-02T08:32:26.435Z" }, + { url = "https://files.pythonhosted.org/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015, upload-time = 
"2025-05-02T08:32:28.376Z" }, + { url = "https://files.pythonhosted.org/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106, upload-time = "2025-05-02T08:32:30.281Z" }, + { url = "https://files.pythonhosted.org/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402, upload-time = "2025-05-02T08:32:32.191Z" }, + { url = "https://files.pythonhosted.org/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936, upload-time = "2025-05-02T08:32:33.712Z" }, + { url = "https://files.pythonhosted.org/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790, upload-time = "2025-05-02T08:32:35.768Z" }, + { url = "https://files.pythonhosted.org/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924, upload-time = "2025-05-02T08:32:37.284Z" }, + { url = "https://files.pythonhosted.org/packages/86/2d/fb55fdf41964ec782febbf33cb64be480a6b8f16ded2dbe8db27a405c09f/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214", size = 146626, upload-time = 
"2025-05-02T08:32:38.803Z" }, + { url = "https://files.pythonhosted.org/packages/8c/73/6ede2ec59bce19b3edf4209d70004253ec5f4e319f9a2e3f2f15601ed5f7/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a", size = 148567, upload-time = "2025-05-02T08:32:40.251Z" }, + { url = "https://files.pythonhosted.org/packages/09/14/957d03c6dc343c04904530b6bef4e5efae5ec7d7990a7cbb868e4595ee30/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd", size = 150957, upload-time = "2025-05-02T08:32:41.705Z" }, + { url = "https://files.pythonhosted.org/packages/0d/c8/8174d0e5c10ccebdcb1b53cc959591c4c722a3ad92461a273e86b9f5a302/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981", size = 145408, upload-time = "2025-05-02T08:32:43.709Z" }, + { url = "https://files.pythonhosted.org/packages/58/aa/8904b84bc8084ac19dc52feb4f5952c6df03ffb460a887b42615ee1382e8/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c", size = 153399, upload-time = "2025-05-02T08:32:46.197Z" }, + { url = "https://files.pythonhosted.org/packages/c2/26/89ee1f0e264d201cb65cf054aca6038c03b1a0c6b4ae998070392a3ce605/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b", size = 156815, upload-time = "2025-05-02T08:32:48.105Z" }, + { url = "https://files.pythonhosted.org/packages/fd/07/68e95b4b345bad3dbbd3a8681737b4338ff2c9df29856a6d6d23ac4c73cb/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d", size = 154537, 
upload-time = "2025-05-02T08:32:49.719Z" }, + { url = "https://files.pythonhosted.org/packages/77/1a/5eefc0ce04affb98af07bc05f3bac9094513c0e23b0562d64af46a06aae4/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f", size = 149565, upload-time = "2025-05-02T08:32:51.404Z" }, + { url = "https://files.pythonhosted.org/packages/37/a0/2410e5e6032a174c95e0806b1a6585eb21e12f445ebe239fac441995226a/charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c", size = 98357, upload-time = "2025-05-02T08:32:53.079Z" }, + { url = "https://files.pythonhosted.org/packages/6c/4f/c02d5c493967af3eda9c771ad4d2bbc8df6f99ddbeb37ceea6e8716a32bc/charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e", size = 105776, upload-time = "2025-05-02T08:32:54.573Z" }, + { url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload-time = "2025-05-02T08:32:56.363Z" }, + { url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload-time = "2025-05-02T08:32:58.551Z" }, + { url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload-time = 
"2025-05-02T08:33:00.342Z" }, + { url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload-time = "2025-05-02T08:33:02.081Z" }, + { url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload-time = "2025-05-02T08:33:04.063Z" }, + { url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload-time = "2025-05-02T08:33:06.418Z" }, + { url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload-time = "2025-05-02T08:33:08.183Z" }, + { url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload-time = "2025-05-02T08:33:09.986Z" }, + { url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = 
"sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload-time = "2025-05-02T08:33:11.814Z" }, + { url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload-time = "2025-05-02T08:33:13.707Z" }, + { url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload-time = "2025-05-02T08:33:15.458Z" }, + { url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload-time = "2025-05-02T08:33:17.06Z" }, + { url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload-time = "2025-05-02T08:33:18.753Z" }, + { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" }, +] + +[[package]] +name = "click" +version = "8.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/cd/0f/62ca20172d4f87d93cf89665fbaedcd560ac48b465bd1d92bfc7ea6b0a41/click-8.2.0.tar.gz", hash = "sha256:f5452aeddd9988eefa20f90f05ab66f17fce1ee2a36907fd30b05bbb5953814d", size = 235857, upload-time = "2025-05-10T22:21:03.111Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/58/1f37bf81e3c689cc74ffa42102fa8915b59085f54a6e4a80bc6265c0f6bf/click-8.2.0-py3-none-any.whl", hash = "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c", size = 102156, upload-time = "2025-05-10T22:21:01.352Z" }, +] + +[[package]] +name = "click-didyoumean" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/ce/217289b77c590ea1e7c24242d9ddd6e249e52c795ff10fac2c50062c48cb/click_didyoumean-0.3.1.tar.gz", hash = "sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463", size = 3089, upload-time = "2024-03-24T08:22:07.499Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/5b/974430b5ffdb7a4f1941d13d83c64a0395114503cc357c6b9ae4ce5047ed/click_didyoumean-0.3.1-py3-none-any.whl", hash = "sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c", size = 3631, upload-time = "2024-03-24T08:22:06.356Z" }, +] + +[[package]] +name = "click-plugins" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5f/1d/45434f64ed749540af821fd7e42b8e4d23ac04b1eda7c26613288d6cd8a8/click-plugins-1.1.1.tar.gz", hash = "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b", size = 8164, upload-time = "2019-04-04T04:27:04.82Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/da/824b92d9942f4e472702488857914bdd50f73021efea15b4cad9aca8ecef/click_plugins-1.1.1-py2.py3-none-any.whl", hash = 
"sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8", size = 7497, upload-time = "2019-04-04T04:27:03.36Z" }, +] + +[[package]] +name = "click-repl" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "prompt-toolkit" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cb/a2/57f4ac79838cfae6912f997b4d1a64a858fb0c86d7fcaae6f7b58d267fca/click-repl-0.3.0.tar.gz", hash = "sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9", size = 10449, upload-time = "2023-06-15T12:43:51.141Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/40/9d857001228658f0d59e97ebd4c346fe73e138c6de1bce61dc568a57c7f8/click_repl-0.3.0-py3-none-any.whl", hash = "sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812", size = 10289, upload-time = "2023-06-15T12:43:48.626Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "concurrent-log-handler" +version = "0.9.26" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "portalocker" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/d1/5a2c5aed6d39610e8936273dfd3ac7789cb70a3f55ae835701f182a1c027/concurrent_log_handler-0.9.26.tar.gz", hash = 
"sha256:8f22bf79724a0152b9e97d9c2dcf4ecb339607c80bf312f68066070243006b49", size = 29958, upload-time = "2025-05-09T19:52:01.633Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/f6/a6a9f45769e955ed52fb2c1e06599c37f481028530a405793a7de5ba2625/concurrent_log_handler-0.9.26-py3-none-any.whl", hash = "sha256:0b03a8f1dcb1a03ad292647ee4930b3f9ba2bdb45e55bf2699d2c053f8e6531f", size = 28348, upload-time = "2025-05-09T19:52:00.147Z" }, +] + +[[package]] +name = "exceptiongroup" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, +] + +[[package]] +name = "fastapi" +version = "0.115.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/55/ae499352d82338331ca1e28c7f4a63bfd09479b16395dce38cf50a39e2c2/fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681", size = 295236, upload-time = "2025-03-23T22:55:43.822Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/b3/b51f09c2ba432a576fe63758bddc81f78f0c6309d9e5c10d194313bf021e/fastapi-0.115.12-py3-none-any.whl", hash = 
"sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d", size = 95164, upload-time = "2025-03-23T22:55:42.101Z" }, +] + +[[package]] +name = "greenlet" +version = "3.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/34/c1/a82edae11d46c0d83481aacaa1e578fea21d94a1ef400afd734d47ad95ad/greenlet-3.2.2.tar.gz", hash = "sha256:ad053d34421a2debba45aa3cc39acf454acbcd025b3fc1a9f8a0dee237abd485", size = 185797, upload-time = "2025-05-09T19:47:35.066Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/66/910217271189cc3f32f670040235f4bf026ded8ca07270667d69c06e7324/greenlet-3.2.2-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:c49e9f7c6f625507ed83a7485366b46cbe325717c60837f7244fc99ba16ba9d6", size = 267395, upload-time = "2025-05-09T14:50:45.357Z" }, + { url = "https://files.pythonhosted.org/packages/a8/36/8d812402ca21017c82880f399309afadb78a0aa300a9b45d741e4df5d954/greenlet-3.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3cc1a3ed00ecfea8932477f729a9f616ad7347a5e55d50929efa50a86cb7be7", size = 625742, upload-time = "2025-05-09T15:23:58.293Z" }, + { url = "https://files.pythonhosted.org/packages/7b/77/66d7b59dfb7cc1102b2f880bc61cb165ee8998c9ec13c96606ba37e54c77/greenlet-3.2.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c9896249fbef2c615853b890ee854f22c671560226c9221cfd27c995db97e5c", size = 637014, upload-time = "2025-05-09T15:24:47.025Z" }, + { url = "https://files.pythonhosted.org/packages/36/a7/ff0d408f8086a0d9a5aac47fa1b33a040a9fca89bd5a3f7b54d1cd6e2793/greenlet-3.2.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7409796591d879425997a518138889d8d17e63ada7c99edc0d7a1c22007d4907", size = 632874, upload-time = "2025-05-09T15:29:20.014Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/75/1dc2603bf8184da9ebe69200849c53c3c1dca5b3a3d44d9f5ca06a930550/greenlet-3.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7791dcb496ec53d60c7f1c78eaa156c21f402dda38542a00afc3e20cae0f480f", size = 631652, upload-time = "2025-05-09T14:53:30.961Z" }, + { url = "https://files.pythonhosted.org/packages/7b/74/ddc8c3bd4c2c20548e5bf2b1d2e312a717d44e2eca3eadcfc207b5f5ad80/greenlet-3.2.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8009ae46259e31bc73dc183e402f548e980c96f33a6ef58cc2e7865db012e13", size = 580619, upload-time = "2025-05-09T14:53:42.049Z" }, + { url = "https://files.pythonhosted.org/packages/7e/f2/40f26d7b3077b1c7ae7318a4de1f8ffc1d8ccbad8f1d8979bf5080250fd6/greenlet-3.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fd9fb7c941280e2c837b603850efc93c999ae58aae2b40765ed682a6907ebbc5", size = 1109809, upload-time = "2025-05-09T15:26:59.063Z" }, + { url = "https://files.pythonhosted.org/packages/c5/21/9329e8c276746b0d2318b696606753f5e7b72d478adcf4ad9a975521ea5f/greenlet-3.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:00cd814b8959b95a546e47e8d589610534cfb71f19802ea8a2ad99d95d702057", size = 1133455, upload-time = "2025-05-09T14:53:55.823Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1e/0dca9619dbd736d6981f12f946a497ec21a0ea27262f563bca5729662d4d/greenlet-3.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:d0cb7d47199001de7658c213419358aa8937df767936506db0db7ce1a71f4a2f", size = 294991, upload-time = "2025-05-09T15:05:56.847Z" }, + { url = "https://files.pythonhosted.org/packages/a3/9f/a47e19261747b562ce88219e5ed8c859d42c6e01e73da6fbfa3f08a7be13/greenlet-3.2.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:dcb9cebbf3f62cb1e5afacae90761ccce0effb3adaa32339a0670fe7805d8068", size = 268635, upload-time = "2025-05-09T14:50:39.007Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/80/a0042b91b66975f82a914d515e81c1944a3023f2ce1ed7a9b22e10b46919/greenlet-3.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf3fc9145141250907730886b031681dfcc0de1c158f3cc51c092223c0f381ce", size = 628786, upload-time = "2025-05-09T15:24:00.692Z" }, + { url = "https://files.pythonhosted.org/packages/38/a2/8336bf1e691013f72a6ebab55da04db81a11f68e82bb691f434909fa1327/greenlet-3.2.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:efcdfb9df109e8a3b475c016f60438fcd4be68cd13a365d42b35914cdab4bb2b", size = 640866, upload-time = "2025-05-09T15:24:48.153Z" }, + { url = "https://files.pythonhosted.org/packages/f8/7e/f2a3a13e424670a5d08826dab7468fa5e403e0fbe0b5f951ff1bc4425b45/greenlet-3.2.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd139e4943547ce3a56ef4b8b1b9479f9e40bb47e72cc906f0f66b9d0d5cab3", size = 636752, upload-time = "2025-05-09T15:29:23.182Z" }, + { url = "https://files.pythonhosted.org/packages/fd/5d/ce4a03a36d956dcc29b761283f084eb4a3863401c7cb505f113f73af8774/greenlet-3.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71566302219b17ca354eb274dfd29b8da3c268e41b646f330e324e3967546a74", size = 636028, upload-time = "2025-05-09T14:53:32.854Z" }, + { url = "https://files.pythonhosted.org/packages/4b/29/b130946b57e3ceb039238413790dd3793c5e7b8e14a54968de1fe449a7cf/greenlet-3.2.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3091bc45e6b0c73f225374fefa1536cd91b1e987377b12ef5b19129b07d93ebe", size = 583869, upload-time = "2025-05-09T14:53:43.614Z" }, + { url = "https://files.pythonhosted.org/packages/ac/30/9f538dfe7f87b90ecc75e589d20cbd71635531a617a336c386d775725a8b/greenlet-3.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:44671c29da26539a5f142257eaba5110f71887c24d40df3ac87f1117df589e0e", size = 1112886, upload-time = "2025-05-09T15:27:01.304Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/92/4b7deeb1a1e9c32c1b59fdca1cac3175731c23311ddca2ea28a8b6ada91c/greenlet-3.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c23ea227847c9dbe0b3910f5c0dd95658b607137614eb821e6cbaecd60d81cc6", size = 1138355, upload-time = "2025-05-09T14:53:58.011Z" }, + { url = "https://files.pythonhosted.org/packages/c5/eb/7551c751a2ea6498907b2fcbe31d7a54b602ba5e8eb9550a9695ca25d25c/greenlet-3.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:0a16fb934fcabfdfacf21d79e6fed81809d8cd97bc1be9d9c89f0e4567143d7b", size = 295437, upload-time = "2025-05-09T15:00:57.733Z" }, + { url = "https://files.pythonhosted.org/packages/2c/a1/88fdc6ce0df6ad361a30ed78d24c86ea32acb2b563f33e39e927b1da9ea0/greenlet-3.2.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:df4d1509efd4977e6a844ac96d8be0b9e5aa5d5c77aa27ca9f4d3f92d3fcf330", size = 270413, upload-time = "2025-05-09T14:51:32.455Z" }, + { url = "https://files.pythonhosted.org/packages/a6/2e/6c1caffd65490c68cd9bcec8cb7feb8ac7b27d38ba1fea121fdc1f2331dc/greenlet-3.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da956d534a6d1b9841f95ad0f18ace637668f680b1339ca4dcfb2c1837880a0b", size = 637242, upload-time = "2025-05-09T15:24:02.63Z" }, + { url = "https://files.pythonhosted.org/packages/98/28/088af2cedf8823b6b7ab029a5626302af4ca1037cf8b998bed3a8d3cb9e2/greenlet-3.2.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c7b15fb9b88d9ee07e076f5a683027bc3befd5bb5d25954bb633c385d8b737e", size = 651444, upload-time = "2025-05-09T15:24:49.856Z" }, + { url = "https://files.pythonhosted.org/packages/4a/9f/0116ab876bb0bc7a81eadc21c3f02cd6100dcd25a1cf2a085a130a63a26a/greenlet-3.2.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:752f0e79785e11180ebd2e726c8a88109ded3e2301d40abced2543aa5d164275", size = 646067, upload-time = "2025-05-09T15:29:24.989Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/17/bb8f9c9580e28a94a9575da847c257953d5eb6e39ca888239183320c1c28/greenlet-3.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ae572c996ae4b5e122331e12bbb971ea49c08cc7c232d1bd43150800a2d6c65", size = 648153, upload-time = "2025-05-09T14:53:34.716Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ee/7f31b6f7021b8df6f7203b53b9cc741b939a2591dcc6d899d8042fcf66f2/greenlet-3.2.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02f5972ff02c9cf615357c17ab713737cccfd0eaf69b951084a9fd43f39833d3", size = 603865, upload-time = "2025-05-09T14:53:45.738Z" }, + { url = "https://files.pythonhosted.org/packages/b5/2d/759fa59323b521c6f223276a4fc3d3719475dc9ae4c44c2fe7fc750f8de0/greenlet-3.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4fefc7aa68b34b9224490dfda2e70ccf2131368493add64b4ef2d372955c207e", size = 1119575, upload-time = "2025-05-09T15:27:04.248Z" }, + { url = "https://files.pythonhosted.org/packages/30/05/356813470060bce0e81c3df63ab8cd1967c1ff6f5189760c1a4734d405ba/greenlet-3.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a31ead8411a027c2c4759113cf2bd473690517494f3d6e4bf67064589afcd3c5", size = 1147460, upload-time = "2025-05-09T14:54:00.315Z" }, + { url = "https://files.pythonhosted.org/packages/07/f4/b2a26a309a04fb844c7406a4501331b9400e1dd7dd64d3450472fd47d2e1/greenlet-3.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:b24c7844c0a0afc3ccbeb0b807adeefb7eff2b5599229ecedddcfeb0ef333bec", size = 296239, upload-time = "2025-05-09T14:57:17.633Z" }, + { url = "https://files.pythonhosted.org/packages/89/30/97b49779fff8601af20972a62cc4af0c497c1504dfbb3e93be218e093f21/greenlet-3.2.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:3ab7194ee290302ca15449f601036007873028712e92ca15fc76597a0aeb4c59", size = 269150, upload-time = "2025-05-09T14:50:30.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/30/877245def4220f684bc2e01df1c2e782c164e84b32e07373992f14a2d107/greenlet-3.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc5c43bb65ec3669452af0ab10729e8fdc17f87a1f2ad7ec65d4aaaefabf6bf", size = 637381, upload-time = "2025-05-09T15:24:12.893Z" }, + { url = "https://files.pythonhosted.org/packages/8e/16/adf937908e1f913856b5371c1d8bdaef5f58f251d714085abeea73ecc471/greenlet-3.2.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:decb0658ec19e5c1f519faa9a160c0fc85a41a7e6654b3ce1b44b939f8bf1325", size = 651427, upload-time = "2025-05-09T15:24:51.074Z" }, + { url = "https://files.pythonhosted.org/packages/ad/49/6d79f58fa695b618654adac64e56aff2eeb13344dc28259af8f505662bb1/greenlet-3.2.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fadd183186db360b61cb34e81117a096bff91c072929cd1b529eb20dd46e6c5", size = 645795, upload-time = "2025-05-09T15:29:26.673Z" }, + { url = "https://files.pythonhosted.org/packages/5a/e6/28ed5cb929c6b2f001e96b1d0698c622976cd8f1e41fe7ebc047fa7c6dd4/greenlet-3.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1919cbdc1c53ef739c94cf2985056bcc0838c1f217b57647cbf4578576c63825", size = 648398, upload-time = "2025-05-09T14:53:36.61Z" }, + { url = "https://files.pythonhosted.org/packages/9d/70/b200194e25ae86bc57077f695b6cc47ee3118becf54130c5514456cf8dac/greenlet-3.2.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3885f85b61798f4192d544aac7b25a04ece5fe2704670b4ab73c2d2c14ab740d", size = 606795, upload-time = "2025-05-09T14:53:47.039Z" }, + { url = "https://files.pythonhosted.org/packages/f8/c8/ba1def67513a941154ed8f9477ae6e5a03f645be6b507d3930f72ed508d3/greenlet-3.2.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:85f3e248507125bf4af607a26fd6cb8578776197bd4b66e35229cdf5acf1dfbf", size = 1117976, upload-time = "2025-05-09T15:27:06.542Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/30/d0e88c1cfcc1b3331d63c2b54a0a3a4a950ef202fb8b92e772ca714a9221/greenlet-3.2.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:1e76106b6fc55fa3d6fe1c527f95ee65e324a13b62e243f77b48317346559708", size = 1145509, upload-time = "2025-05-09T14:54:02.223Z" }, + { url = "https://files.pythonhosted.org/packages/90/2e/59d6491834b6e289051b252cf4776d16da51c7c6ca6a87ff97e3a50aa0cd/greenlet-3.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:fe46d4f8e94e637634d54477b0cfabcf93c53f29eedcbdeecaf2af32029b4421", size = 296023, upload-time = "2025-05-09T14:53:24.157Z" }, + { url = "https://files.pythonhosted.org/packages/65/66/8a73aace5a5335a1cba56d0da71b7bd93e450f17d372c5b7c5fa547557e9/greenlet-3.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba30e88607fb6990544d84caf3c706c4b48f629e18853fc6a646f82db9629418", size = 629911, upload-time = "2025-05-09T15:24:22.376Z" }, + { url = "https://files.pythonhosted.org/packages/48/08/c8b8ebac4e0c95dcc68ec99198842e7db53eda4ab3fb0a4e785690883991/greenlet-3.2.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:055916fafad3e3388d27dd68517478933a97edc2fc54ae79d3bec827de2c64c4", size = 635251, upload-time = "2025-05-09T15:24:52.205Z" }, + { url = "https://files.pythonhosted.org/packages/37/26/7db30868f73e86b9125264d2959acabea132b444b88185ba5c462cb8e571/greenlet-3.2.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2593283bf81ca37d27d110956b79e8723f9aa50c4bcdc29d3c0543d4743d2763", size = 632620, upload-time = "2025-05-09T15:29:28.051Z" }, + { url = "https://files.pythonhosted.org/packages/10/ec/718a3bd56249e729016b0b69bee4adea0dfccf6ca43d147ef3b21edbca16/greenlet-3.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89c69e9a10670eb7a66b8cef6354c24671ba241f46152dd3eed447f79c29fb5b", size = 628851, upload-time = "2025-05-09T14:53:38.472Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/9d/d1c79286a76bc62ccdc1387291464af16a4204ea717f24e77b0acd623b99/greenlet-3.2.2-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02a98600899ca1ca5d3a2590974c9e3ec259503b2d6ba6527605fcd74e08e207", size = 593718, upload-time = "2025-05-09T14:53:48.313Z" }, + { url = "https://files.pythonhosted.org/packages/cd/41/96ba2bf948f67b245784cd294b84e3d17933597dffd3acdb367a210d1949/greenlet-3.2.2-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:b50a8c5c162469c3209e5ec92ee4f95c8231b11db6a04db09bbe338176723bb8", size = 1105752, upload-time = "2025-05-09T15:27:08.217Z" }, + { url = "https://files.pythonhosted.org/packages/68/3b/3b97f9d33c1f2eb081759da62bd6162159db260f602f048bc2f36b4c453e/greenlet-3.2.2-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:45f9f4853fb4cc46783085261c9ec4706628f3b57de3e68bae03e8f8b3c0de51", size = 1125170, upload-time = "2025-05-09T14:54:04.082Z" }, + { url = "https://files.pythonhosted.org/packages/31/df/b7d17d66c8d0f578d2885a3d8f565e9e4725eacc9d3fdc946d0031c055c4/greenlet-3.2.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:9ea5231428af34226c05f927e16fc7f6fa5e39e3ad3cd24ffa48ba53a47f4240", size = 269899, upload-time = "2025-05-09T14:54:01.581Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "httptools" 
+version = "0.6.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/9a/ce5e1f7e131522e6d3426e8e7a490b3a01f39a6696602e1c4f33f9e94277/httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c", size = 240639, upload-time = "2024-10-16T19:45:08.902Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/6f/972f8eb0ea7d98a1c6be436e2142d51ad2a64ee18e02b0e7ff1f62171ab1/httptools-0.6.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0", size = 198780, upload-time = "2024-10-16T19:44:06.882Z" }, + { url = "https://files.pythonhosted.org/packages/6a/b0/17c672b4bc5c7ba7f201eada4e96c71d0a59fbc185e60e42580093a86f21/httptools-0.6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da", size = 103297, upload-time = "2024-10-16T19:44:08.129Z" }, + { url = "https://files.pythonhosted.org/packages/92/5e/b4a826fe91971a0b68e8c2bd4e7db3e7519882f5a8ccdb1194be2b3ab98f/httptools-0.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1", size = 443130, upload-time = "2024-10-16T19:44:09.45Z" }, + { url = "https://files.pythonhosted.org/packages/b0/51/ce61e531e40289a681a463e1258fa1e05e0be54540e40d91d065a264cd8f/httptools-0.6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50", size = 442148, upload-time = "2024-10-16T19:44:11.539Z" }, + { url = "https://files.pythonhosted.org/packages/ea/9e/270b7d767849b0c96f275c695d27ca76c30671f8eb8cc1bab6ced5c5e1d0/httptools-0.6.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959", size = 415949, upload-time = 
"2024-10-16T19:44:13.388Z" }, + { url = "https://files.pythonhosted.org/packages/81/86/ced96e3179c48c6f656354e106934e65c8963d48b69be78f355797f0e1b3/httptools-0.6.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4", size = 417591, upload-time = "2024-10-16T19:44:15.258Z" }, + { url = "https://files.pythonhosted.org/packages/75/73/187a3f620ed3175364ddb56847d7a608a6fc42d551e133197098c0143eca/httptools-0.6.4-cp310-cp310-win_amd64.whl", hash = "sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c", size = 88344, upload-time = "2024-10-16T19:44:16.54Z" }, + { url = "https://files.pythonhosted.org/packages/7b/26/bb526d4d14c2774fe07113ca1db7255737ffbb119315839af2065abfdac3/httptools-0.6.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069", size = 199029, upload-time = "2024-10-16T19:44:18.427Z" }, + { url = "https://files.pythonhosted.org/packages/a6/17/3e0d3e9b901c732987a45f4f94d4e2c62b89a041d93db89eafb262afd8d5/httptools-0.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a", size = 103492, upload-time = "2024-10-16T19:44:19.515Z" }, + { url = "https://files.pythonhosted.org/packages/b7/24/0fe235d7b69c42423c7698d086d4db96475f9b50b6ad26a718ef27a0bce6/httptools-0.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975", size = 462891, upload-time = "2024-10-16T19:44:21.067Z" }, + { url = "https://files.pythonhosted.org/packages/b1/2f/205d1f2a190b72da6ffb5f41a3736c26d6fa7871101212b15e9b5cd8f61d/httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636", size = 459788, upload-time = "2024-10-16T19:44:22.958Z" }, + { url = 
"https://files.pythonhosted.org/packages/6e/4c/d09ce0eff09057a206a74575ae8f1e1e2f0364d20e2442224f9e6612c8b9/httptools-0.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721", size = 433214, upload-time = "2024-10-16T19:44:24.513Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d2/84c9e23edbccc4a4c6f96a1b8d99dfd2350289e94f00e9ccc7aadde26fb5/httptools-0.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988", size = 434120, upload-time = "2024-10-16T19:44:26.295Z" }, + { url = "https://files.pythonhosted.org/packages/d0/46/4d8e7ba9581416de1c425b8264e2cadd201eb709ec1584c381f3e98f51c1/httptools-0.6.4-cp311-cp311-win_amd64.whl", hash = "sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17", size = 88565, upload-time = "2024-10-16T19:44:29.188Z" }, + { url = "https://files.pythonhosted.org/packages/bb/0e/d0b71465c66b9185f90a091ab36389a7352985fe857e352801c39d6127c8/httptools-0.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2", size = 200683, upload-time = "2024-10-16T19:44:30.175Z" }, + { url = "https://files.pythonhosted.org/packages/e2/b8/412a9bb28d0a8988de3296e01efa0bd62068b33856cdda47fe1b5e890954/httptools-0.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44", size = 104337, upload-time = "2024-10-16T19:44:31.786Z" }, + { url = "https://files.pythonhosted.org/packages/9b/01/6fb20be3196ffdc8eeec4e653bc2a275eca7f36634c86302242c4fbb2760/httptools-0.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1", size = 508796, upload-time = "2024-10-16T19:44:32.825Z" }, + { url = 
"https://files.pythonhosted.org/packages/f7/d8/b644c44acc1368938317d76ac991c9bba1166311880bcc0ac297cb9d6bd7/httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2", size = 510837, upload-time = "2024-10-16T19:44:33.974Z" }, + { url = "https://files.pythonhosted.org/packages/52/d8/254d16a31d543073a0e57f1c329ca7378d8924e7e292eda72d0064987486/httptools-0.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81", size = 485289, upload-time = "2024-10-16T19:44:35.111Z" }, + { url = "https://files.pythonhosted.org/packages/5f/3c/4aee161b4b7a971660b8be71a92c24d6c64372c1ab3ae7f366b3680df20f/httptools-0.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f", size = 489779, upload-time = "2024-10-16T19:44:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/12/b7/5cae71a8868e555f3f67a50ee7f673ce36eac970f029c0c5e9d584352961/httptools-0.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970", size = 88634, upload-time = "2024-10-16T19:44:37.357Z" }, + { url = "https://files.pythonhosted.org/packages/94/a3/9fe9ad23fd35f7de6b91eeb60848986058bd8b5a5c1e256f5860a160cc3e/httptools-0.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660", size = 197214, upload-time = "2024-10-16T19:44:38.738Z" }, + { url = "https://files.pythonhosted.org/packages/ea/d9/82d5e68bab783b632023f2fa31db20bebb4e89dfc4d2293945fd68484ee4/httptools-0.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083", size = 102431, upload-time = "2024-10-16T19:44:39.818Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/c1/cb499655cbdbfb57b577734fde02f6fa0bbc3fe9fb4d87b742b512908dff/httptools-0.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3", size = 473121, upload-time = "2024-10-16T19:44:41.189Z" }, + { url = "https://files.pythonhosted.org/packages/af/71/ee32fd358f8a3bb199b03261f10921716990808a675d8160b5383487a317/httptools-0.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071", size = 473805, upload-time = "2024-10-16T19:44:42.384Z" }, + { url = "https://files.pythonhosted.org/packages/8a/0a/0d4df132bfca1507114198b766f1737d57580c9ad1cf93c1ff673e3387be/httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5", size = 448858, upload-time = "2024-10-16T19:44:43.959Z" }, + { url = "https://files.pythonhosted.org/packages/1e/6a/787004fdef2cabea27bad1073bf6a33f2437b4dbd3b6fb4a9d71172b1c7c/httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0", size = 452042, upload-time = "2024-10-16T19:44:45.071Z" }, + { url = "https://files.pythonhosted.org/packages/4d/dc/7decab5c404d1d2cdc1bb330b1bf70e83d6af0396fd4fc76fc60c0d522bf/httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8", size = 87682, upload-time = "2024-10-16T19:44:46.46Z" }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = 
"2024-09-15T18:07:39.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, +] + +[[package]] +name = "kombu" +version = "5.5.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "amqp" }, + { name = "tzdata" }, + { name = "vine" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/0a/128b65651ed8120460fc5af754241ad595eac74993115ec0de4f2d7bc459/kombu-5.5.3.tar.gz", hash = "sha256:021a0e11fcfcd9b0260ef1fb64088c0e92beb976eb59c1dfca7ddd4ad4562ea2", size = 461784, upload-time = "2025-04-16T12:46:17.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/35/1407fb0b2f5b07b50cbaf97fce09ad87d3bfefbf64f7171a8651cd8d2f68/kombu-5.5.3-py3-none-any.whl", hash = "sha256:5b0dbceb4edee50aa464f59469d34b97864be09111338cfb224a10b6a163909b", size = 209921, upload-time = "2025-04-16T12:46:15.139Z" }, +] + +[[package]] +name = "mako" +version = "1.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537, upload-time = "2024-10-18T15:21:54.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/90/d08277ce111dd22f77149fd1a5d4653eeb3b3eaacbdfcbae5afb2600eebd/MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8", size = 14357, upload-time = "2024-10-18T15:20:51.44Z" }, + { url = "https://files.pythonhosted.org/packages/04/e1/6e2194baeae0bca1fae6629dc0cbbb968d4d941469cbab11a3872edff374/MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158", size = 12393, upload-time = "2024-10-18T15:20:52.426Z" }, + { url = "https://files.pythonhosted.org/packages/1d/69/35fa85a8ece0a437493dc61ce0bb6d459dcba482c34197e3efc829aa357f/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579", size = 21732, upload-time = "2024-10-18T15:20:53.578Z" }, + { url = "https://files.pythonhosted.org/packages/22/35/137da042dfb4720b638d2937c38a9c2df83fe32d20e8c8f3185dbfef05f7/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d", size = 20866, upload-time = "2024-10-18T15:20:55.06Z" }, + { url = "https://files.pythonhosted.org/packages/29/28/6d029a903727a1b62edb51863232152fd335d602def598dade38996887f0/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb", size = 20964, upload-time = "2024-10-18T15:20:55.906Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/cd/07438f95f83e8bc028279909d9c9bd39e24149b0d60053a97b2bc4f8aa51/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b", size = 21977, upload-time = "2024-10-18T15:20:57.189Z" }, + { url = "https://files.pythonhosted.org/packages/29/01/84b57395b4cc062f9c4c55ce0df7d3108ca32397299d9df00fedd9117d3d/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c", size = 21366, upload-time = "2024-10-18T15:20:58.235Z" }, + { url = "https://files.pythonhosted.org/packages/bd/6e/61ebf08d8940553afff20d1fb1ba7294b6f8d279df9fd0c0db911b4bbcfd/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171", size = 21091, upload-time = "2024-10-18T15:20:59.235Z" }, + { url = "https://files.pythonhosted.org/packages/11/23/ffbf53694e8c94ebd1e7e491de185124277964344733c45481f32ede2499/MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50", size = 15065, upload-time = "2024-10-18T15:21:00.307Z" }, + { url = "https://files.pythonhosted.org/packages/44/06/e7175d06dd6e9172d4a69a72592cb3f7a996a9c396eee29082826449bbc3/MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a", size = 15514, upload-time = "2024-10-18T15:21:01.122Z" }, + { url = "https://files.pythonhosted.org/packages/6b/28/bbf83e3f76936960b850435576dd5e67034e200469571be53f69174a2dfd/MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d", size = 14353, upload-time = "2024-10-18T15:21:02.187Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/30/316d194b093cde57d448a4c3209f22e3046c5bb2fb0820b118292b334be7/MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93", size = 12392, upload-time = "2024-10-18T15:21:02.941Z" }, + { url = "https://files.pythonhosted.org/packages/f2/96/9cdafba8445d3a53cae530aaf83c38ec64c4d5427d975c974084af5bc5d2/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832", size = 23984, upload-time = "2024-10-18T15:21:03.953Z" }, + { url = "https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84", size = 23120, upload-time = "2024-10-18T15:21:06.495Z" }, + { url = "https://files.pythonhosted.org/packages/8d/21/5e4851379f88f3fad1de30361db501300d4f07bcad047d3cb0449fc51f8c/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca", size = 23032, upload-time = "2024-10-18T15:21:07.295Z" }, + { url = "https://files.pythonhosted.org/packages/00/7b/e92c64e079b2d0d7ddf69899c98842f3f9a60a1ae72657c89ce2655c999d/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798", size = 24057, upload-time = "2024-10-18T15:21:08.073Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ac/46f960ca323037caa0a10662ef97d0a4728e890334fc156b9f9e52bcc4ca/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e", size = 23359, upload-time = "2024-10-18T15:21:09.318Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/84/83439e16197337b8b14b6a5b9c2105fff81d42c2a7c5b58ac7b62ee2c3b1/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", size = 23306, upload-time = "2024-10-18T15:21:10.185Z" }, + { url = "https://files.pythonhosted.org/packages/9a/34/a15aa69f01e2181ed8d2b685c0d2f6655d5cca2c4db0ddea775e631918cd/MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d", size = 15094, upload-time = "2024-10-18T15:21:11.005Z" }, + { url = "https://files.pythonhosted.org/packages/da/b8/3a3bd761922d416f3dc5d00bfbed11f66b1ab89a0c2b6e887240a30b0f6b/MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b", size = 15521, upload-time = "2024-10-18T15:21:12.911Z" }, + { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274, upload-time = "2024-10-18T15:21:13.777Z" }, + { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348, upload-time = "2024-10-18T15:21:14.822Z" }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149, upload-time = "2024-10-18T15:21:15.642Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118, upload-time = "2024-10-18T15:21:17.133Z" }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993, upload-time = "2024-10-18T15:21:18.064Z" }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178, upload-time = "2024-10-18T15:21:18.859Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319, upload-time = "2024-10-18T15:21:19.671Z" }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352, upload-time = "2024-10-18T15:21:20.971Z" }, + { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097, upload-time = "2024-10-18T15:21:22.646Z" }, + { url = 
"https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601, upload-time = "2024-10-18T15:21:23.499Z" }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274, upload-time = "2024-10-18T15:21:24.577Z" }, + { url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352, upload-time = "2024-10-18T15:21:25.382Z" }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122, upload-time = "2024-10-18T15:21:26.199Z" }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085, upload-time = "2024-10-18T15:21:27.029Z" }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978, upload-time = "2024-10-18T15:21:27.846Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208, upload-time = "2024-10-18T15:21:28.744Z" }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357, upload-time = "2024-10-18T15:21:29.545Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344, upload-time = "2024-10-18T15:21:30.366Z" }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101, upload-time = "2024-10-18T15:21:31.207Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603, upload-time = "2024-10-18T15:21:32.032Z" }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510, upload-time = "2024-10-18T15:21:33.625Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486, upload-time = "2024-10-18T15:21:34.611Z" }, + { url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480, upload-time = "2024-10-18T15:21:35.398Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914, upload-time = "2024-10-18T15:21:36.231Z" }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796, upload-time = "2024-10-18T15:21:37.073Z" }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473, upload-time = "2024-10-18T15:21:37.932Z" }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114, upload-time = "2024-10-18T15:21:39.799Z" }, + { url = 
"https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098, upload-time = "2024-10-18T15:21:40.813Z" }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208, upload-time = "2024-10-18T15:21:41.814Z" }, + { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" }, +] + +[[package]] +name = "moss" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "alembic" }, + { name = "celery" }, + { name = "concurrent-log-handler" }, + { name = "fastapi" }, + { name = "networkx" }, + { name = "psycopg-binary" }, + { name = "python-dotenv" }, + { name = "python-louvain" }, + { name = "redis" }, + { name = "requests" }, + { name = "sqlalchemy" }, + { name = "uvicorn", extra = ["standard"] }, +] + +[package.metadata] +requires-dist = [ + { name = "alembic", specifier = ">=1.15.2" }, + { name = "celery", specifier = ">=5.3.6" }, + { name = "concurrent-log-handler", specifier = ">=0.9.23" }, + { name = "fastapi", specifier = ">=0.115.12" }, + { name = "networkx", specifier = ">=3.0" }, + { name = "psycopg-binary", specifier = ">=3.2.8" }, + { name = "python-dotenv", specifier = ">=1.1.0" }, + { name = "python-louvain", specifier = ">=0.16" }, + { name = "redis", specifier = ">=5.0.4" }, + { name = "requests", specifier = ">=2.32.3" }, + { name = "sqlalchemy", specifier = ">=2.0.40" }, + { name = "uvicorn", extras = 
["standard"], specifier = ">=0.34.2" }, +] + +[[package]] +name = "networkx" +version = "3.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263, upload-time = "2024-10-21T12:39:36.247Z" }, +] + +[[package]] +name = "numpy" +version = "2.2.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/b2/ce4b867d8cd9c0ee84938ae1e6a6f7926ebf928c9090d036fc3c6a04f946/numpy-2.2.5.tar.gz", hash = "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", size = 20273920, upload-time = "2025-04-19T23:27:42.561Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/4e/3d9e6d16237c2aa5485695f0626cbba82f6481efca2e9132368dea3b885e/numpy-2.2.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1f4a922da1729f4c40932b2af4fe84909c7a6e167e6e99f71838ce3a29f3fe26", size = 21252117, upload-time = "2025-04-19T22:31:01.142Z" }, + { url = "https://files.pythonhosted.org/packages/38/e4/db91349d4079cd15c02ff3b4b8882a529991d6aca077db198a2f2a670406/numpy-2.2.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6f91524d31b34f4a5fee24f5bc16dcd1491b668798b6d85585d836c1e633a6a", size = 14424615, upload-time = "2025-04-19T22:31:24.873Z" }, + { url = "https://files.pythonhosted.org/packages/f8/59/6e5b011f553c37b008bd115c7ba7106a18f372588fbb1b430b7a5d2c41ce/numpy-2.2.5-cp310-cp310-macosx_14_0_arm64.whl", hash = 
"sha256:19f4718c9012e3baea91a7dba661dcab2451cda2550678dc30d53acb91a7290f", size = 5428691, upload-time = "2025-04-19T22:31:33.998Z" }, + { url = "https://files.pythonhosted.org/packages/a2/58/d5d70ebdac82b3a6ddf409b3749ca5786636e50fd64d60edb46442af6838/numpy-2.2.5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:eb7fd5b184e5d277afa9ec0ad5e4eb562ecff541e7f60e69ee69c8d59e9aeaba", size = 6965010, upload-time = "2025-04-19T22:31:45.281Z" }, + { url = "https://files.pythonhosted.org/packages/dc/a8/c290394be346d4e7b48a40baf292626fd96ec56a6398ace4c25d9079bc6a/numpy-2.2.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6413d48a9be53e183eb06495d8e3b006ef8f87c324af68241bbe7a39e8ff54c3", size = 14369885, upload-time = "2025-04-19T22:32:06.557Z" }, + { url = "https://files.pythonhosted.org/packages/c2/70/fed13c70aabe7049368553e81d7ca40f305f305800a007a956d7cd2e5476/numpy-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7451f92eddf8503c9b8aa4fe6aa7e87fd51a29c2cfc5f7dbd72efde6c65acf57", size = 16418372, upload-time = "2025-04-19T22:32:31.716Z" }, + { url = "https://files.pythonhosted.org/packages/04/ab/c3c14f25ddaecd6fc58a34858f6a93a21eea6c266ba162fa99f3d0de12ac/numpy-2.2.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0bcb1d057b7571334139129b7f941588f69ce7c4ed15a9d6162b2ea54ded700c", size = 15883173, upload-time = "2025-04-19T22:32:55.106Z" }, + { url = "https://files.pythonhosted.org/packages/50/18/f53710a19042911c7aca824afe97c203728a34b8cf123e2d94621a12edc3/numpy-2.2.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:36ab5b23915887543441efd0417e6a3baa08634308894316f446027611b53bf1", size = 18206881, upload-time = "2025-04-19T22:33:22.08Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ec/5b407bab82f10c65af5a5fe754728df03f960fd44d27c036b61f7b3ef255/numpy-2.2.5-cp310-cp310-win32.whl", hash = "sha256:422cc684f17bc963da5f59a31530b3936f57c95a29743056ef7a7903a5dbdf88", size = 6609852, 
upload-time = "2025-04-19T22:33:33.357Z" }, + { url = "https://files.pythonhosted.org/packages/b6/f5/467ca8675c7e6c567f571d8db942cc10a87588bd9e20a909d8af4171edda/numpy-2.2.5-cp310-cp310-win_amd64.whl", hash = "sha256:e4f0b035d9d0ed519c813ee23e0a733db81ec37d2e9503afbb6e54ccfdee0fa7", size = 12944922, upload-time = "2025-04-19T22:33:53.192Z" }, + { url = "https://files.pythonhosted.org/packages/f5/fb/e4e4c254ba40e8f0c78218f9e86304628c75b6900509b601c8433bdb5da7/numpy-2.2.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c42365005c7a6c42436a54d28c43fe0e01ca11eb2ac3cefe796c25a5f98e5e9b", size = 21256475, upload-time = "2025-04-19T22:34:24.174Z" }, + { url = "https://files.pythonhosted.org/packages/81/32/dd1f7084f5c10b2caad778258fdaeedd7fbd8afcd2510672811e6138dfac/numpy-2.2.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:498815b96f67dc347e03b719ef49c772589fb74b8ee9ea2c37feae915ad6ebda", size = 14461474, upload-time = "2025-04-19T22:34:46.578Z" }, + { url = "https://files.pythonhosted.org/packages/0e/65/937cdf238ef6ac54ff749c0f66d9ee2b03646034c205cea9b6c51f2f3ad1/numpy-2.2.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:6411f744f7f20081b1b4e7112e0f4c9c5b08f94b9f086e6f0adf3645f85d3a4d", size = 5426875, upload-time = "2025-04-19T22:34:56.281Z" }, + { url = "https://files.pythonhosted.org/packages/25/17/814515fdd545b07306eaee552b65c765035ea302d17de1b9cb50852d2452/numpy-2.2.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9de6832228f617c9ef45d948ec1cd8949c482238d68b2477e6f642c33a7b0a54", size = 6969176, upload-time = "2025-04-19T22:35:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/e5/32/a66db7a5c8b5301ec329ab36d0ecca23f5e18907f43dbd593c8ec326d57c/numpy-2.2.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:369e0d4647c17c9363244f3468f2227d557a74b6781cb62ce57cf3ef5cc7c610", size = 14374850, upload-time = "2025-04-19T22:35:31.347Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/c9/1bf6ada582eebcbe8978f5feb26584cd2b39f94ededeea034ca8f84af8c8/numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:262d23f383170f99cd9191a7c85b9a50970fe9069b2f8ab5d786eca8a675d60b", size = 16430306, upload-time = "2025-04-19T22:35:57.573Z" }, + { url = "https://files.pythonhosted.org/packages/6a/f0/3f741863f29e128f4fcfdb99253cc971406b402b4584663710ee07f5f7eb/numpy-2.2.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa70fdbdc3b169d69e8c59e65c07a1c9351ceb438e627f0fdcd471015cd956be", size = 15884767, upload-time = "2025-04-19T22:36:22.245Z" }, + { url = "https://files.pythonhosted.org/packages/98/d9/4ccd8fd6410f7bf2d312cbc98892e0e43c2fcdd1deae293aeb0a93b18071/numpy-2.2.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37e32e985f03c06206582a7323ef926b4e78bdaa6915095ef08070471865b906", size = 18219515, upload-time = "2025-04-19T22:36:49.822Z" }, + { url = "https://files.pythonhosted.org/packages/b1/56/783237243d4395c6dd741cf16eeb1a9035ee3d4310900e6b17e875d1b201/numpy-2.2.5-cp311-cp311-win32.whl", hash = "sha256:f5045039100ed58fa817a6227a356240ea1b9a1bc141018864c306c1a16d4175", size = 6607842, upload-time = "2025-04-19T22:37:01.624Z" }, + { url = "https://files.pythonhosted.org/packages/98/89/0c93baaf0094bdaaaa0536fe61a27b1dce8a505fa262a865ec142208cfe9/numpy-2.2.5-cp311-cp311-win_amd64.whl", hash = "sha256:b13f04968b46ad705f7c8a80122a42ae8f620536ea38cf4bdd374302926424dd", size = 12949071, upload-time = "2025-04-19T22:37:21.098Z" }, + { url = "https://files.pythonhosted.org/packages/e2/f7/1fd4ff108cd9d7ef929b8882692e23665dc9c23feecafbb9c6b80f4ec583/numpy-2.2.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", size = 20948633, upload-time = "2025-04-19T22:37:52.4Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/03/d443c278348371b20d830af155ff2079acad6a9e60279fac2b41dbbb73d8/numpy-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", size = 14176123, upload-time = "2025-04-19T22:38:15.058Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0b/5ca264641d0e7b14393313304da48b225d15d471250376f3fbdb1a2be603/numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", size = 5163817, upload-time = "2025-04-19T22:38:24.885Z" }, + { url = "https://files.pythonhosted.org/packages/04/b3/d522672b9e3d28e26e1613de7675b441bbd1eaca75db95680635dd158c67/numpy-2.2.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", size = 6698066, upload-time = "2025-04-19T22:38:35.782Z" }, + { url = "https://files.pythonhosted.org/packages/a0/93/0f7a75c1ff02d4b76df35079676b3b2719fcdfb39abdf44c8b33f43ef37d/numpy-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", size = 14087277, upload-time = "2025-04-19T22:38:57.697Z" }, + { url = "https://files.pythonhosted.org/packages/b0/d9/7c338b923c53d431bc837b5b787052fef9ae68a56fe91e325aac0d48226e/numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", size = 16135742, upload-time = "2025-04-19T22:39:22.689Z" }, + { url = "https://files.pythonhosted.org/packages/2d/10/4dec9184a5d74ba9867c6f7d1e9f2e0fb5fe96ff2bf50bb6f342d64f2003/numpy-2.2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", size = 15581825, upload-time = "2025-04-19T22:39:45.794Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/1f/2b6fcd636e848053f5b57712a7d1880b1565eec35a637fdfd0a30d5e738d/numpy-2.2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", size = 17899600, upload-time = "2025-04-19T22:40:13.427Z" }, + { url = "https://files.pythonhosted.org/packages/ec/87/36801f4dc2623d76a0a3835975524a84bd2b18fe0f8835d45c8eae2f9ff2/numpy-2.2.5-cp312-cp312-win32.whl", hash = "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", size = 6312626, upload-time = "2025-04-19T22:40:25.223Z" }, + { url = "https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", size = 12645715, upload-time = "2025-04-19T22:40:44.528Z" }, + { url = "https://files.pythonhosted.org/packages/e2/a0/0aa7f0f4509a2e07bd7a509042967c2fab635690d4f48c6c7b3afd4f448c/numpy-2.2.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", size = 20935102, upload-time = "2025-04-19T22:41:16.234Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e4/a6a9f4537542912ec513185396fce52cdd45bdcf3e9d921ab02a93ca5aa9/numpy-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", size = 14191709, upload-time = "2025-04-19T22:41:38.472Z" }, + { url = "https://files.pythonhosted.org/packages/be/65/72f3186b6050bbfe9c43cb81f9df59ae63603491d36179cf7a7c8d216758/numpy-2.2.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", size = 5149173, upload-time = "2025-04-19T22:41:47.823Z" }, + { url = "https://files.pythonhosted.org/packages/e5/e9/83e7a9432378dde5802651307ae5e9ea07bb72b416728202218cd4da2801/numpy-2.2.5-cp313-cp313-macosx_14_0_x86_64.whl", hash 
= "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", size = 6684502, upload-time = "2025-04-19T22:41:58.689Z" }, + { url = "https://files.pythonhosted.org/packages/ea/27/b80da6c762394c8ee516b74c1f686fcd16c8f23b14de57ba0cad7349d1d2/numpy-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", size = 14084417, upload-time = "2025-04-19T22:42:19.897Z" }, + { url = "https://files.pythonhosted.org/packages/aa/fc/ebfd32c3e124e6a1043e19c0ab0769818aa69050ce5589b63d05ff185526/numpy-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", size = 16133807, upload-time = "2025-04-19T22:42:44.433Z" }, + { url = "https://files.pythonhosted.org/packages/bf/9b/4cc171a0acbe4666f7775cfd21d4eb6bb1d36d3a0431f48a73e9212d2278/numpy-2.2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", size = 15575611, upload-time = "2025-04-19T22:43:09.928Z" }, + { url = "https://files.pythonhosted.org/packages/a3/45/40f4135341850df48f8edcf949cf47b523c404b712774f8855a64c96ef29/numpy-2.2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", size = 17895747, upload-time = "2025-04-19T22:43:36.983Z" }, + { url = "https://files.pythonhosted.org/packages/f8/4c/b32a17a46f0ffbde8cc82df6d3daeaf4f552e346df143e1b188a701a8f09/numpy-2.2.5-cp313-cp313-win32.whl", hash = "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", size = 6309594, upload-time = "2025-04-19T22:47:10.523Z" }, + { url = "https://files.pythonhosted.org/packages/13/ae/72e6276feb9ef06787365b05915bfdb057d01fceb4a43cb80978e518d79b/numpy-2.2.5-cp313-cp313-win_amd64.whl", hash = "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", size = 12638356, upload-time = 
"2025-04-19T22:47:30.253Z" }, + { url = "https://files.pythonhosted.org/packages/79/56/be8b85a9f2adb688e7ded6324e20149a03541d2b3297c3ffc1a73f46dedb/numpy-2.2.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", size = 20963778, upload-time = "2025-04-19T22:44:09.251Z" }, + { url = "https://files.pythonhosted.org/packages/ff/77/19c5e62d55bff507a18c3cdff82e94fe174957bad25860a991cac719d3ab/numpy-2.2.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", size = 14207279, upload-time = "2025-04-19T22:44:31.383Z" }, + { url = "https://files.pythonhosted.org/packages/75/22/aa11f22dc11ff4ffe4e849d9b63bbe8d4ac6d5fae85ddaa67dfe43be3e76/numpy-2.2.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", size = 5199247, upload-time = "2025-04-19T22:44:40.361Z" }, + { url = "https://files.pythonhosted.org/packages/4f/6c/12d5e760fc62c08eded0394f62039f5a9857f758312bf01632a81d841459/numpy-2.2.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", size = 6711087, upload-time = "2025-04-19T22:44:51.188Z" }, + { url = "https://files.pythonhosted.org/packages/ef/94/ece8280cf4218b2bee5cec9567629e61e51b4be501e5c6840ceb593db945/numpy-2.2.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", size = 14059964, upload-time = "2025-04-19T22:45:12.451Z" }, + { url = "https://files.pythonhosted.org/packages/39/41/c5377dac0514aaeec69115830a39d905b1882819c8e65d97fc60e177e19e/numpy-2.2.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", size = 16121214, upload-time = "2025-04-19T22:45:37.734Z" }, + { url = 
"https://files.pythonhosted.org/packages/db/54/3b9f89a943257bc8e187145c6bc0eb8e3d615655f7b14e9b490b053e8149/numpy-2.2.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", size = 15575788, upload-time = "2025-04-19T22:46:01.908Z" }, + { url = "https://files.pythonhosted.org/packages/b1/c4/2e407e85df35b29f79945751b8f8e671057a13a376497d7fb2151ba0d290/numpy-2.2.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", size = 17893672, upload-time = "2025-04-19T22:46:28.585Z" }, + { url = "https://files.pythonhosted.org/packages/29/7e/d0b44e129d038dba453f00d0e29ebd6eaf2f06055d72b95b9947998aca14/numpy-2.2.5-cp313-cp313t-win32.whl", hash = "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", size = 6377102, upload-time = "2025-04-19T22:46:39.949Z" }, + { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload-time = "2025-04-19T22:47:00.147Z" }, + { url = "https://files.pythonhosted.org/packages/35/e4/5ef5ef1d4308f96961198b2323bfc7c7afb0ccc0d623b01c79bc87ab496d/numpy-2.2.5-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b4ea7e1cff6784e58fe281ce7e7f05036b3e1c89c6f922a6bfbc0a7e8768adbe", size = 21083404, upload-time = "2025-04-19T22:48:01.605Z" }, + { url = "https://files.pythonhosted.org/packages/a3/5f/bde9238e8e977652a16a4b114ed8aa8bb093d718c706eeecb5f7bfa59572/numpy-2.2.5-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:d7543263084a85fbc09c704b515395398d31d6395518446237eac219eab9e55e", size = 6828578, upload-time = "2025-04-19T22:48:13.118Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/7f/813f51ed86e559ab2afb6a6f33aa6baf8a560097e25e4882a938986c76c2/numpy-2.2.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0255732338c4fdd00996c0421884ea8a3651eea555c3a56b84892b66f696eb70", size = 16234796, upload-time = "2025-04-19T22:48:37.102Z" }, + { url = "https://files.pythonhosted.org/packages/68/67/1175790323026d3337cc285cc9c50eca637d70472b5e622529df74bb8f37/numpy-2.2.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d2e3bdadaba0e040d1e7ab39db73e0afe2c74ae277f5614dad53eadbecbbb169", size = 12859001, upload-time = "2025-04-19T22:48:57.665Z" }, +] + +[[package]] +name = "portalocker" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/91/8bfe23e1f7f630f2061ef38b5225d9fda9068d6a30fcbc187951e678e630/portalocker-3.1.1.tar.gz", hash = "sha256:ec20f6dda2ad9ce89fa399a5f31f4f1495f515958f0cb7ca6543cef7bb5a749e", size = 43708, upload-time = "2024-12-31T14:22:48.535Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/60/1974cfdd5bb770568ddc6f89f3e0df4cfdd1acffd5a609dff5e95f48c6e2/portalocker-3.1.1-py3-none-any.whl", hash = "sha256:80e984e24de292ff258a5bea0e4f3f778fff84c0ae1275dbaebc4658de4aacb3", size = 19661, upload-time = "2024-12-31T14:22:47.019Z" }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.51" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bb/6e/9d084c929dfe9e3bfe0c6a47e31f78a25c54627d64a66e884a8bf5474f1c/prompt_toolkit-3.0.51.tar.gz", hash = "sha256:931a162e3b27fc90c86f1b48bb1fb2c528c2761475e57c9c06de13311c7b54ed", size = 428940, upload-time = "2025-04-15T09:18:47.731Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07", size = 387810, upload-time = "2025-04-15T09:18:44.753Z" }, +] + +[[package]] +name = "psycopg-binary" +version = "3.2.8" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/4e/f753d7b5a8a63e5884adde8a45e5a99be5c219ff4484761af923a0619b47/psycopg_binary-3.2.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0694548e1633c2ea819406c5bfd297bf1b4f6f8638dec0d639ab9764fdebcb2a", size = 4033084, upload-time = "2025-05-11T17:15:49.386Z" }, + { url = "https://files.pythonhosted.org/packages/af/d3/94c9509011244a0b5518c77caab7ff4f8c36d0ee66a6125ce06692a32b62/psycopg_binary-3.2.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:85851cdc18b514f80790f711a25406515b42f6b64e9a5d3940ae399e3b0e2c23", size = 4082142, upload-time = "2025-05-11T17:15:55.043Z" }, + { url = "https://files.pythonhosted.org/packages/ea/a0/6e1e21777c6eb65bc0152671db707ac73068079706a2e1375265529aa942/psycopg_binary-3.2.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:040c2a768bd9ae572421ee5695a6299e08147dd44bc8ac514961323dc5c31a62", size = 4678993, upload-time = "2025-05-11T17:16:02.8Z" }, + { url = "https://files.pythonhosted.org/packages/ca/6e/fc78d0fcc620c983bd6fcd41ba504c6513640cb11c3cec5f29f788768603/psycopg_binary-3.2.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bdb5567e81374734539f7b7deb9d547271585ec42a7866ea06bffa58fa5cd5a", size = 4500118, upload-time = "2025-05-11T17:16:09.636Z" }, + { url = "https://files.pythonhosted.org/packages/c8/1c/a2325279cf4e085e8f09f1c0a1a405802406140b6125d2c960987f5265a0/psycopg_binary-3.2.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:289d2575edc00391c4bf586048701638126f396a76db83f36463d1c2b3495aae", size = 4766984, upload-time = "2025-05-11T17:16:14.237Z" }, + { url = "https://files.pythonhosted.org/packages/db/b0/4311b96362c0451ca037a363db1bb3769f03b8ea5a0459b69f924eb786a7/psycopg_binary-3.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c3a3b330c44e01ee29b3b76ddbb86890fbaf7e4b2f9abd43220d050642edee3", size = 4461989, upload-time = "2025-05-11T17:16:18.015Z" }, + { url = "https://files.pythonhosted.org/packages/84/cc/f8ba7eddfa61460713c88130843da65fa5ecbe85108a4a5b4261cef01a38/psycopg_binary-3.2.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:814d533e6a8359c2962e28a36fad2698c15639716459fe1100e859b6173c3b6d", size = 3777949, upload-time = "2025-05-11T17:16:22.003Z" }, + { url = "https://files.pythonhosted.org/packages/8e/9c/7398af2ad041fe278e0b98edcb2ee5dd176500ff24a51fd3f0296f29886a/psycopg_binary-3.2.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b67f78f75b033d8833ec40432c28610c275455e0172762919912a5e6b9db6366", size = 3337502, upload-time = "2025-05-11T17:16:25.996Z" }, + { url = "https://files.pythonhosted.org/packages/94/a0/308b4720c0b8d63ce96253f288d0ad7a36508d7d457d61ebb3ffaf3c494a/psycopg_binary-3.2.8-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:b98f7dc1ed83889803d0df2d327c94c95a487b9976215c3e9adb0dbb7a220d76", size = 3440809, upload-time = "2025-05-11T17:16:30.095Z" }, + { url = "https://files.pythonhosted.org/packages/51/3e/1f16b908a903ac5adb3af4d3b2643cda334928bd530b8618df262d89baf2/psycopg_binary-3.2.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a9c54bd5d91c6e1cc1e6f9127f175ce3162d8435cf8d4715149598c9baab4ff5", size = 3497231, upload-time = "2025-05-11T17:16:34.39Z" }, + { url = "https://files.pythonhosted.org/packages/1e/d1/4e09eda60266ef96f5c8f061d43b413040bfcb469b715078c7b55d6d53fd/psycopg_binary-3.2.8-cp310-cp310-win_amd64.whl", hash = 
"sha256:2aba18f57da97b96ea9a6663c8982038a9d4a47b1f94f004ffa9491bd7d21160", size = 3782900, upload-time = "2025-05-11T17:16:38.937Z" }, + { url = "https://files.pythonhosted.org/packages/31/40/87bbdef58f347b54241a9df97f4870cde4083e8611b0e9404af9ed2fbeb3/psycopg_binary-3.2.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:076bd384a0d8bb7a59514b0d62bb75b48f83955a32ebec408b08db0e51bb06e5", size = 4040776, upload-time = "2025-05-11T17:16:43.159Z" }, + { url = "https://files.pythonhosted.org/packages/f9/2b/c7927dc71f570a8d7da0b0582c8c8a937aaa154a62bae5119377a9532ba8/psycopg_binary-3.2.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f162a44ed7e06ed075cbc9dfda23850a7f702c44af4b62061e9c83430130ff36", size = 4087603, upload-time = "2025-05-11T17:16:47.151Z" }, + { url = "https://files.pythonhosted.org/packages/99/a7/34c8eb1762ab4e27321992febff0589f994dd50ef0f457bc9fa42573ecbc/psycopg_binary-3.2.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e450989848bb63315e1768e6c6026cfdf6f72450c3752ce9f6e307c1d62b8d", size = 4676528, upload-time = "2025-05-11T17:16:52.587Z" }, + { url = "https://files.pythonhosted.org/packages/91/b0/54e4175b4113d46c172ac7423c0270cae4f947456b69ec7ceba966869c92/psycopg_binary-3.2.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:90c0f2c88578db2bbeea98cd10fcb6f635c0b5bdd23ae90a931716589094ed08", size = 4495671, upload-time = "2025-05-11T17:16:57.58Z" }, + { url = "https://files.pythonhosted.org/packages/8e/ab/1cb155dd800584547f0b282ecb0db16dd96e309b1d6e9fee28ecf18a7886/psycopg_binary-3.2.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:75a929759a498b1b59481091da731f928e0cdbd3d7393b8a1022a1b57f01a91a", size = 4768129, upload-time = "2025-05-11T17:17:01.741Z" }, + { url = "https://files.pythonhosted.org/packages/5b/09/3ea950dea55a5e6aaba6b15baffd121e08ad3adfaa47308593301fd1f979/psycopg_binary-3.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash 
= "sha256:d310d188bb349a5f66cc037f7416fd640ca9847d0083a63ba6c091fd45075482", size = 4458392, upload-time = "2025-05-11T17:17:10.136Z" }, + { url = "https://files.pythonhosted.org/packages/d0/a4/c8ee70d5ca48d0f8447d986727a163c72b49f884d4206463e7711734943b/psycopg_binary-3.2.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f4965bc9d2ef8eed31ff411840e2ab0e1d0c1c59575e0154ced7b652ef0eaa33", size = 3776879, upload-time = "2025-05-11T17:17:16.614Z" }, + { url = "https://files.pythonhosted.org/packages/71/b9/e5a92b9dffe503f199018e784f2171dbf059136ea8be052eda1e0d81185e/psycopg_binary-3.2.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5f1c26c1213efba8102911099af2203db6859855f7ceba21fd941e6d2bc7e84e", size = 3333329, upload-time = "2025-05-11T17:17:20.998Z" }, + { url = "https://files.pythonhosted.org/packages/a6/b1/61aefcc3b38fa970c0ed2530cd42440707550b273bbaf26f6f51a34872a4/psycopg_binary-3.2.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:58c5c7ef4daaaefb1e656a307ceb61aa3a101a5eb843004579d423428bef66e5", size = 3435684, upload-time = "2025-05-11T17:17:24.326Z" }, + { url = "https://files.pythonhosted.org/packages/4b/51/c3bf340054e999fafdba6b114c7f1cddeb71c53de1bba3ff1571ae9b96b9/psycopg_binary-3.2.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4f501ee2b41a153aee59a3a5db238718f801ac39eec54ad3f28fbe657002e944", size = 3497123, upload-time = "2025-05-11T17:17:28.633Z" }, + { url = "https://files.pythonhosted.org/packages/9a/83/8b7131d778d9e57d332f7bc174411a5987da2e36e6fcac3838794e6152aa/psycopg_binary-3.2.8-cp311-cp311-win_amd64.whl", hash = "sha256:fe51d8297bc8c178be1cc0ac6c060bfd706afb5cb04e794a44feae27c0afe6f4", size = 3785752, upload-time = "2025-05-11T17:17:32.838Z" }, + { url = "https://files.pythonhosted.org/packages/06/8e/d4ec28505cc1694bc3d9bbb329864fa9ca13f236bf78b16da092b9a99595/psycopg_binary-3.2.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1c330b86bc5ea67fee529d3c7b53c6394f8cacad77a3214c50fce0d5bdbc10cf", size = 
4022230, upload-time = "2025-05-11T17:17:37.381Z" }, + { url = "https://files.pythonhosted.org/packages/d0/58/ee9bbecdf02f3f2c4beaef7764438fc2f468bb72fc6bfbe570ad6359f6e6/psycopg_binary-3.2.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9ce4e637ac339bfe583ac26e18232c33f9039c93cfc01adaec550cb5e8a03f87", size = 4083799, upload-time = "2025-05-11T17:17:41.519Z" }, + { url = "https://files.pythonhosted.org/packages/bc/da/3c52acf0e267d128bb066e53add32cbc71a2f82d523f1748e3ca530c913c/psycopg_binary-3.2.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:272ee7cd175996c7262f7ffb561593829b448032a52c545d844bc6a4fb77b078", size = 4655046, upload-time = "2025-05-11T17:17:46.134Z" }, + { url = "https://files.pythonhosted.org/packages/58/9b/b2ef57c791f098805299da38a0cb6929aff94e7056f5be2721d6739c6e60/psycopg_binary-3.2.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7237b1abcc36c04b45916c983a6c3d799104201f72475eab367874a5f37d3e7", size = 4477969, upload-time = "2025-05-11T17:17:50.661Z" }, + { url = "https://files.pythonhosted.org/packages/1f/d9/be82b51b12ea514573cd249eab01e59949a8f4db33a10e832cff0217eef1/psycopg_binary-3.2.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c9a30a1d8338823603cf064637aae5580c41ed95675c7aee6a47165784d0464", size = 4737511, upload-time = "2025-05-11T17:17:55.586Z" }, + { url = "https://files.pythonhosted.org/packages/14/14/386413b8cf41d8bc921dd8e749a8e7cf9c5439e61849caa2511d265d699d/psycopg_binary-3.2.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f27d5ae05062f8ea0da6c11262ba8a1ab70864b1c18ea65d9e61636a8c72da4", size = 4436158, upload-time = "2025-05-11T17:18:00.181Z" }, + { url = "https://files.pythonhosted.org/packages/b9/a8/757a5d85a38e3c2bd9b580d2911d7af3eb3a97818a115a82c1854707f2e1/psycopg_binary-3.2.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:10fa234801b9b8b23799f869300c632a3298fb8daecd2d5734d08ab76e7a17cb", 
size = 3753518, upload-time = "2025-05-11T17:18:04.559Z" }, + { url = "https://files.pythonhosted.org/packages/0a/52/7b38e6a81d97aeacdb58cb73ca9cd29514071409ec7bd8b301bed97df199/psycopg_binary-3.2.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b055dba7df07c39f6a40a71862bf5525320350e3bd4c6d1809342fb7061d111f", size = 3313599, upload-time = "2025-05-11T17:18:10.247Z" }, + { url = "https://files.pythonhosted.org/packages/83/77/e74d3f5dcdd94858b5f6e255fd7cab5a7cdc5e9812b08faf3ae88a9b30ba/psycopg_binary-3.2.8-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8c36b8d3f76e2831f3b33f34226952ed39d1d6a79cb2ca2bf044f28df9c6b5f0", size = 3407291, upload-time = "2025-05-11T17:18:15.932Z" }, + { url = "https://files.pythonhosted.org/packages/fd/30/3d0a5931dacd5faeb94136d26a5cdbcd6bc4fa0005e71e6932b86f34db2e/psycopg_binary-3.2.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:764f9163ad9cfd32abd2d06f3000a52faf7a2b2411801d681ebe9158d72b46d5", size = 3472496, upload-time = "2025-05-11T17:18:20.318Z" }, + { url = "https://files.pythonhosted.org/packages/3f/2d/21663d776fdbb3f49b581d9be5137aef9fe5d7dee750ee8085d383449d3a/psycopg_binary-3.2.8-cp312-cp312-win_amd64.whl", hash = "sha256:d8fa6fec9f7e225458d0031c43dd6d20673f55953eebe539d37e4b94b8831984", size = 3773878, upload-time = "2025-05-11T17:18:24.673Z" }, + { url = "https://files.pythonhosted.org/packages/e8/0c/6a29d13d947021e200b5933858a1399a45587bc2e698a2864622e454e84d/psycopg_binary-3.2.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84f03982598a6353cf70cafae34c16da28eac74ba9862cc740b6ba0dcf9721fc", size = 4017121, upload-time = "2025-05-11T17:18:29.089Z" }, + { url = "https://files.pythonhosted.org/packages/7b/2d/49b881a66b8264ae8f9cb60db588838a97f12d2c8355bbbe6966539895d9/psycopg_binary-3.2.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d247f55b28afc4a87b77240e733419ad0c82be2ec122a0b93fbb227ee0e6608e", size = 4080326, upload-time = "2025-05-11T17:18:33.424Z" }, + { url = 
"https://files.pythonhosted.org/packages/44/bd/3752c86f6819797c722b48af3513837d1c31accc2216ebe5c02f857ff6aa/psycopg_binary-3.2.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89eb0c15c0eec1c81256e9df3c01d9bd1067f4365872f6f81da7521ab30e19de", size = 4655096, upload-time = "2025-05-11T17:18:37.883Z" }, + { url = "https://files.pythonhosted.org/packages/fe/c8/ee544b8a73b52ab5b91ff36274f48628204b6f2edafdbe1f47a5473ee4c4/psycopg_binary-3.2.8-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aef90bdc201f2d375e5996d44124c588d3a7ce9f67c79f30531cdc5ead2c3d", size = 4482112, upload-time = "2025-05-11T17:18:42.75Z" }, + { url = "https://files.pythonhosted.org/packages/dc/f1/5d83d6069c0e69fd623088022f08bcaab3af39ca82be82846278f83ff6ea/psycopg_binary-3.2.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b60a17eca6a6906af8084c518be81bd71a3d50ddc69c0dc667d6ce9b8f4d8604", size = 4737683, upload-time = "2025-05-11T17:18:47.579Z" }, + { url = "https://files.pythonhosted.org/packages/84/19/2e1df0c4e30ec95d7c553507329661400f2deed7f54734196ce9fb6257aa/psycopg_binary-3.2.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8297d92f41e19b6794b04bdf7d53938a5ad8e68f7105b50048a078477b7ee4b8", size = 4437422, upload-time = "2025-05-11T17:18:52.811Z" }, + { url = "https://files.pythonhosted.org/packages/ad/8c/491827d42ebca49b3478b66ee160ba3055f3122eb27db33de8606d02e1e4/psycopg_binary-3.2.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a547d53e615776f8e79768aacd7a12c6f0131fa1d6820d2e3e848261b0ad3849", size = 3758667, upload-time = "2025-05-11T17:18:57.438Z" }, + { url = "https://files.pythonhosted.org/packages/09/55/617735f4110cc0d0e5e24a42e738f9d3ea73a00d9e88d57a657af0b7cb5f/psycopg_binary-3.2.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:058cfd606f1dc0be9b5a80d208fb9b487f7b4986a955322cbb45cee7e3e8056e", size = 3320577, upload-time = "2025-05-11T17:19:01.713Z" }, + { 
url = "https://files.pythonhosted.org/packages/88/97/69300bf1354c43bba633826ebd82a1c804541679e4ab53b96bb0eaafe4fb/psycopg_binary-3.2.8-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:15d21ed3292fb19b6ab096c3522d561d196eeef3903c31f1318df7478eb96fa5", size = 3411439, upload-time = "2025-05-11T17:19:06.088Z" }, + { url = "https://files.pythonhosted.org/packages/14/64/5a0aa4c3ddfbf6530b24aecff97e3eb9a0eedf67c61a0ff1dd95d847f5c7/psycopg_binary-3.2.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6384f81c33a369144e4b98cbb4bf3ec4ac102ae11cfb84e70cf99aa43a44925", size = 3477479, upload-time = "2025-05-11T17:19:09.624Z" }, + { url = "https://files.pythonhosted.org/packages/50/33/f08b2d0b6608e51f013fa877bcc296baaac653b1658d7f1e35c6793fece4/psycopg_binary-3.2.8-cp313-cp313-win_amd64.whl", hash = "sha256:60db59a0f1676f70c027a8273b7b360af85ef87bf43cd49eb63727b72a170a9f", size = 3774539, upload-time = "2025-05-11T17:19:16.679Z" }, +] + +[[package]] +name = "pydantic" +version = "2.11.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/ab/5250d56ad03884ab5efd07f734203943c8a8ab40d551e208af81d0257bf2/pydantic-2.11.4.tar.gz", hash = "sha256:32738d19d63a226a52eed76645a98ee07c1f410ee41d93b4afbfa85ed8111c2d", size = 786540, upload-time = "2025-04-29T20:38:55.02Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/12/46b65f3534d099349e38ef6ec98b1a5a81f42536d17e0ba382c28c67ba67/pydantic-2.11.4-py3-none-any.whl", hash = "sha256:d9615eaa9ac5a063471da949c8fc16376a84afb5024688b3ff885693506764eb", size = 443900, upload-time = "2025-04-29T20:38:52.724Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.33.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" }, + { url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" }, + { url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" }, + { url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" }, + { url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" }, + { url = 
"https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" }, + { url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" }, + { url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" }, + { url = "https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" }, + { url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" }, + { url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" }, + { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" }, + { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, + { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, + { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" }, + { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, + { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, + { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, + { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, + { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" }, + { url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" }, + { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" }, + { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, 
upload-time = "2025-04-23T18:31:41.034Z" }, + { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" }, + { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" }, + { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" }, + { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, + { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, + { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, + { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, + { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, + { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, + { 
url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, + { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, + { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, + { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, + { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, + { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, + { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" }, + { url = 
"https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" }, + { url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" }, + { url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" }, + { url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" }, + { url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" }, + { url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = 
"2025-04-23T18:33:08.44Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" }, + { url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" }, + { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, + { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, + { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, + { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = 
"2025-04-23T18:33:20.475Z" }, + { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, + { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, + { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, + { url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, + { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = 
"sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920, upload-time = "2025-03-25T10:14:56.835Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256, upload-time = "2025-03-25T10:14:55.034Z" }, +] + +[[package]] +name = "python-louvain" +version = "0.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "networkx" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/0d/8787b021d52eb8764c0bb18ab95f720cf554902044c6a5cb1865daf45763/python-louvain-0.16.tar.gz", hash = "sha256:b7ba2df5002fd28d3ee789a49532baad11fe648e4f2117cf0798e7520a1da56b", size = 204641, upload-time = "2022-01-29T15:53:03.532Z" } + +[[package]] +name = "pywin32" +version = "310" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/da/a5f38fffbba2fb99aa4aa905480ac4b8e83ca486659ac8c95bce47fb5276/pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1", size = 
8848240, upload-time = "2025-03-17T00:55:46.783Z" }, + { url = "https://files.pythonhosted.org/packages/aa/fe/d873a773324fa565619ba555a82c9dabd677301720f3660a731a5d07e49a/pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d", size = 9601854, upload-time = "2025-03-17T00:55:48.783Z" }, + { url = "https://files.pythonhosted.org/packages/3c/84/1a8e3d7a15490d28a5d816efa229ecb4999cdc51a7c30dd8914f669093b8/pywin32-310-cp310-cp310-win_arm64.whl", hash = "sha256:33babed0cf0c92a6f94cc6cc13546ab24ee13e3e800e61ed87609ab91e4c8213", size = 8522963, upload-time = "2025-03-17T00:55:50.969Z" }, + { url = "https://files.pythonhosted.org/packages/f7/b1/68aa2986129fb1011dabbe95f0136f44509afaf072b12b8f815905a39f33/pywin32-310-cp311-cp311-win32.whl", hash = "sha256:1e765f9564e83011a63321bb9d27ec456a0ed90d3732c4b2e312b855365ed8bd", size = 8784284, upload-time = "2025-03-17T00:55:53.124Z" }, + { url = "https://files.pythonhosted.org/packages/b3/bd/d1592635992dd8db5bb8ace0551bc3a769de1ac8850200cfa517e72739fb/pywin32-310-cp311-cp311-win_amd64.whl", hash = "sha256:126298077a9d7c95c53823934f000599f66ec9296b09167810eb24875f32689c", size = 9520748, upload-time = "2025-03-17T00:55:55.203Z" }, + { url = "https://files.pythonhosted.org/packages/90/b1/ac8b1ffce6603849eb45a91cf126c0fa5431f186c2e768bf56889c46f51c/pywin32-310-cp311-cp311-win_arm64.whl", hash = "sha256:19ec5fc9b1d51c4350be7bb00760ffce46e6c95eaf2f0b2f1150657b1a43c582", size = 8455941, upload-time = "2025-03-17T00:55:57.048Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ec/4fdbe47932f671d6e348474ea35ed94227fb5df56a7c30cbbb42cd396ed0/pywin32-310-cp312-cp312-win32.whl", hash = "sha256:8a75a5cc3893e83a108c05d82198880704c44bbaee4d06e442e471d3c9ea4f3d", size = 8796239, upload-time = "2025-03-17T00:55:58.807Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/e5/b0627f8bb84e06991bea89ad8153a9e50ace40b2e1195d68e9dff6b03d0f/pywin32-310-cp312-cp312-win_amd64.whl", hash = "sha256:bf5c397c9a9a19a6f62f3fb821fbf36cac08f03770056711f765ec1503972060", size = 9503839, upload-time = "2025-03-17T00:56:00.8Z" }, + { url = "https://files.pythonhosted.org/packages/1f/32/9ccf53748df72301a89713936645a664ec001abd35ecc8578beda593d37d/pywin32-310-cp312-cp312-win_arm64.whl", hash = "sha256:2349cc906eae872d0663d4d6290d13b90621eaf78964bb1578632ff20e152966", size = 8459470, upload-time = "2025-03-17T00:56:02.601Z" }, + { url = "https://files.pythonhosted.org/packages/1c/09/9c1b978ffc4ae53999e89c19c77ba882d9fce476729f23ef55211ea1c034/pywin32-310-cp313-cp313-win32.whl", hash = "sha256:5d241a659c496ada3253cd01cfaa779b048e90ce4b2b38cd44168ad555ce74ab", size = 8794384, upload-time = "2025-03-17T00:56:04.383Z" }, + { url = "https://files.pythonhosted.org/packages/45/3c/b4640f740ffebadd5d34df35fecba0e1cfef8fde9f3e594df91c28ad9b50/pywin32-310-cp313-cp313-win_amd64.whl", hash = "sha256:667827eb3a90208ddbdcc9e860c81bde63a135710e21e4cb3348968e4bd5249e", size = 9503039, upload-time = "2025-03-17T00:56:06.207Z" }, + { url = "https://files.pythonhosted.org/packages/b4/f4/f785020090fb050e7fb6d34b780f2231f302609dc964672f72bfaeb59a28/pywin32-310-cp313-cp313-win_arm64.whl", hash = "sha256:e308f831de771482b7cf692a1f308f8fca701b2d8f9dde6cc440c7da17e47b33", size = 8458152, upload-time = "2025-03-17T00:56:07.819Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199, upload-time = "2024-08-06T20:31:40.178Z" }, + { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758, upload-time = "2024-08-06T20:31:42.173Z" }, + { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463, upload-time = "2024-08-06T20:31:44.263Z" }, + { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280, upload-time = "2024-08-06T20:31:50.199Z" }, + { url = "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239, upload-time = "2024-08-06T20:31:52.292Z" }, + { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802, upload-time = "2024-08-06T20:31:53.836Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527, upload-time = "2024-08-06T20:31:55.565Z" }, + { url = "https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052, upload-time = "2024-08-06T20:31:56.914Z" }, + { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774, upload-time = "2024-08-06T20:31:58.304Z" }, + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload-time = "2024-08-06T20:32:03.408Z" }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload-time = "2024-08-06T20:32:04.926Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload-time = "2024-08-06T20:32:06.459Z" }, + { url = 
"https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload-time = "2024-08-06T20:32:08.338Z" }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload-time = "2024-08-06T20:32:14.124Z" }, + { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload-time = "2024-08-06T20:32:16.17Z" }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload-time = "2024-08-06T20:32:18.555Z" }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850, upload-time = "2024-08-06T20:32:19.889Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload-time = "2024-08-06T20:32:21.273Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload-time = "2024-08-06T20:32:30.058Z" }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload-time = "2024-08-06T20:32:31.881Z" }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload-time = "2024-08-06T20:32:37.083Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = "2024-08-06T20:32:38.898Z" }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" }, + { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" }, + { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload-time = "2024-08-06T20:32:51.188Z" }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload-time = "2024-08-06T20:32:53.019Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload-time = "2024-08-06T20:32:54.708Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" }, + { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" }, + { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, +] + +[[package]] +name = "redis" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { 
name = "async-timeout", marker = "python_full_version < '3.11.3'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/12/dffaaa4374b8d5f3b7ff5c40025c9db387e06264302d5a9da6043cd84e1f/redis-6.0.0.tar.gz", hash = "sha256:5446780d2425b787ed89c91ddbfa1be6d32370a636c8fdb687f11b1c26c1fa88", size = 4620969, upload-time = "2025-04-30T19:09:30.798Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/c8/68081c9d3531f7b2a4d663326b96a9dcbc2aef47df3c6b5c38dea90dff02/redis-6.0.0-py3-none-any.whl", hash = "sha256:a2e040aee2cdd947be1fa3a32e35a956cd839cc4c1dbbe4b2cdee5b9623fd27c", size = 268950, upload-time = "2025-04-30T19:09:28.432Z" }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218, upload-time = "2024-05-29T15:37:49.536Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928, upload-time = "2024-05-29T15:37:47.027Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "sqlalchemy" +version = "2.0.40" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet", marker = "(python_full_version < '3.14' and platform_machine == 'AMD64') or (python_full_version < '3.14' and platform_machine == 'WIN32') or (python_full_version < '3.14' and platform_machine == 'aarch64') or (python_full_version < '3.14' and platform_machine == 'amd64') or (python_full_version < '3.14' and platform_machine == 'ppc64le') or (python_full_version < '3.14' and platform_machine == 'win32') or (python_full_version < '3.14' and platform_machine == 'x86_64')" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/c3/3f2bfa5e4dcd9938405fe2fab5b6ab94a9248a4f9536ea2fd497da20525f/sqlalchemy-2.0.40.tar.gz", hash = "sha256:d827099289c64589418ebbcaead0145cd19f4e3e8a93919a0100247af245fa00", size = 9664299, upload-time = "2025-03-27T17:52:31.876Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/87/fa/8e8fd93684b04e65816be864bebf0000fe1602e5452d006f9acc5db14ce5/sqlalchemy-2.0.40-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f1ea21bef99c703f44444ad29c2c1b6bd55d202750b6de8e06a955380f4725d7", size = 2112843, upload-time = "2025-03-27T18:49:25.515Z" }, + { url = "https://files.pythonhosted.org/packages/ba/87/06992f78a9ce545dfd1fea3dd99262bec5221f6f9d2d2066c3e94662529f/sqlalchemy-2.0.40-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:afe63b208153f3a7a2d1a5b9df452b0673082588933e54e7c8aac457cf35e758", size = 2104032, upload-time = "2025-03-27T18:49:28.098Z" }, + { url = "https://files.pythonhosted.org/packages/92/ee/57dc77282e8be22d686bd4681825299aa1069bbe090564868ea270ed5214/sqlalchemy-2.0.40-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8aae085ea549a1eddbc9298b113cffb75e514eadbb542133dd2b99b5fb3b6af", size = 3086406, upload-time = "2025-03-27T18:44:25.302Z" }, + { url = "https://files.pythonhosted.org/packages/94/3f/ceb9ab214b2e42d2e74a9209b3a2f2f073504eee16cddd2df81feeb67c2f/sqlalchemy-2.0.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ea9181284754d37db15156eb7be09c86e16e50fbe77610e9e7bee09291771a1", size = 3094652, upload-time = "2025-03-27T18:55:16.174Z" }, + { url = "https://files.pythonhosted.org/packages/00/0a/3401232a5b6d91a2df16c1dc39c6504c54575744c2faafa1e5a50de96621/sqlalchemy-2.0.40-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5434223b795be5c5ef8244e5ac98056e290d3a99bdcc539b916e282b160dda00", size = 3050503, upload-time = "2025-03-27T18:44:28.266Z" }, + { url = "https://files.pythonhosted.org/packages/93/c2/ea7171415ab131397f71a2673645c2fe29ebe9a93063d458eb89e42bf051/sqlalchemy-2.0.40-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:15d08d5ef1b779af6a0909b97be6c1fd4298057504eb6461be88bd1696cb438e", size = 3076011, upload-time = "2025-03-27T18:55:17.967Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/ee/d8e229280d621bed8c51eebf1dd413aa09ca89e309b1fff40d881dd149af/sqlalchemy-2.0.40-cp310-cp310-win32.whl", hash = "sha256:cd2f75598ae70bcfca9117d9e51a3b06fe29edd972fdd7fd57cc97b4dbf3b08a", size = 2085136, upload-time = "2025-03-27T18:48:53.032Z" }, + { url = "https://files.pythonhosted.org/packages/60/7f/ea1086136bc648cd4713a1e01869f7fc31979d67b3a8f973f5d9ab8de7e1/sqlalchemy-2.0.40-cp310-cp310-win_amd64.whl", hash = "sha256:2cbafc8d39ff1abdfdda96435f38fab141892dc759a2165947d1a8fffa7ef596", size = 2109421, upload-time = "2025-03-27T18:48:54.258Z" }, + { url = "https://files.pythonhosted.org/packages/77/7e/55044a9ec48c3249bb38d5faae93f09579c35e862bb318ebd1ed7a1994a5/sqlalchemy-2.0.40-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f6bacab7514de6146a1976bc56e1545bee247242fab030b89e5f70336fc0003e", size = 2114025, upload-time = "2025-03-27T18:49:29.456Z" }, + { url = "https://files.pythonhosted.org/packages/77/0f/dcf7bba95f847aec72f638750747b12d37914f71c8cc7c133cf326ab945c/sqlalchemy-2.0.40-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5654d1ac34e922b6c5711631f2da497d3a7bffd6f9f87ac23b35feea56098011", size = 2104419, upload-time = "2025-03-27T18:49:30.75Z" }, + { url = "https://files.pythonhosted.org/packages/75/70/c86a5c20715e4fe903dde4c2fd44fc7e7a0d5fb52c1b954d98526f65a3ea/sqlalchemy-2.0.40-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35904d63412db21088739510216e9349e335f142ce4a04b69e2528020ee19ed4", size = 3222720, upload-time = "2025-03-27T18:44:29.871Z" }, + { url = "https://files.pythonhosted.org/packages/12/cf/b891a8c1d0c27ce9163361664c2128c7a57de3f35000ea5202eb3a2917b7/sqlalchemy-2.0.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c7a80ed86d6aaacb8160a1caef6680d4ddd03c944d985aecee940d168c411d1", size = 3222682, upload-time = "2025-03-27T18:55:20.097Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/3f/7709d8c8266953d945435a96b7f425ae4172a336963756b58e996fbef7f3/sqlalchemy-2.0.40-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:519624685a51525ddaa7d8ba8265a1540442a2ec71476f0e75241eb8263d6f51", size = 3159542, upload-time = "2025-03-27T18:44:31.333Z" }, + { url = "https://files.pythonhosted.org/packages/85/7e/717eaabaf0f80a0132dc2032ea8f745b7a0914451c984821a7c8737fb75a/sqlalchemy-2.0.40-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2ee5f9999a5b0e9689bed96e60ee53c3384f1a05c2dd8068cc2e8361b0df5b7a", size = 3179864, upload-time = "2025-03-27T18:55:21.784Z" }, + { url = "https://files.pythonhosted.org/packages/e4/cc/03eb5dfcdb575cbecd2bd82487b9848f250a4b6ecfb4707e834b4ce4ec07/sqlalchemy-2.0.40-cp311-cp311-win32.whl", hash = "sha256:c0cae71e20e3c02c52f6b9e9722bca70e4a90a466d59477822739dc31ac18b4b", size = 2084675, upload-time = "2025-03-27T18:48:55.915Z" }, + { url = "https://files.pythonhosted.org/packages/9a/48/440946bf9dc4dc231f4f31ef0d316f7135bf41d4b86aaba0c0655150d370/sqlalchemy-2.0.40-cp311-cp311-win_amd64.whl", hash = "sha256:574aea2c54d8f1dd1699449f332c7d9b71c339e04ae50163a3eb5ce4c4325ee4", size = 2110099, upload-time = "2025-03-27T18:48:57.45Z" }, + { url = "https://files.pythonhosted.org/packages/92/06/552c1f92e880b57d8b92ce6619bd569b25cead492389b1d84904b55989d8/sqlalchemy-2.0.40-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9d3b31d0a1c44b74d3ae27a3de422dfccd2b8f0b75e51ecb2faa2bf65ab1ba0d", size = 2112620, upload-time = "2025-03-27T18:40:00.071Z" }, + { url = "https://files.pythonhosted.org/packages/01/72/a5bc6e76c34cebc071f758161dbe1453de8815ae6e662393910d3be6d70d/sqlalchemy-2.0.40-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:37f7a0f506cf78c80450ed1e816978643d3969f99c4ac6b01104a6fe95c5490a", size = 2103004, upload-time = "2025-03-27T18:40:04.204Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/fd/0e96c8e6767618ed1a06e4d7a167fe13734c2f8113c4cb704443e6783038/sqlalchemy-2.0.40-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bb933a650323e476a2e4fbef8997a10d0003d4da996aad3fd7873e962fdde4d", size = 3252440, upload-time = "2025-03-27T18:51:25.624Z" }, + { url = "https://files.pythonhosted.org/packages/cd/6a/eb82e45b15a64266a2917a6833b51a334ea3c1991728fd905bfccbf5cf63/sqlalchemy-2.0.40-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6959738971b4745eea16f818a2cd086fb35081383b078272c35ece2b07012716", size = 3263277, upload-time = "2025-03-27T18:50:28.142Z" }, + { url = "https://files.pythonhosted.org/packages/45/97/ebe41ab4530f50af99e3995ebd4e0204bf1b0dc0930f32250dde19c389fe/sqlalchemy-2.0.40-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:110179728e442dae85dd39591beb74072ae4ad55a44eda2acc6ec98ead80d5f2", size = 3198591, upload-time = "2025-03-27T18:51:27.543Z" }, + { url = "https://files.pythonhosted.org/packages/e6/1c/a569c1b2b2f5ac20ba6846a1321a2bf52e9a4061001f282bf1c5528dcd69/sqlalchemy-2.0.40-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8040680eaacdce4d635f12c55c714f3d4c7f57da2bc47a01229d115bd319191", size = 3225199, upload-time = "2025-03-27T18:50:30.069Z" }, + { url = "https://files.pythonhosted.org/packages/8f/91/87cc71a6b10065ca0209d19a4bb575378abda6085e72fa0b61ffb2201b84/sqlalchemy-2.0.40-cp312-cp312-win32.whl", hash = "sha256:650490653b110905c10adac69408380688cefc1f536a137d0d69aca1069dc1d1", size = 2082959, upload-time = "2025-03-27T18:45:57.574Z" }, + { url = "https://files.pythonhosted.org/packages/2a/9f/14c511cda174aa1ad9b0e42b64ff5a71db35d08b0d80dc044dae958921e5/sqlalchemy-2.0.40-cp312-cp312-win_amd64.whl", hash = "sha256:2be94d75ee06548d2fc591a3513422b873490efb124048f50556369a834853b0", size = 2108526, upload-time = "2025-03-27T18:45:58.965Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/18/4e3a86cc0232377bc48c373a9ba6a1b3fb79ba32dbb4eda0b357f5a2c59d/sqlalchemy-2.0.40-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:915866fd50dd868fdcc18d61d8258db1bf9ed7fbd6dfec960ba43365952f3b01", size = 2107887, upload-time = "2025-03-27T18:40:05.461Z" }, + { url = "https://files.pythonhosted.org/packages/cb/60/9fa692b1d2ffc4cbd5f47753731fd332afed30137115d862d6e9a1e962c7/sqlalchemy-2.0.40-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a4c5a2905a9ccdc67a8963e24abd2f7afcd4348829412483695c59e0af9a705", size = 2098367, upload-time = "2025-03-27T18:40:07.182Z" }, + { url = "https://files.pythonhosted.org/packages/4c/9f/84b78357ca641714a439eb3fbbddb17297dacfa05d951dbf24f28d7b5c08/sqlalchemy-2.0.40-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55028d7a3ebdf7ace492fab9895cbc5270153f75442a0472d8516e03159ab364", size = 3184806, upload-time = "2025-03-27T18:51:29.356Z" }, + { url = "https://files.pythonhosted.org/packages/4b/7d/e06164161b6bfce04c01bfa01518a20cccbd4100d5c951e5a7422189191a/sqlalchemy-2.0.40-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6cfedff6878b0e0d1d0a50666a817ecd85051d12d56b43d9d425455e608b5ba0", size = 3198131, upload-time = "2025-03-27T18:50:31.616Z" }, + { url = "https://files.pythonhosted.org/packages/6d/51/354af20da42d7ec7b5c9de99edafbb7663a1d75686d1999ceb2c15811302/sqlalchemy-2.0.40-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bb19e30fdae77d357ce92192a3504579abe48a66877f476880238a962e5b96db", size = 3131364, upload-time = "2025-03-27T18:51:31.336Z" }, + { url = "https://files.pythonhosted.org/packages/7a/2f/48a41ff4e6e10549d83fcc551ab85c268bde7c03cf77afb36303c6594d11/sqlalchemy-2.0.40-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:16d325ea898f74b26ffcd1cf8c593b0beed8714f0317df2bed0d8d1de05a8f26", size = 3159482, upload-time = "2025-03-27T18:50:33.201Z" }, + { url = 
"https://files.pythonhosted.org/packages/33/ac/e5e0a807163652a35be878c0ad5cfd8b1d29605edcadfb5df3c512cdf9f3/sqlalchemy-2.0.40-cp313-cp313-win32.whl", hash = "sha256:a669cbe5be3c63f75bcbee0b266779706f1a54bcb1000f302685b87d1b8c1500", size = 2080704, upload-time = "2025-03-27T18:46:00.193Z" }, + { url = "https://files.pythonhosted.org/packages/1c/cb/f38c61f7f2fd4d10494c1c135ff6a6ddb63508d0b47bccccd93670637309/sqlalchemy-2.0.40-cp313-cp313-win_amd64.whl", hash = "sha256:641ee2e0834812d657862f3a7de95e0048bdcb6c55496f39c6fa3d435f6ac6ad", size = 2104564, upload-time = "2025-03-27T18:46:01.442Z" }, + { url = "https://files.pythonhosted.org/packages/d1/7c/5fc8e802e7506fe8b55a03a2e1dab156eae205c91bee46305755e086d2e2/sqlalchemy-2.0.40-py3-none-any.whl", hash = "sha256:32587e2e1e359276957e6fe5dad089758bc042a971a8a09ae8ecf7a8fe23d07a", size = 1903894, upload-time = "2025-03-27T18:40:43.796Z" }, +] + +[[package]] +name = "starlette" +version = "0.46.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ce/20/08dfcd9c983f6a6f4a1000d934b9e6d626cff8d2eeb77a89a68eef20a2b7/starlette-0.46.2.tar.gz", hash = "sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5", size = 2580846, upload-time = "2025-04-13T13:56:17.942Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/0c/9d30a4ebeb6db2b25a841afbb80f6ef9a854fc3b41be131d249a977b4959/starlette-0.46.2-py3-none-any.whl", hash = "sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35", size = 72037, upload-time = "2025-04-13T13:56:16.21Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = 
"sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967, upload-time = "2025-04-10T14:19:05.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload-time = "2025-04-10T14:19:03.967Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/5c/e6082df02e215b846b4b8c0b887a64d7d08ffaba30605502639d44c06b82/typing_inspection-0.4.0.tar.gz", hash = "sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122", size = 76222, upload-time = "2025-02-25T17:27:59.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/08/aa4fdfb71f7de5176385bd9e90852eaf6b5d622735020ad600f2bab54385/typing_inspection-0.4.0-py3-none-any.whl", hash = "sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f", size = 14125, upload-time = "2025-02-25T17:27:57.754Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, +] + +[[package]] +name = "urllib3" +version = "2.4.0" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/78/16493d9c386d8e60e442a35feac5e00f0913c0f4b7c217c11e8ec2ff53e0/urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466", size = 390672, upload-time = "2025-04-10T15:23:39.232Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680, upload-time = "2025-04-10T15:23:37.377Z" }, +] + +[[package]] +name = "uvicorn" +version = "0.34.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/ae/9bbb19b9e1c450cf9ecaef06463e40234d98d95bf572fab11b4f19ae5ded/uvicorn-0.34.2.tar.gz", hash = "sha256:0e929828f6186353a80b58ea719861d2629d766293b6d19baf086ba31d4f3328", size = 76815, upload-time = "2025-04-19T06:02:50.101Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/4b/4cef6ce21a2aaca9d852a6e84ef4f135d99fcd74fa75105e2fc0c8308acd/uvicorn-0.34.2-py3-none-any.whl", hash = "sha256:deb49af569084536d269fe0a6d67e3754f104cf03aba7c11c40f01aadf33c403", size = 62483, upload-time = "2025-04-19T06:02:48.42Z" }, +] + +[package.optional-dependencies] +standard = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "httptools" }, + { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" }, + { name = "watchfiles" }, + { name = "websockets" }, +] + +[[package]] +name = "uvloop" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/af/c0/854216d09d33c543f12a44b393c402e89a920b1a0a7dc634c42de91b9cf6/uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3", size = 2492741, upload-time = "2024-10-14T23:38:35.489Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/76/44a55515e8c9505aa1420aebacf4dd82552e5e15691654894e90d0bd051a/uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f", size = 1442019, upload-time = "2024-10-14T23:37:20.068Z" }, + { url = "https://files.pythonhosted.org/packages/35/5a/62d5800358a78cc25c8a6c72ef8b10851bdb8cca22e14d9c74167b7f86da/uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d", size = 801898, upload-time = "2024-10-14T23:37:22.663Z" }, + { url = "https://files.pythonhosted.org/packages/f3/96/63695e0ebd7da6c741ccd4489b5947394435e198a1382349c17b1146bb97/uvloop-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26", size = 3827735, upload-time = "2024-10-14T23:37:25.129Z" }, + { url = "https://files.pythonhosted.org/packages/61/e0/f0f8ec84979068ffae132c58c79af1de9cceeb664076beea86d941af1a30/uvloop-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb", size = 3825126, upload-time = "2024-10-14T23:37:27.59Z" }, + { url = "https://files.pythonhosted.org/packages/bf/fe/5e94a977d058a54a19df95f12f7161ab6e323ad49f4dabc28822eb2df7ea/uvloop-0.21.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f", size = 3705789, upload-time = "2024-10-14T23:37:29.385Z" }, + { url = 
"https://files.pythonhosted.org/packages/26/dd/c7179618e46092a77e036650c1f056041a028a35c4d76945089fcfc38af8/uvloop-0.21.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c", size = 3800523, upload-time = "2024-10-14T23:37:32.048Z" }, + { url = "https://files.pythonhosted.org/packages/57/a7/4cf0334105c1160dd6819f3297f8700fda7fc30ab4f61fbf3e725acbc7cc/uvloop-0.21.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8", size = 1447410, upload-time = "2024-10-14T23:37:33.612Z" }, + { url = "https://files.pythonhosted.org/packages/8c/7c/1517b0bbc2dbe784b563d6ab54f2ef88c890fdad77232c98ed490aa07132/uvloop-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0", size = 805476, upload-time = "2024-10-14T23:37:36.11Z" }, + { url = "https://files.pythonhosted.org/packages/ee/ea/0bfae1aceb82a503f358d8d2fa126ca9dbdb2ba9c7866974faec1cb5875c/uvloop-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e", size = 3960855, upload-time = "2024-10-14T23:37:37.683Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ca/0864176a649838b838f36d44bf31c451597ab363b60dc9e09c9630619d41/uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb", size = 3973185, upload-time = "2024-10-14T23:37:40.226Z" }, + { url = "https://files.pythonhosted.org/packages/30/bf/08ad29979a936d63787ba47a540de2132169f140d54aa25bc8c3df3e67f4/uvloop-0.21.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6", size = 3820256, upload-time = "2024-10-14T23:37:42.839Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/e2/5cf6ef37e3daf2f06e651aae5ea108ad30df3cb269102678b61ebf1fdf42/uvloop-0.21.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d", size = 3937323, upload-time = "2024-10-14T23:37:45.337Z" }, + { url = "https://files.pythonhosted.org/packages/8c/4c/03f93178830dc7ce8b4cdee1d36770d2f5ebb6f3d37d354e061eefc73545/uvloop-0.21.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c", size = 1471284, upload-time = "2024-10-14T23:37:47.833Z" }, + { url = "https://files.pythonhosted.org/packages/43/3e/92c03f4d05e50f09251bd8b2b2b584a2a7f8fe600008bcc4523337abe676/uvloop-0.21.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2", size = 821349, upload-time = "2024-10-14T23:37:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ef/a02ec5da49909dbbfb1fd205a9a1ac4e88ea92dcae885e7c961847cd51e2/uvloop-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d", size = 4580089, upload-time = "2024-10-14T23:37:51.703Z" }, + { url = "https://files.pythonhosted.org/packages/06/a7/b4e6a19925c900be9f98bec0a75e6e8f79bb53bdeb891916609ab3958967/uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc", size = 4693770, upload-time = "2024-10-14T23:37:54.122Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0c/f07435a18a4b94ce6bd0677d8319cd3de61f3a9eeb1e5f8ab4e8b5edfcb3/uvloop-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb", size = 4451321, upload-time = "2024-10-14T23:37:55.766Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/eb/f7032be105877bcf924709c97b1bf3b90255b4ec251f9340cef912559f28/uvloop-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f", size = 4659022, upload-time = "2024-10-14T23:37:58.195Z" }, + { url = "https://files.pythonhosted.org/packages/3f/8d/2cbef610ca21539f0f36e2b34da49302029e7c9f09acef0b1c3b5839412b/uvloop-0.21.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281", size = 1468123, upload-time = "2024-10-14T23:38:00.688Z" }, + { url = "https://files.pythonhosted.org/packages/93/0d/b0038d5a469f94ed8f2b2fce2434a18396d8fbfb5da85a0a9781ebbdec14/uvloop-0.21.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af", size = 819325, upload-time = "2024-10-14T23:38:02.309Z" }, + { url = "https://files.pythonhosted.org/packages/50/94/0a687f39e78c4c1e02e3272c6b2ccdb4e0085fda3b8352fecd0410ccf915/uvloop-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6", size = 4582806, upload-time = "2024-10-14T23:38:04.711Z" }, + { url = "https://files.pythonhosted.org/packages/d2/19/f5b78616566ea68edd42aacaf645adbf71fbd83fc52281fba555dc27e3f1/uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816", size = 4701068, upload-time = "2024-10-14T23:38:06.385Z" }, + { url = "https://files.pythonhosted.org/packages/47/57/66f061ee118f413cd22a656de622925097170b9380b30091b78ea0c6ea75/uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc", size = 4454428, upload-time = "2024-10-14T23:38:08.416Z" }, + { url = 
"https://files.pythonhosted.org/packages/63/9a/0962b05b308494e3202d3f794a6e85abe471fe3cafdbcf95c2e8c713aabd/uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553", size = 4660018, upload-time = "2024-10-14T23:38:10.888Z" }, +] + +[[package]] +name = "vine" +version = "5.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/e4/d07b5f29d283596b9727dd5275ccbceb63c44a1a82aa9e4bfd20426762ac/vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0", size = 48980, upload-time = "2023-11-05T08:46:53.857Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/ff/7c0c86c43b3cbb927e0ccc0255cb4057ceba4799cd44ae95174ce8e8b5b2/vine-5.1.0-py3-none-any.whl", hash = "sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc", size = 9636, upload-time = "2023-11-05T08:46:51.205Z" }, +] + +[[package]] +name = "watchfiles" +version = "1.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/e2/8ed598c42057de7aa5d97c472254af4906ff0a59a66699d426fc9ef795d7/watchfiles-1.0.5.tar.gz", hash = "sha256:b7529b5dcc114679d43827d8c35a07c493ad6f083633d573d81c660abc5979e9", size = 94537, upload-time = "2025-04-08T10:36:26.722Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/4d/d02e6ea147bb7fff5fd109c694a95109612f419abed46548a930e7f7afa3/watchfiles-1.0.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:5c40fe7dd9e5f81e0847b1ea64e1f5dd79dd61afbedb57759df06767ac719b40", size = 405632, upload-time = "2025-04-08T10:34:41.832Z" }, + { url = "https://files.pythonhosted.org/packages/60/31/9ee50e29129d53a9a92ccf1d3992751dc56fc3c8f6ee721be1c7b9c81763/watchfiles-1.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:8c0db396e6003d99bb2d7232c957b5f0b5634bbd1b24e381a5afcc880f7373fb", size = 395734, upload-time = "2025-04-08T10:34:44.236Z" }, + { url = "https://files.pythonhosted.org/packages/ad/8c/759176c97195306f028024f878e7f1c776bda66ccc5c68fa51e699cf8f1d/watchfiles-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b551d4fb482fc57d852b4541f911ba28957d051c8776e79c3b4a51eb5e2a1b11", size = 455008, upload-time = "2025-04-08T10:34:45.617Z" }, + { url = "https://files.pythonhosted.org/packages/55/1a/5e977250c795ee79a0229e3b7f5e3a1b664e4e450756a22da84d2f4979fe/watchfiles-1.0.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:830aa432ba5c491d52a15b51526c29e4a4b92bf4f92253787f9726fe01519487", size = 459029, upload-time = "2025-04-08T10:34:46.814Z" }, + { url = "https://files.pythonhosted.org/packages/e6/17/884cf039333605c1d6e296cf5be35fad0836953c3dfd2adb71b72f9dbcd0/watchfiles-1.0.5-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a16512051a822a416b0d477d5f8c0e67b67c1a20d9acecb0aafa3aa4d6e7d256", size = 488916, upload-time = "2025-04-08T10:34:48.571Z" }, + { url = "https://files.pythonhosted.org/packages/ef/e0/bcb6e64b45837056c0a40f3a2db3ef51c2ced19fda38484fa7508e00632c/watchfiles-1.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe0cbc787770e52a96c6fda6726ace75be7f840cb327e1b08d7d54eadc3bc85", size = 523763, upload-time = "2025-04-08T10:34:50.268Z" }, + { url = "https://files.pythonhosted.org/packages/24/e9/f67e9199f3bb35c1837447ecf07e9830ec00ff5d35a61e08c2cd67217949/watchfiles-1.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d363152c5e16b29d66cbde8fa614f9e313e6f94a8204eaab268db52231fe5358", size = 502891, upload-time = "2025-04-08T10:34:51.419Z" }, + { url = 
"https://files.pythonhosted.org/packages/23/ed/a6cf815f215632f5c8065e9c41fe872025ffea35aa1f80499f86eae922db/watchfiles-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee32c9a9bee4d0b7bd7cbeb53cb185cf0b622ac761efaa2eba84006c3b3a614", size = 454921, upload-time = "2025-04-08T10:34:52.67Z" }, + { url = "https://files.pythonhosted.org/packages/92/4c/e14978599b80cde8486ab5a77a821e8a982ae8e2fcb22af7b0886a033ec8/watchfiles-1.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29c7fd632ccaf5517c16a5188e36f6612d6472ccf55382db6c7fe3fcccb7f59f", size = 631422, upload-time = "2025-04-08T10:34:53.985Z" }, + { url = "https://files.pythonhosted.org/packages/b2/1a/9263e34c3458f7614b657f974f4ee61fd72f58adce8b436e16450e054efd/watchfiles-1.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e637810586e6fe380c8bc1b3910accd7f1d3a9a7262c8a78d4c8fb3ba6a2b3d", size = 625675, upload-time = "2025-04-08T10:34:55.173Z" }, + { url = "https://files.pythonhosted.org/packages/96/1f/1803a18bd6ab04a0766386a19bcfe64641381a04939efdaa95f0e3b0eb58/watchfiles-1.0.5-cp310-cp310-win32.whl", hash = "sha256:cd47d063fbeabd4c6cae1d4bcaa38f0902f8dc5ed168072874ea11d0c7afc1ff", size = 277921, upload-time = "2025-04-08T10:34:56.318Z" }, + { url = "https://files.pythonhosted.org/packages/c2/3b/29a89de074a7d6e8b4dc67c26e03d73313e4ecf0d6e97e942a65fa7c195e/watchfiles-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:86c0df05b47a79d80351cd179893f2f9c1b1cae49d96e8b3290c7f4bd0ca0a92", size = 291526, upload-time = "2025-04-08T10:34:57.95Z" }, + { url = "https://files.pythonhosted.org/packages/39/f4/41b591f59021786ef517e1cdc3b510383551846703e03f204827854a96f8/watchfiles-1.0.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:237f9be419e977a0f8f6b2e7b0475ababe78ff1ab06822df95d914a945eac827", size = 405336, upload-time = "2025-04-08T10:34:59.359Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/06/93789c135be4d6d0e4f63e96eea56dc54050b243eacc28439a26482b5235/watchfiles-1.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0da39ff917af8b27a4bdc5a97ac577552a38aac0d260a859c1517ea3dc1a7c4", size = 395977, upload-time = "2025-04-08T10:35:00.522Z" }, + { url = "https://files.pythonhosted.org/packages/d2/db/1cd89bd83728ca37054512d4d35ab69b5f12b8aa2ac9be3b0276b3bf06cc/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cfcb3952350e95603f232a7a15f6c5f86c5375e46f0bd4ae70d43e3e063c13d", size = 455232, upload-time = "2025-04-08T10:35:01.698Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/d8a4d44ffe960517e487c9c04f77b06b8abf05eb680bed71c82b5f2cad62/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:68b2dddba7a4e6151384e252a5632efcaa9bc5d1c4b567f3cb621306b2ca9f63", size = 459151, upload-time = "2025-04-08T10:35:03.358Z" }, + { url = "https://files.pythonhosted.org/packages/6c/da/267a1546f26465dead1719caaba3ce660657f83c9d9c052ba98fb8856e13/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:95cf944fcfc394c5f9de794ce581914900f82ff1f855326f25ebcf24d5397418", size = 489054, upload-time = "2025-04-08T10:35:04.561Z" }, + { url = "https://files.pythonhosted.org/packages/b1/31/33850dfd5c6efb6f27d2465cc4c6b27c5a6f5ed53c6fa63b7263cf5f60f6/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ecf6cd9f83d7c023b1aba15d13f705ca7b7d38675c121f3cc4a6e25bd0857ee9", size = 523955, upload-time = "2025-04-08T10:35:05.786Z" }, + { url = "https://files.pythonhosted.org/packages/09/84/b7d7b67856efb183a421f1416b44ca975cb2ea6c4544827955dfb01f7dc2/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:852de68acd6212cd6d33edf21e6f9e56e5d98c6add46f48244bd479d97c967c6", size = 502234, upload-time = "2025-04-08T10:35:07.187Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/87/6dc5ec6882a2254cfdd8b0718b684504e737273903b65d7338efaba08b52/watchfiles-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5730f3aa35e646103b53389d5bc77edfbf578ab6dab2e005142b5b80a35ef25", size = 454750, upload-time = "2025-04-08T10:35:08.859Z" }, + { url = "https://files.pythonhosted.org/packages/3d/6c/3786c50213451a0ad15170d091570d4a6554976cf0df19878002fc96075a/watchfiles-1.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:18b3bd29954bc4abeeb4e9d9cf0b30227f0f206c86657674f544cb032296acd5", size = 631591, upload-time = "2025-04-08T10:35:10.64Z" }, + { url = "https://files.pythonhosted.org/packages/1b/b3/1427425ade4e359a0deacce01a47a26024b2ccdb53098f9d64d497f6684c/watchfiles-1.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ba5552a1b07c8edbf197055bc9d518b8f0d98a1c6a73a293bc0726dce068ed01", size = 625370, upload-time = "2025-04-08T10:35:12.412Z" }, + { url = "https://files.pythonhosted.org/packages/15/ba/f60e053b0b5b8145d682672024aa91370a29c5c921a88977eb565de34086/watchfiles-1.0.5-cp311-cp311-win32.whl", hash = "sha256:2f1fefb2e90e89959447bc0420fddd1e76f625784340d64a2f7d5983ef9ad246", size = 277791, upload-time = "2025-04-08T10:35:13.719Z" }, + { url = "https://files.pythonhosted.org/packages/50/ed/7603c4e164225c12c0d4e8700b64bb00e01a6c4eeea372292a3856be33a4/watchfiles-1.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:b6e76ceb1dd18c8e29c73f47d41866972e891fc4cc7ba014f487def72c1cf096", size = 291622, upload-time = "2025-04-08T10:35:15.071Z" }, + { url = "https://files.pythonhosted.org/packages/a2/c2/99bb7c96b4450e36877fde33690ded286ff555b5a5c1d925855d556968a1/watchfiles-1.0.5-cp311-cp311-win_arm64.whl", hash = "sha256:266710eb6fddc1f5e51843c70e3bebfb0f5e77cf4f27129278c70554104d19ed", size = 283699, upload-time = "2025-04-08T10:35:16.732Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/8c/4f0b9bdb75a1bfbd9c78fad7d8854369283f74fe7cf03eb16be77054536d/watchfiles-1.0.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b5eb568c2aa6018e26da9e6c86f3ec3fd958cee7f0311b35c2630fa4217d17f2", size = 401511, upload-time = "2025-04-08T10:35:17.956Z" }, + { url = "https://files.pythonhosted.org/packages/dc/4e/7e15825def77f8bd359b6d3f379f0c9dac4eb09dd4ddd58fd7d14127179c/watchfiles-1.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0a04059f4923ce4e856b4b4e5e783a70f49d9663d22a4c3b3298165996d1377f", size = 392715, upload-time = "2025-04-08T10:35:19.202Z" }, + { url = "https://files.pythonhosted.org/packages/58/65/b72fb817518728e08de5840d5d38571466c1b4a3f724d190cec909ee6f3f/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e380c89983ce6e6fe2dd1e1921b9952fb4e6da882931abd1824c092ed495dec", size = 454138, upload-time = "2025-04-08T10:35:20.586Z" }, + { url = "https://files.pythonhosted.org/packages/3e/a4/86833fd2ea2e50ae28989f5950b5c3f91022d67092bfec08f8300d8b347b/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fe43139b2c0fdc4a14d4f8d5b5d967f7a2777fd3d38ecf5b1ec669b0d7e43c21", size = 458592, upload-time = "2025-04-08T10:35:21.87Z" }, + { url = "https://files.pythonhosted.org/packages/38/7e/42cb8df8be9a37e50dd3a818816501cf7a20d635d76d6bd65aae3dbbff68/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee0822ce1b8a14fe5a066f93edd20aada932acfe348bede8aa2149f1a4489512", size = 487532, upload-time = "2025-04-08T10:35:23.143Z" }, + { url = "https://files.pythonhosted.org/packages/fc/fd/13d26721c85d7f3df6169d8b495fcac8ab0dc8f0945ebea8845de4681dab/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a0dbcb1c2d8f2ab6e0a81c6699b236932bd264d4cef1ac475858d16c403de74d", size = 522865, upload-time = "2025-04-08T10:35:24.702Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/0d/7f9ae243c04e96c5455d111e21b09087d0eeaf9a1369e13a01c7d3d82478/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2014a2b18ad3ca53b1f6c23f8cd94a18ce930c1837bd891262c182640eb40a6", size = 499887, upload-time = "2025-04-08T10:35:25.969Z" }, + { url = "https://files.pythonhosted.org/packages/8e/0f/a257766998e26aca4b3acf2ae97dff04b57071e991a510857d3799247c67/watchfiles-1.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f6ae86d5cb647bf58f9f655fcf577f713915a5d69057a0371bc257e2553234", size = 454498, upload-time = "2025-04-08T10:35:27.353Z" }, + { url = "https://files.pythonhosted.org/packages/81/79/8bf142575a03e0af9c3d5f8bcae911ee6683ae93a625d349d4ecf4c8f7df/watchfiles-1.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1a7bac2bde1d661fb31f4d4e8e539e178774b76db3c2c17c4bb3e960a5de07a2", size = 630663, upload-time = "2025-04-08T10:35:28.685Z" }, + { url = "https://files.pythonhosted.org/packages/f1/80/abe2e79f610e45c63a70d271caea90c49bbf93eb00fa947fa9b803a1d51f/watchfiles-1.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ab626da2fc1ac277bbf752446470b367f84b50295264d2d313e28dc4405d663", size = 625410, upload-time = "2025-04-08T10:35:30.42Z" }, + { url = "https://files.pythonhosted.org/packages/91/6f/bc7fbecb84a41a9069c2c6eb6319f7f7df113adf113e358c57fc1aff7ff5/watchfiles-1.0.5-cp312-cp312-win32.whl", hash = "sha256:9f4571a783914feda92018ef3901dab8caf5b029325b5fe4558c074582815249", size = 277965, upload-time = "2025-04-08T10:35:32.023Z" }, + { url = "https://files.pythonhosted.org/packages/99/a5/bf1c297ea6649ec59e935ab311f63d8af5faa8f0b86993e3282b984263e3/watchfiles-1.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:360a398c3a19672cf93527f7e8d8b60d8275119c5d900f2e184d32483117a705", size = 291693, upload-time = "2025-04-08T10:35:33.225Z" }, + { url = 
"https://files.pythonhosted.org/packages/7f/7b/fd01087cc21db5c47e5beae507b87965db341cce8a86f9eb12bf5219d4e0/watchfiles-1.0.5-cp312-cp312-win_arm64.whl", hash = "sha256:1a2902ede862969077b97523987c38db28abbe09fb19866e711485d9fbf0d417", size = 283287, upload-time = "2025-04-08T10:35:34.568Z" }, + { url = "https://files.pythonhosted.org/packages/c7/62/435766874b704f39b2fecd8395a29042db2b5ec4005bd34523415e9bd2e0/watchfiles-1.0.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0b289572c33a0deae62daa57e44a25b99b783e5f7aed81b314232b3d3c81a11d", size = 401531, upload-time = "2025-04-08T10:35:35.792Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a6/e52a02c05411b9cb02823e6797ef9bbba0bfaf1bb627da1634d44d8af833/watchfiles-1.0.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a056c2f692d65bf1e99c41045e3bdcaea3cb9e6b5a53dcaf60a5f3bd95fc9763", size = 392417, upload-time = "2025-04-08T10:35:37.048Z" }, + { url = "https://files.pythonhosted.org/packages/3f/53/c4af6819770455932144e0109d4854437769672d7ad897e76e8e1673435d/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9dca99744991fc9850d18015c4f0438865414e50069670f5f7eee08340d8b40", size = 453423, upload-time = "2025-04-08T10:35:38.357Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d1/8e88df58bbbf819b8bc5cfbacd3c79e01b40261cad0fc84d1e1ebd778a07/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:894342d61d355446d02cd3988a7326af344143eb33a2fd5d38482a92072d9563", size = 458185, upload-time = "2025-04-08T10:35:39.708Z" }, + { url = "https://files.pythonhosted.org/packages/ff/70/fffaa11962dd5429e47e478a18736d4e42bec42404f5ee3b92ef1b87ad60/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab44e1580924d1ffd7b3938e02716d5ad190441965138b4aa1d1f31ea0877f04", size = 486696, upload-time = "2025-04-08T10:35:41.469Z" }, + { url = 
"https://files.pythonhosted.org/packages/39/db/723c0328e8b3692d53eb273797d9a08be6ffb1d16f1c0ba2bdbdc2a3852c/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6f9367b132078b2ceb8d066ff6c93a970a18c3029cea37bfd7b2d3dd2e5db8f", size = 522327, upload-time = "2025-04-08T10:35:43.289Z" }, + { url = "https://files.pythonhosted.org/packages/cd/05/9fccc43c50c39a76b68343484b9da7b12d42d0859c37c61aec018c967a32/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2e55a9b162e06e3f862fb61e399fe9f05d908d019d87bf5b496a04ef18a970a", size = 499741, upload-time = "2025-04-08T10:35:44.574Z" }, + { url = "https://files.pythonhosted.org/packages/23/14/499e90c37fa518976782b10a18b18db9f55ea73ca14641615056f8194bb3/watchfiles-1.0.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0125f91f70e0732a9f8ee01e49515c35d38ba48db507a50c5bdcad9503af5827", size = 453995, upload-time = "2025-04-08T10:35:46.336Z" }, + { url = "https://files.pythonhosted.org/packages/61/d9/f75d6840059320df5adecd2c687fbc18960a7f97b55c300d20f207d48aef/watchfiles-1.0.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:13bb21f8ba3248386337c9fa51c528868e6c34a707f729ab041c846d52a0c69a", size = 629693, upload-time = "2025-04-08T10:35:48.161Z" }, + { url = "https://files.pythonhosted.org/packages/fc/17/180ca383f5061b61406477218c55d66ec118e6c0c51f02d8142895fcf0a9/watchfiles-1.0.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:839ebd0df4a18c5b3c1b890145b5a3f5f64063c2a0d02b13c76d78fe5de34936", size = 624677, upload-time = "2025-04-08T10:35:49.65Z" }, + { url = "https://files.pythonhosted.org/packages/bf/15/714d6ef307f803f236d69ee9d421763707899d6298d9f3183e55e366d9af/watchfiles-1.0.5-cp313-cp313-win32.whl", hash = "sha256:4a8ec1e4e16e2d5bafc9ba82f7aaecfeec990ca7cd27e84fb6f191804ed2fcfc", size = 277804, upload-time = "2025-04-08T10:35:51.093Z" }, + { url = 
"https://files.pythonhosted.org/packages/a8/b4/c57b99518fadf431f3ef47a610839e46e5f8abf9814f969859d1c65c02c7/watchfiles-1.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:f436601594f15bf406518af922a89dcaab416568edb6f65c4e5bbbad1ea45c11", size = 291087, upload-time = "2025-04-08T10:35:52.458Z" }, + { url = "https://files.pythonhosted.org/packages/1a/03/81f9fcc3963b3fc415cd4b0b2b39ee8cc136c42fb10a36acf38745e9d283/watchfiles-1.0.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f59b870db1f1ae5a9ac28245707d955c8721dd6565e7f411024fa374b5362d1d", size = 405947, upload-time = "2025-04-08T10:36:13.721Z" }, + { url = "https://files.pythonhosted.org/packages/54/97/8c4213a852feb64807ec1d380f42d4fc8bfaef896bdbd94318f8fd7f3e4e/watchfiles-1.0.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9475b0093767e1475095f2aeb1d219fb9664081d403d1dff81342df8cd707034", size = 397276, upload-time = "2025-04-08T10:36:15.131Z" }, + { url = "https://files.pythonhosted.org/packages/78/12/d4464d19860cb9672efa45eec1b08f8472c478ed67dcd30647c51ada7aef/watchfiles-1.0.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc533aa50664ebd6c628b2f30591956519462f5d27f951ed03d6c82b2dfd9965", size = 455550, upload-time = "2025-04-08T10:36:16.635Z" }, + { url = "https://files.pythonhosted.org/packages/90/fb/b07bcdf1034d8edeaef4c22f3e9e3157d37c5071b5f9492ffdfa4ad4bed7/watchfiles-1.0.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fed1cd825158dcaae36acce7b2db33dcbfd12b30c34317a88b8ed80f0541cc57", size = 455542, upload-time = "2025-04-08T10:36:18.655Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301, upload-time = 
"2024-01-06T02:10:57.829Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166, upload-time = "2024-01-06T02:10:55.763Z" }, +] + +[[package]] +name = "websockets" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/da/6462a9f510c0c49837bbc9345aca92d767a56c1fb2939e1579df1e1cdcf7/websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b", size = 175423, upload-time = "2025-03-05T20:01:35.363Z" }, + { url = "https://files.pythonhosted.org/packages/1c/9f/9d11c1a4eb046a9e106483b9ff69bce7ac880443f00e5ce64261b47b07e7/websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205", size = 173080, upload-time = "2025-03-05T20:01:37.304Z" }, + { url = "https://files.pythonhosted.org/packages/d5/4f/b462242432d93ea45f297b6179c7333dd0402b855a912a04e7fc61c0d71f/websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a", size = 173329, upload-time = "2025-03-05T20:01:39.668Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0c/6afa1f4644d7ed50284ac59cc70ef8abd44ccf7d45850d989ea7310538d0/websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e", size = 182312, 
upload-time = "2025-03-05T20:01:41.815Z" }, + { url = "https://files.pythonhosted.org/packages/dd/d4/ffc8bd1350b229ca7a4db2a3e1c482cf87cea1baccd0ef3e72bc720caeec/websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf", size = 181319, upload-time = "2025-03-05T20:01:43.967Z" }, + { url = "https://files.pythonhosted.org/packages/97/3a/5323a6bb94917af13bbb34009fac01e55c51dfde354f63692bf2533ffbc2/websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb", size = 181631, upload-time = "2025-03-05T20:01:46.104Z" }, + { url = "https://files.pythonhosted.org/packages/a6/cc/1aeb0f7cee59ef065724041bb7ed667b6ab1eeffe5141696cccec2687b66/websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d", size = 182016, upload-time = "2025-03-05T20:01:47.603Z" }, + { url = "https://files.pythonhosted.org/packages/79/f9/c86f8f7af208e4161a7f7e02774e9d0a81c632ae76db2ff22549e1718a51/websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9", size = 181426, upload-time = "2025-03-05T20:01:48.949Z" }, + { url = "https://files.pythonhosted.org/packages/c7/b9/828b0bc6753db905b91df6ae477c0b14a141090df64fb17f8a9d7e3516cf/websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c", size = 181360, upload-time = "2025-03-05T20:01:50.938Z" }, + { url = "https://files.pythonhosted.org/packages/89/fb/250f5533ec468ba6327055b7d98b9df056fb1ce623b8b6aaafb30b55d02e/websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256", size = 176388, 
upload-time = "2025-03-05T20:01:52.213Z" }, + { url = "https://files.pythonhosted.org/packages/1c/46/aca7082012768bb98e5608f01658ff3ac8437e563eca41cf068bd5849a5e/websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41", size = 176830, upload-time = "2025-03-05T20:01:53.922Z" }, + { url = "https://files.pythonhosted.org/packages/9f/32/18fcd5919c293a398db67443acd33fde142f283853076049824fc58e6f75/websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431", size = 175423, upload-time = "2025-03-05T20:01:56.276Z" }, + { url = "https://files.pythonhosted.org/packages/76/70/ba1ad96b07869275ef42e2ce21f07a5b0148936688c2baf7e4a1f60d5058/websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57", size = 173082, upload-time = "2025-03-05T20:01:57.563Z" }, + { url = "https://files.pythonhosted.org/packages/86/f2/10b55821dd40eb696ce4704a87d57774696f9451108cff0d2824c97e0f97/websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905", size = 173330, upload-time = "2025-03-05T20:01:59.063Z" }, + { url = "https://files.pythonhosted.org/packages/a5/90/1c37ae8b8a113d3daf1065222b6af61cc44102da95388ac0018fcb7d93d9/websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562", size = 182878, upload-time = "2025-03-05T20:02:00.305Z" }, + { url = "https://files.pythonhosted.org/packages/8e/8d/96e8e288b2a41dffafb78e8904ea7367ee4f891dafc2ab8d87e2124cb3d3/websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792", size = 181883, upload-time = "2025-03-05T20:02:03.148Z" 
}, + { url = "https://files.pythonhosted.org/packages/93/1f/5d6dbf551766308f6f50f8baf8e9860be6182911e8106da7a7f73785f4c4/websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413", size = 182252, upload-time = "2025-03-05T20:02:05.29Z" }, + { url = "https://files.pythonhosted.org/packages/d4/78/2d4fed9123e6620cbf1706c0de8a1632e1a28e7774d94346d7de1bba2ca3/websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8", size = 182521, upload-time = "2025-03-05T20:02:07.458Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3b/66d4c1b444dd1a9823c4a81f50231b921bab54eee2f69e70319b4e21f1ca/websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3", size = 181958, upload-time = "2025-03-05T20:02:09.842Z" }, + { url = "https://files.pythonhosted.org/packages/08/ff/e9eed2ee5fed6f76fdd6032ca5cd38c57ca9661430bb3d5fb2872dc8703c/websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf", size = 181918, upload-time = "2025-03-05T20:02:11.968Z" }, + { url = "https://files.pythonhosted.org/packages/d8/75/994634a49b7e12532be6a42103597b71098fd25900f7437d6055ed39930a/websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85", size = 176388, upload-time = "2025-03-05T20:02:13.32Z" }, + { url = "https://files.pythonhosted.org/packages/98/93/e36c73f78400a65f5e236cd376713c34182e6663f6889cd45a4a04d8f203/websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065", size = 176828, upload-time = "2025-03-05T20:02:14.585Z" }, + { url = 
"https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" }, + { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" }, + { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" }, + { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" }, + { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" }, + { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = 
"2025-03-05T20:02:25.669Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = "2025-03-05T20:02:26.99Z" }, + { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" }, + { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", size = 182160, upload-time = "2025-03-05T20:02:31.634Z" }, + { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" }, + { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" }, + { url = "https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload-time = "2025-03-05T20:02:36.695Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload-time = "2025-03-05T20:02:37.985Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload-time = "2025-03-05T20:02:39.298Z" }, + { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload-time = "2025-03-05T20:02:40.595Z" }, + { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload-time = "2025-03-05T20:02:41.926Z" }, + { url = "https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload-time = "2025-03-05T20:02:43.304Z" }, + { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload-time = 
"2025-03-05T20:02:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload-time = "2025-03-05T20:02:50.14Z" }, + { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload-time = "2025-03-05T20:02:51.561Z" }, + { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload-time = "2025-03-05T20:02:53.814Z" }, + { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload-time = "2025-03-05T20:02:55.237Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/d40f779fa16f74d3468357197af8d6ad07e7c5a27ea1ca74ceb38986f77a/websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3", size = 173109, upload-time = "2025-03-05T20:03:17.769Z" }, + { url = "https://files.pythonhosted.org/packages/bc/cd/5b887b8585a593073fd92f7c23ecd3985cd2c3175025a91b0d69b0551372/websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1", size = 173343, upload-time = "2025-03-05T20:03:19.094Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/ae/d34f7556890341e900a95acf4886833646306269f899d58ad62f588bf410/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475", size = 174599, upload-time = "2025-03-05T20:03:21.1Z" }, + { url = "https://files.pythonhosted.org/packages/71/e6/5fd43993a87db364ec60fc1d608273a1a465c0caba69176dd160e197ce42/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9", size = 174207, upload-time = "2025-03-05T20:03:23.221Z" }, + { url = "https://files.pythonhosted.org/packages/2b/fb/c492d6daa5ec067c2988ac80c61359ace5c4c674c532985ac5a123436cec/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04", size = 174155, upload-time = "2025-03-05T20:03:25.321Z" }, + { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, + { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, +] From ecaba89d0d830c93cd7b2bcc073988189a0089c6 Mon Sep 17 00:00:00 2001 From: Ali Tavallaie Date: Mon, 12 May 2025 21:49:01 +0330 Subject: [PATCH 2/6] chore: removing unnecessary file --- requirements.txt | 34 ---------------------------------- setup.py | 0 2 files changed, 34 deletions(-) 
delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 27a278d..0000000 --- a/requirements.txt +++ /dev/null @@ -1,34 +0,0 @@ -# --- UPDATED FILE: requirements.txt --- - -# Configuration -python-dotenv - -# Database ORM and Migrations -SQLAlchemy -psycopg2-binary # For PostgreSQL connection (use non-binary in production if needed) -alembic - -# HTTP Requests -requests - -# UUID Generation (Needed for DiscoveryChain ID) - Standard library 'uuid' used. - -# --- ADDED FOR PHASE 2 API --- -# Web Framework -fastapi - -concurrent-log-handler>=0.9.23 # Or latest version - -# ASGI Server -uvicorn[standard] # Includes standard extras like httptools, websockets -# --- END ADDED FOR PHASE 2 API --- - -# Background Task Queue -celery>=5.3.6 # Use a specific recent version -redis>=5.0.4 - -# --- ADDED FOR PHASE 17 Analysis --- -# Graph Analysis -networkx>=3.0 # Use version 3+ -python-louvain # For Louvain community detection -# --- END ADDED FOR PHASE 17 Analysis --- \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index e69de29..0000000 From 04b8679abc2fe79a3e682b56b706ec6f67ff1f52 Mon Sep 17 00:00:00 2001 From: Ali Tavallaie Date: Mon, 12 May 2025 21:56:17 +0330 Subject: [PATCH 3/6] chore: update readme for using uv --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5695c71..f6688d6 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,8 @@ The system uses a FastAPI web framework for its API, PostgreSQL as the database, Before you begin, ensure you have the following installed on your system: 1. **Python:** Version 3.10 or higher is recommended. [Download Python](https://www.python.org/downloads/) - 2. **pip:** Python's package installer (usually comes with Python). + 2. **uv:** Python's package installer, follow [these instructions](https://docs.astral.sh/uv/#installation) to install.
+ use `uv sync` for install dependencies. 3. **Git:** For cloning the repository. [Download Git](https://git-scm.com/downloads) 4. **PostgreSQL:** A running PostgreSQL database server (version 12+ recommended). You'll need the ability to create a database and a user. [Download PostgreSQL](https://www.postgresql.org/download/) 5. **Redis:** A running Redis server. Celery uses this to manage background tasks. [Download Redis](https://redis.io/docs/getting-started/installation/) or use Docker. From 5beb4f564ad525d1f73156e4c7b84e95706aa7d5 Mon Sep 17 00:00:00 2001 From: Ali Tavallaie Date: Mon, 12 May 2025 23:07:32 +0330 Subject: [PATCH 4/6] chore(README): fix and update - add autocounter for numbered list - add uv to other parts too --- README.md | 49 ++++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index f6688d6..ae4deb9 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,6 @@ The system uses a FastAPI web framework for its API, PostgreSQL as the database, 1. **Python:** Version 3.10 or higher is recommended. [Download Python](https://www.python.org/downloads/) 2. **uv:** Python's package installer, follow [these instructions](https://docs.astral.sh/uv/#installation) to install. - use `uv sync` for install dependencies. 3. **Git:** For cloning the repository. [Download Git](https://git-scm.com/downloads) 4. **PostgreSQL:** A running PostgreSQL database server (version 12+ recommended). You'll need the ability to create a database and a user. [Download PostgreSQL](https://www.postgresql.org/download/) 5. **Redis:** A running Redis server. Celery uses this to manage background tasks. [Download Redis](https://redis.io/docs/getting-started/installation/) or use Docker. @@ -67,31 +66,19 @@ Follow these steps carefully to set up the MOSS backend application: cd moss/ ``` -2.
**Create a Virtual Environment:** - It's highly recommended to use a virtual environment to isolate project dependencies. +1. **Install Dependencies:** + Install all the required Python packages: ```bash - python -m venv venv + uv sync ``` - *(This creates a `venv` directory in your project folder.)* + `uv` automatically creates a `.venv` directory in the project root and installs all the dependencies. -3. **Activate the Virtual Environment:** - * **On macOS/Linux:** - ```bash - source venv/bin/activate - ``` - * **On Windows:** - ```bash - .\venv\Scripts\activate - ``` - *(Your terminal prompt should change to indicate the active environment, e.g., `(venv)`).* - -4. **Install Dependencies:** - Install all the required Python packages listed in `requirements.txt`: + For **contributing**, run it with `--dev` to install the development dependencies: ```bash - pip install -r requirements.txt + uv sync --dev ``` -5. **Configure Environment Variables (`.env` file):** +1. **Configure Environment Variables (`.env` file):** * Copy the example environment file: ```bash cp .env.example .env @@ -114,7 +101,7 @@ Follow these steps carefully to set up the MOSS backend application: * `CELERY_RESULT_BACKEND_URL`: URL for your Redis server (used by Celery to store task results). * Default: `redis://localhost:6379/1` (using database 1, different from the broker). Adjust if needed. -6. **Set Up PostgreSQL Database:** +1. **Set Up PostgreSQL Database:** * Connect to your PostgreSQL server (e.g., using `psql` or a GUI tool). * Create the database (if it doesn't exist). **Use the name you specified in `.env`**. ```sql @@ -129,7 +116,7 @@ Follow these steps carefully to set up the MOSS backend application: ``` * *(**Note:** These are example commands. Adjust them based on your PostgreSQL setup and security practices.)* -7. **Run Database Migrations:** +1. **Run Database Migrations:** This step creates all the necessary tables in your database based on the application's models.
We use Alembic, managed via a script. ```bash python scripts/setup_db.py @@ -143,7 +130,7 @@ The application consists of two main parts that need to run concurrently: the ** 1. **Start the API Server (FastAPI with Uvicorn):** This makes the REST API available. ```bash - uvicorn backend.main:app --reload --host 0.0.0.0 --port 8000 + uv run uvicorn backend.main:app --reload --host 0.0.0.0 --port 8000 ``` * `backend.main:app`: Tells Uvicorn where to find the FastAPI app instance. * `--reload`: Automatically restarts the server when code changes (useful for development). Remove this flag in production. @@ -152,7 +139,7 @@ The application consists of two main parts that need to run concurrently: the ** * You should see output indicating the server is running, often including `Application startup complete.` * You can access the API documentation at `http://localhost:8000/docs` in your browser. -2. **Start the Celery Workers:** +1. **Start the Celery Workers:** These processes handle background tasks like keyword discovery and DOI processing. **Make sure Redis is running before starting the workers.** ```bash celery -A backend.celery_app worker -l info -P eventlet -c 4 @@ -174,14 +161,14 @@ The frontend application is typically developed and run separately from the back cd frontend/ ``` -2. **Install Frontend Dependencies:** +1. **Install Frontend Dependencies:** Install the necessary Node.js packages defined in `package.json`: ```bash npm install ``` *(This command downloads all the libraries the frontend needs. It might take a few minutes the first time.)* -3. **Configure Frontend Environment (Optional):** +1. **Configure Frontend Environment (Optional):** * The frontend might require its own environment variables (e.g., the URL of the backend API). Look for a file named `.env.development.local` or similar example files in the `frontend/` directory. 
* If an example file exists (like `.env.development.local.example`), copy it: ```bash @@ -189,14 +176,14 @@ The frontend application is typically developed and run separately from the back ``` * Edit the `.env.development.local` file and adjust any necessary settings, such as `VITE_API_BASE_URL` if the backend isn't running on `http://localhost:8000`. By default, it should usually point to where the backend API server is running. -4. **Start the Frontend Development Server:** +1. **Start the Frontend Development Server:** Run the development server script: ```bash npm run dev ``` *(This command typically starts a local web server for the frontend with features like automatic reloading when you change frontend code.)* -5. **Access the Frontend:** +1. **Access the Frontend:** * Once the server starts, it will usually print a URL in the terminal. Open this URL in your web browser. * Common URLs are `http://localhost:5173` (Vite default) or `http://localhost:3000` (Create React App default). Check the terminal output for the correct one. @@ -205,8 +192,8 @@ The frontend application is typically developed and run separately from the back To run the full MOSS application locally for development, you will typically need **three separate terminals** running concurrently (ensure the Python virtual environment is activated in the backend terminals): 1. **Terminal 1:** Backend API Server (`uvicorn backend.main:app ...`) -2. **Terminal 2:** Celery Worker (`celery -A backend.celery_app worker ...`) -3. **Terminal 3:** Frontend Development Server (`cd frontend && npm run dev`) +1. **Terminal 2:** Celery Worker (`celery -A backend.celery_app worker ...`) +1. 
**Terminal 3:** Frontend Development Server (`cd frontend && npm run dev`) *(Remember to have PostgreSQL and Redis running in the background as well).* ## Running Database Migrations Manually @@ -219,7 +206,7 @@ If you make changes to the database models (`backend/data/models/`) later, you w ``` *(Review the generated script in `backend/data/migrations/versions/`)* -2. **Apply the migration:** +1. **Apply the migration:** ```bash python scripts/setup_db.py ``` From be307f5ad95c953fbc54583e20b55244ab6329ff Mon Sep 17 00:00:00 2001 From: Ali Tavallaie Date: Mon, 12 May 2025 23:52:03 +0330 Subject: [PATCH 5/6] WIP: add pre-commit config --- .pre-commit-config.yaml | 12 + Older Experiments/docs/gen_ref_pages.py | 27 +- .../clients/github_client.py | 127 +- .../clients/openalex_client.py | 97 +- .../scrappy-proof-of-concept/config.py | 11 +- .../scrappy-proof-of-concept/db/database.py | 11 +- .../scrappy-proof-of-concept/main.py | 222 +-- .../scrappy-proof-of-concept/models/models.py | 277 ++-- .../queries/acf_query.py | 693 +++++---- .../queries/analysis_history.py | 353 +++-- .../queries/citing_works.py | 111 +- .../queries/externalcontributors.py | 42 +- .../queries/institution_analysis_query.py | 1166 +++++++++------- .../queries/interactive_query.py | 160 ++- .../scrappy-proof-of-concept/queries/top10.py | 21 +- .../queries/top_domains.py | 32 +- .../queries/top_fields.py | 32 +- .../queries/top_subfields.py | 32 +- .../queries/top_topics.py | 32 +- .../scrappy-proof-of-concept/queries/usage.py | 48 +- .../services/acf_base.py | 19 +- .../services/acf_filters/__init__.py | 2 +- .../acf_filters/comprehensive_filter.py | 453 +++--- .../services/acf_framework.py | 449 +++--- .../services/discovery.py | 47 +- .../services/entity_service.py | 491 ++++--- .../services/github_ingestion.py | 701 ++++++---- .../services/ingestion_service.py | 115 +- .../services/institution_analysis.py | 191 +-- .../institution_analysis_impl/person_acf.py | 213 +-- 
.../institution_analysis_impl/surfacing.py | 647 +++++---- .../services/openalex_ingestion.py | 405 ++++-- .../services/query_service.py | 23 +- .../scrappy-proof-of-concept/utils/common.py | 19 +- .../scrappy-proof-of-concept/utils/filters.py | 4 + .../utils/logging_config.py | 6 +- .../utils/repo_finder.py | 64 +- Older Experiments/scripts/ecosyst.ms-api.py | 12 +- .../scripts/repo_cite/repo_cite.py | 414 ++++-- .../scripts/repo_cite/test_repo_cite.py | 76 +- .../scripts/repo_finder/repofinder.py | 894 ++++++++---- backend/api/__init__.py | 2 +- backend/api/deps.py | 6 +- backend/api/v1/__init__.py | 2 +- backend/api/v1/api.py | 24 +- backend/api/v1/endpoints/__init__.py | 2 +- .../v1/endpoints/affiliation_algorithms.py | 246 ++-- .../api/v1/endpoints/discovery_algorithms.py | 187 ++- backend/api/v1/endpoints/history.py | 154 +- backend/api/v1/endpoints/ingestion.py | 205 ++- backend/api/v1/endpoints/retrieval.py | 160 ++- backend/api/v1/endpoints/search.py | 167 ++- backend/api/v1/endpoints/shared_recipes.py | 156 ++- backend/api/v1/endpoints/surfacing.py | 340 +++-- backend/celery_app.py | 44 +- backend/config/__init__.py | 2 +- backend/config/logging_config.py | 120 +- backend/config/settings.py | 44 +- backend/data/__init__.py | 2 +- backend/data/database.py | 27 +- backend/data/migrations/env.py | 17 +- ...31_phase_10_add_repository_institution_.py | 80 +- ...9702b_phase_2_add_keyword_search_models.py | 120 +- ...a4cf052_allow_null_entity_id_in_entity_.py | 25 +- ...a9c_phase_19_add_domain_field_subfield_.py | 252 ++-- ...phase_21_add_pullrequest_issue_comment_.py | 359 +++-- ...e8_phase_3_add_scholarly_entity_models_.py | 244 ++-- ...9ca94_phase_1_initial_core_schema_with_.py | 567 +++++--- ...64e5_phase_6_add_topics_and_license_to_.py | 21 +- ...c_phase_19_add_concept_and_workconcept_.py | 22 +- ..._phase_18_add_software_dependency_table.py | 143 +- ...4bf_phase_10_3_add_github_organization_.py | 16 +- backend/data/models/__init__.py | 12 +- 
backend/data/models/affiliation.py | 22 +- backend/data/models/authorship.py | 22 +- backend/data/models/base.py | 6 +- backend/data/models/contributor.py | 24 +- backend/data/models/discovery_chain.py | 66 +- backend/data/models/doi_reference.py | 31 +- backend/data/models/domain.py | 19 +- .../models/entity_discovery_association.py | 49 +- backend/data/models/field.py | 31 +- backend/data/models/institution.py | 36 +- backend/data/models/issue.py | 48 +- backend/data/models/issue_comment.py | 44 +- .../models/keyword_repository_association.py | 22 +- backend/data/models/keyword_search_session.py | 37 +- backend/data/models/owner.py | 43 +- backend/data/models/person.py | 36 +- backend/data/models/pr_review_comment.py | 46 +- backend/data/models/pull_request.py | 52 +- backend/data/models/repository.py | 51 +- backend/data/models/repository_contributor.py | 26 +- .../repository_institution_affiliation.py | 41 +- backend/data/models/software_dependency.py | 37 +- backend/data/models/subfield.py | 31 +- backend/data/models/topic.py | 28 +- backend/data/models/types.py | 34 +- backend/data/models/work.py | 57 +- backend/data/models/work_citation.py | 17 +- backend/data/models/work_topic.py | 12 +- backend/data/repositories/__init__.py | 11 +- backend/data/repositories/base_repository.py | 73 +- backend/data/repositories/contributor_repo.py | 110 +- .../data/repositories/discovery_chain_repo.py | 12 +- .../data/repositories/doi_reference_repo.py | 41 +- backend/data/repositories/domain_repo.py | 79 +- .../repositories/entity_discovery_repo.py | 34 +- backend/data/repositories/field_repo.py | 97 +- backend/data/repositories/institution_repo.py | 279 ++-- .../data/repositories/issue_comment_repo.py | 74 +- backend/data/repositories/issue_repo.py | 94 +- .../keyword_repository_association_repo.py | 40 +- .../keyword_search_session_repo.py | 6 +- backend/data/repositories/owner_repo.py | 112 +- backend/data/repositories/person_repo.py | 283 ++-- 
.../repositories/pr_review_comment_repo.py | 89 +- .../data/repositories/pull_request_repo.py | 102 +- ...repository_institution_affiliation_repo.py | 48 +- backend/data/repositories/repository_repo.py | 143 +- .../repositories/software_dependency_repo.py | 75 +- backend/data/repositories/subfield_repo.py | 97 +- backend/data/repositories/topic_repo.py | 105 +- backend/data/repositories/work_repo.py | 282 ++-- backend/external/__init__.py | 2 +- backend/external/client_base.py | 140 +- backend/external/github_client.py | 457 ++++-- backend/external/openalex_client.py | 373 +++-- backend/main.py | 24 +- backend/schemas/__init__.py | 2 +- backend/schemas/requests.py | 36 +- backend/schemas/responses.py | 504 +++++-- backend/services/__init__.py | 6 +- backend/services/base_service.py | 4 +- backend/services/discovery_chain_service.py | 255 ++-- backend/services/doi_processing_service.py | 449 ++++-- backend/services/ingestion_service.py | 1240 +++++++++++------ backend/services/keyword_discovery_service.py | 336 +++-- .../services/scholarly_processing_service.py | 832 +++++++---- backend/services/surfacing_service.py | 496 ++++--- backend/tasks/__init__.py | 2 +- backend/tasks/discovery_tasks.py | 154 +- backend/tasks/scholarly_tasks.py | 1107 ++++++++++----- backend/utils/__init__.py | 2 +- backend/utils/doi_utils.py | 21 +- backend/utils/github_utils.py | 50 +- backend/utils/recipe_executor.py | 161 ++- backend/utils/recipe_utils.py | 153 +- .../contributor_affiliation_match_v1.py | 121 +- .../keyword_match_v1.py | 130 +- .../readme_mention_v1.py | 155 ++- .../keyword_discovery_v1.py | 39 +- .../citation_community_detection_v1.py | 108 +- contrib/queries/citing_work_subjects_v1.py | 164 ++- .../queries/engaged_non_pr_contributors_v1.py | 49 +- .../queries/institutional_authorship_v1.py | 64 +- ...stitutional_contribution_aggregation_v1.py | 62 +- contrib/queries/repo_health_v1.py | 134 +- contrib/queries/top_pr_contributors_v1.py | 32 +- 
contrib/queries/top_subjects_v1.py | 254 +++- .../queries/works_by_citing_institution_v1.py | 100 +- pyproject.toml | 5 + scripts/setup_db.py | 15 +- uv.lock | 92 ++ 164 files changed, 15608 insertions(+), 8495 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f1c490b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.11.9 + hooks: + # Run the linter. + - id: ruff + types_or: [ python, pyi ] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + types_or: [ python, pyi ] \ No newline at end of file diff --git a/Older Experiments/docs/gen_ref_pages.py b/Older Experiments/docs/gen_ref_pages.py index d135324..7a83213 100644 --- a/Older Experiments/docs/gen_ref_pages.py +++ b/Older Experiments/docs/gen_ref_pages.py @@ -1,4 +1,5 @@ """Generate the code reference pages and navigation.""" + # from: https://mkdocstrings.github.io/recipes/#bind-pages-to-sections-themselves import os from pathlib import Path @@ -7,29 +8,29 @@ nav = mkdocs_gen_files.Nav() -src = os.path.join("src", "moss", "lib") +src = os.path.join('src', 'moss', 'lib') -for path in sorted(Path(src).rglob("*.py")): - module_path = src / path.relative_to(src).with_suffix("") - doc_path = src / path.relative_to(src).with_suffix(".md") - full_doc_path = Path("reference", doc_path) +for path in sorted(Path(src).rglob('*.py')): + module_path = src / path.relative_to(src).with_suffix('') + doc_path = src / path.relative_to(src).with_suffix('.md') + full_doc_path = Path('reference', doc_path) parts = tuple(module_path.parts) - if parts[-1] == "__init__": + if parts[-1] == '__init__': parts = parts[:-1] - doc_path = doc_path.with_name("index.md") - full_doc_path = full_doc_path.with_name("index.md") - elif parts[-1] == "__main__": + doc_path = doc_path.with_name('index.md') + full_doc_path = 
full_doc_path.with_name('index.md') + elif parts[-1] == '__main__': continue nav[parts] = doc_path.as_posix() - with mkdocs_gen_files.open(full_doc_path, "w") as fd: - ident = ".".join(parts) - fd.write(f"::: {ident}") + with mkdocs_gen_files.open(full_doc_path, 'w') as fd: + ident = '.'.join(parts) + fd.write(f'::: {ident}') mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) -with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: +with mkdocs_gen_files.open('reference/SUMMARY.md', 'w') as nav_file: nav_file.writelines(nav.build_literate_nav()) diff --git a/Older Experiments/scrappy-proof-of-concept/clients/github_client.py b/Older Experiments/scrappy-proof-of-concept/clients/github_client.py index 640ad43..880e4bc 100644 --- a/Older Experiments/scrappy-proof-of-concept/clients/github_client.py +++ b/Older Experiments/scrappy-proof-of-concept/clients/github_client.py @@ -1,12 +1,14 @@ # clients/github_client.py -import requests -import time import json import logging +import time + +import requests from config import GITHUB_API_BASE_URL # Use centralized config logger = logging.getLogger(__name__) + class GitHubClient: BASE_URL = GITHUB_API_BASE_URL @@ -19,7 +21,7 @@ def __init__(self, token=None, default_timeout=30): self.default_timeout = default_timeout self.headers = {} if token: - self.headers["Authorization"] = f"token {token}" + self.headers['Authorization'] = f'token {token}' def get(self, url, params=None): """ @@ -29,72 +31,83 @@ def get(self, url, params=None): attempt = 0 while attempt < max_retries: attempt += 1 - logger.debug(f"[GET Attempt {attempt}/{max_retries}] URL={url} Params={params}") + logger.debug( + f'[GET Attempt {attempt}/{max_retries}] URL={url} Params={params}' + ) try: response = requests.get( url, headers=self.headers, params=params, - timeout=self.default_timeout + timeout=self.default_timeout, ) if response.status_code == 200: - logger.debug(f"[GET {url}] -> 200 OK") + logger.debug(f'[GET {url}] -> 
200 OK') try: return response.json() except json.JSONDecodeError as e: - logger.error(f"[GET {url}] JSON parse error: {e}") + logger.error(f'[GET {url}] JSON parse error: {e}') return None elif response.status_code == 403: try: error_json = response.json() except json.JSONDecodeError: error_json = {} - message = error_json.get("message", "").lower() - if "rate limit exceeded" in message: - reset_timestamp = response.headers.get("X-RateLimit-Reset") - remaining = response.headers.get("X-RateLimit-Remaining") - logger.warning("GitHub rate limit exceeded!") - logger.warning(f"X-RateLimit-Remaining: {remaining}") - logger.warning(f"X-RateLimit-Reset: {reset_timestamp}") + message = error_json.get('message', '').lower() + if 'rate limit exceeded' in message: + reset_timestamp = response.headers.get('X-RateLimit-Reset') + remaining = response.headers.get('X-RateLimit-Remaining') + logger.warning('GitHub rate limit exceeded!') + logger.warning(f'X-RateLimit-Remaining: {remaining}') + logger.warning(f'X-RateLimit-Reset: {reset_timestamp}') if reset_timestamp: reset_ts = int(reset_timestamp) current_ts = int(time.time()) sleep_time = reset_ts - current_ts + 1 if sleep_time < 1: sleep_time = 1 - logger.warning(f"Sleeping for {sleep_time} seconds (rate limit).") + logger.warning( + f'Sleeping for {sleep_time} seconds (rate limit).' + ) time.sleep(sleep_time) continue else: - logger.warning("No X-RateLimit-Reset header found. Sleeping 60s.") + logger.warning( + 'No X-RateLimit-Reset header found. Sleeping 60s.' 
+ ) time.sleep(60) continue else: - logger.error(f"[GET {url}] 403 Forbidden: {response.text}") + logger.error(f'[GET {url}] 403 Forbidden: {response.text}') return None else: - logger.error(f"[GET {url}] -> {response.status_code} {response.reason}") - logger.error(f"Response Text: {response.text}") + logger.error( + f'[GET {url}] -> {response.status_code} {response.reason}' + ) + logger.error(f'Response Text: {response.text}') return None - except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout) as e: - logger.warning(f"[GET {url}] Timeout on attempt {attempt}. Error: {e}") + except ( + requests.exceptions.ConnectTimeout, + requests.exceptions.ReadTimeout, + ) as e: + logger.warning(f'[GET {url}] Timeout on attempt {attempt}. Error: {e}') if attempt < max_retries: backoff = 5 * attempt - logger.warning(f"Retrying in {backoff} seconds...") + logger.warning(f'Retrying in {backoff} seconds...') time.sleep(backoff) else: - logger.error("Max retries reached. Giving up.") + logger.error('Max retries reached. Giving up.') return None except requests.exceptions.RequestException as e: - logger.error(f"[GET {url}] RequestException on attempt {attempt}: {e}") + logger.error(f'[GET {url}] RequestException on attempt {attempt}: {e}') if attempt < max_retries: backoff = 5 * attempt - logger.warning(f"Retrying in {backoff} seconds...") + logger.warning(f'Retrying in {backoff} seconds...') time.sleep(backoff) else: - logger.error("Max retries reached. Giving up.") + logger.error('Max retries reached. Giving up.') return None - logger.error(f"[GET {url}] All retries exhausted. Returning None.") + logger.error(f'[GET {url}] All retries exhausted. 
Returning None.') return None def get_all_pages(self, url, params=None): @@ -105,107 +118,111 @@ def get_all_pages(self, url, params=None): page = 1 while True: local_params = params.copy() if params else {} - local_params.update({"page": page, "per_page": 100}) - logger.info(f"Fetching page {page} of {url}") + local_params.update({'page': page, 'per_page': 100}) + logger.info(f'Fetching page {page} of {url}') items = self.get(url, params=local_params) if not items: - logger.info(f"No more data for {url} on page {page}.") + logger.info(f'No more data for {url} on page {page}.') break if isinstance(items, list): all_items.extend(items) - logger.info(f"Fetched {len(items)} items from page {page}.") + logger.info(f'Fetched {len(items)} items from page {page}.') if len(items) < 100: break else: - logger.info(f"Non-list response encountered. Ending pagination for {url}.") + logger.info( + f'Non-list response encountered. Ending pagination for {url}.' + ) break page += 1 time.sleep(1) - logger.info(f"Finished pagination for {url}, total items fetched: {len(all_items)}") + logger.info( + f'Finished pagination for {url}, total items fetched: {len(all_items)}' + ) return all_items def get_repository(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}" + url = f'{self.BASE_URL}/repos/{owner}/{repo}' return self.get(url) def get_user(self, username): - url = f"{self.BASE_URL}/users/{username}" + url = f'{self.BASE_URL}/users/{username}' return self.get(url) def get_organization(self, org_login): - url = f"{self.BASE_URL}/orgs/{org_login}" + url = f'{self.BASE_URL}/orgs/{org_login}' return self.get(url) def get_branches(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/branches" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/branches' return self.get_all_pages(url) def get_tags(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/tags" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/tags' return self.get_all_pages(url) def 
get_commits(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/commits" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/commits' return self.get_all_pages(url) def get_labels(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/labels" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/labels' return self.get_all_pages(url) def get_milestones(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/milestones" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/milestones' return self.get_all_pages(url) def get_releases(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/releases" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/releases' return self.get_all_pages(url) def get_webhooks(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/hooks" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/hooks' return self.get_all_pages(url) def get_events(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/events" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/events' return self.get_all_pages(url) def get_collaborators(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/collaborators" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/collaborators' return self.get_all_pages(url) def get_workflows(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/actions/workflows" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/actions/workflows' data = self.get(url) if data and isinstance(data, dict): - return data.get("workflows", []) + return data.get('workflows', []) return [] def get_workflow_runs(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/actions/runs" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/actions/runs' data = self.get(url) if data and isinstance(data, dict): - return data.get("workflow_runs", []) + return data.get('workflow_runs', []) return [] def get_readme(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/readme" + url = 
f'{self.BASE_URL}/repos/{owner}/{repo}/readme' return self.get(url) def get_discussions(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/discussions" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/discussions' return self.get_all_pages(url) def get_citation_cff(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/contents/CITATION.cff" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/contents/CITATION.cff' return self.get(url) def get_traffic_views(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/traffic/views" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/traffic/views' return self.get(url) def get_traffic_clones(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/traffic/clones" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/traffic/clones' return self.get(url) def get_traffic_popular_paths(self, owner, repo): - url = f"{self.BASE_URL}/repos/{owner}/{repo}/traffic/popular/paths" + url = f'{self.BASE_URL}/repos/{owner}/{repo}/traffic/popular/paths' return self.get(url) diff --git a/Older Experiments/scrappy-proof-of-concept/clients/openalex_client.py b/Older Experiments/scrappy-proof-of-concept/clients/openalex_client.py index 39a3e1a..7144b5b 100644 --- a/Older Experiments/scrappy-proof-of-concept/clients/openalex_client.py +++ b/Older Experiments/scrappy-proof-of-concept/clients/openalex_client.py @@ -1,32 +1,38 @@ # clients/openalex_client.py -import requests import logging -from utils.common import clean_doi + +import requests from config import OPENALEX_BASE_URL # Use centralized configuration +from utils.common import clean_doi logger = logging.getLogger(__name__) + class OpenAlexClient: BASE_URL = OPENALEX_BASE_URL def __init__(self): - self.headers = {"User-Agent": "MyGitHubOpenAlexApp/1.0 (your_email@example.com)"} + self.headers = { + 'User-Agent': 'MyGitHubOpenAlexApp/1.0 (your_email@example.com)' + } def get_work_by_doi(self, doi): doi = clean_doi(doi).lower() - url = 
f"{self.BASE_URL}/works/doi:{doi}" + url = f'{self.BASE_URL}/works/doi:{doi}' try: response = requests.get(url, headers=self.headers, timeout=30) if response.status_code == 200: - logger.debug(f"Fetched work for DOI {doi}.") + logger.debug(f'Fetched work for DOI {doi}.') return response.json() else: - logger.error(f"OpenAlex: Failed to fetch work for DOI {doi} (status: {response.status_code}).") + logger.error( + f'OpenAlex: Failed to fetch work for DOI {doi} (status: {response.status_code}).' + ) return None except Exception as e: - logger.error(f"OpenAlex: Exception while fetching work for DOI {doi}: {e}") + logger.error(f'OpenAlex: Exception while fetching work for DOI {doi}: {e}') return None - + def get_work_by_id(self, openalex_id): """ Fetch a work by its OpenAlex ID. @@ -34,63 +40,90 @@ def get_work_by_id(self, openalex_id): # If the ID is the full URL, extract just the ID part if openalex_id.startswith('https://'): openalex_id = openalex_id.split('/')[-1] - - url = f"{self.BASE_URL}/works/{openalex_id}" + + url = f'{self.BASE_URL}/works/{openalex_id}' try: response = requests.get(url, headers=self.headers, timeout=30) if response.status_code == 200: - logger.debug(f"Fetched work for ID {openalex_id}.") + logger.debug(f'Fetched work for ID {openalex_id}.') return response.json() else: - logger.error(f"OpenAlex: Failed to fetch work for ID {openalex_id} (status: {response.status_code}).") + logger.error( + f'OpenAlex: Failed to fetch work for ID {openalex_id} (status: {response.status_code}).' 
+ ) return None except Exception as e: - logger.error(f"OpenAlex: Exception while fetching work for ID {openalex_id}: {e}") + logger.error( + f'OpenAlex: Exception while fetching work for ID {openalex_id}: {e}' + ) return None def get_additional_works_for_author(self, author_openalex_id, per_page=5): - url = f"{self.BASE_URL}/works" - params = {"filter": f"authorships.author.id:{author_openalex_id}", "per_page": per_page} + url = f'{self.BASE_URL}/works' + params = { + 'filter': f'authorships.author.id:{author_openalex_id}', + 'per_page': per_page, + } try: - response = requests.get(url, headers=self.headers, params=params, timeout=30) + response = requests.get( + url, headers=self.headers, params=params, timeout=30 + ) if response.status_code == 200: data = response.json() - logger.debug(f"Fetched {len(data.get('results', []))} additional works for author {author_openalex_id}.") - return data.get("results", []) + logger.debug( + f'Fetched {len(data.get("results", []))} additional works for author {author_openalex_id}.' + ) + return data.get('results', []) else: - logger.error(f"OpenAlex: Failed to fetch additional works for author {author_openalex_id}.") + logger.error( + f'OpenAlex: Failed to fetch additional works for author {author_openalex_id}.' + ) return [] except Exception as e: - logger.error(f"OpenAlex: Exception while fetching additional works for author {author_openalex_id}: {e}") + logger.error( + f'OpenAlex: Exception while fetching additional works for author {author_openalex_id}: {e}' + ) return [] def get_citing_works(self, work_openalex_id, per_page=200): """ Retrieve all citing works for a given work using explicit pagination. 
""" - short_id = work_openalex_id.split("/")[-1] + short_id = work_openalex_id.split('/')[-1] page = 1 all_results = [] while True: - url = f"{self.BASE_URL}/works" - params = {"filter": f"cites:{short_id}", "per_page": per_page, "page": page} - logger.debug(f"Fetching citing works for {work_openalex_id}: page {page} with params {params}") + url = f'{self.BASE_URL}/works' + params = {'filter': f'cites:{short_id}', 'per_page': per_page, 'page': page} + logger.debug( + f'Fetching citing works for {work_openalex_id}: page {page} with params {params}' + ) try: - response = requests.get(url, headers=self.headers, params=params, timeout=30) + response = requests.get( + url, headers=self.headers, params=params, timeout=30 + ) if response.status_code == 200: data = response.json() - results = data.get("results", []) - logger.debug(f"Page {page}: Retrieved {len(results)} works.") + results = data.get('results', []) + logger.debug(f'Page {page}: Retrieved {len(results)} works.') if not results: - logger.info(f"No more citing works found on page {page}. Total works: {len(all_results)}") + logger.info( + f'No more citing works found on page {page}. Total works: {len(all_results)}' + ) break all_results.extend(results) page += 1 else: - logger.error(f"OpenAlex: Failed to fetch citing works for {work_openalex_id} on page {page} (status: {response.status_code}).") + logger.error( + f'OpenAlex: Failed to fetch citing works for {work_openalex_id} on page {page} (status: {response.status_code}).' 
+ ) break except Exception as e: - logger.error(f"OpenAlex: Exception while fetching citing works for {work_openalex_id} on page {page}: {e}") + logger.error( + f'OpenAlex: Exception while fetching citing works for {work_openalex_id} on page {page}: {e}' + ) break - logger.info(f"Total citing works fetched for {work_openalex_id}: {len(all_results)}") - return all_results \ No newline at end of file + logger.info( + f'Total citing works fetched for {work_openalex_id}: {len(all_results)}' + ) + return all_results diff --git a/Older Experiments/scrappy-proof-of-concept/config.py b/Older Experiments/scrappy-proof-of-concept/config.py index 2d5da9b..fcf5e56 100644 --- a/Older Experiments/scrappy-proof-of-concept/config.py +++ b/Older Experiments/scrappy-proof-of-concept/config.py @@ -1,19 +1,20 @@ # config.py import os + from dotenv import load_dotenv # Load environment variables from the .env file load_dotenv() # GitHub Configuration -GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") -GITHUB_API_BASE_URL = os.getenv("GITHUB_API_BASE_URL", "https://api.github.com") +GITHUB_TOKEN = os.getenv('GITHUB_TOKEN') +GITHUB_API_BASE_URL = os.getenv('GITHUB_API_BASE_URL', 'https://api.github.com') # OpenAlex Configuration -OPENALEX_BASE_URL = os.getenv("OPENALEX_BASE_URL", "https://api.openalex.org") +OPENALEX_BASE_URL = os.getenv('OPENALEX_BASE_URL', 'https://api.openalex.org') # Database Configuration -DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///mosspoc.db") +DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///mosspoc.db') # Logging Configuration -LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") +LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO') diff --git a/Older Experiments/scrappy-proof-of-concept/db/database.py b/Older Experiments/scrappy-proof-of-concept/db/database.py index 074afa8..3b6d12d 100644 --- a/Older Experiments/scrappy-proof-of-concept/db/database.py +++ b/Older Experiments/scrappy-proof-of-concept/db/database.py @@ -1,16 +1,18 @@ # db/database.py -from sqlalchemy import 
create_engine -from sqlalchemy.orm import sessionmaker, configure_mappers -from models.models import Base -from config import DATABASE_URL # Use centralized configuration from contextlib import contextmanager +from config import DATABASE_URL # Use centralized configuration +from models.models import Base +from sqlalchemy import create_engine +from sqlalchemy.orm import configure_mappers, sessionmaker + # Create the engine using DATABASE_URL from config.py engine = create_engine(DATABASE_URL, echo=False) # Create a configured Session class with expire_on_commit set to False SessionLocal = sessionmaker(bind=engine, expire_on_commit=False) + def init_db(): """ Initialize the DB, ensuring that all tables (including versioning tables) are created. @@ -18,6 +20,7 @@ def init_db(): configure_mappers() Base.metadata.create_all(bind=engine) + @contextmanager def get_db_session(): """ diff --git a/Older Experiments/scrappy-proof-of-concept/main.py b/Older Experiments/scrappy-proof-of-concept/main.py index 8055823..f073400 100644 --- a/Older Experiments/scrappy-proof-of-concept/main.py +++ b/Older Experiments/scrappy-proof-of-concept/main.py @@ -1,189 +1,231 @@ import sys + from config import GITHUB_TOKEN -from utils.logging_config import setup_logging from db.database import init_db +from services import ingestion_service from utils.common import parse_github_url -from services import ingestion_service, query_service -from datetime import datetime +from utils.logging_config import setup_logging setup_logging() + def main_menu(): - print("Welcome to the Unified GitHub & OpenAlex Data Application") + print('Welcome to the Unified GitHub & OpenAlex Data Application') while True: - print("\nMain Menu:") - print("1) Ingest a single repository") - print("2) Search and ingest repositories by keyword") - print("3) Run interactive query mode") - print("4) Find repositories associated with your institution") - print("5) View analysis history and trends") # New option - print("0) 
Exit") - choice = input("Enter your choice: ").strip() - - if choice == "1": + print('\nMain Menu:') + print('1) Ingest a single repository') + print('2) Search and ingest repositories by keyword') + print('3) Run interactive query mode') + print('4) Find repositories associated with your institution') + print('5) View analysis history and trends') # New option + print('0) Exit') + choice = input('Enter your choice: ').strip() + + if choice == '1': # Capture pre-ingestion counts. pre_counts = ingestion_service.get_ingestion_counts() - - repo_url = input("Enter repository URL: ").strip() + + repo_url = input('Enter repository URL: ').strip() owner, repo_name = parse_github_url(repo_url) if not owner or not repo_name: - print("Invalid repository URL provided.") + print('Invalid repository URL provided.') continue - + # NEW: Check if repository already exists existing_repo = ingestion_service.check_repository_exists(owner, repo_name) if existing_repo: - print(f"\nRepository '{existing_repo.full_name}' is already in the database.") - print(f"Last ingested: {existing_repo.ingested_at}") - + print( + f"\nRepository '{existing_repo.full_name}' is already in the database." 
+ ) + print(f'Last ingested: {existing_repo.ingested_at}') + # Show associated data counts - doi_count = ingestion_service.get_repository_doi_counts(existing_repo.id) - print(f"DOIs associated: {doi_count}") - + doi_count = ingestion_service.get_repository_doi_counts( + existing_repo.id + ) + print(f'DOIs associated: {doi_count}') + # Show discovery events events = ingestion_service.get_discovery_events(existing_repo.id) if events: - print(f"Discovery chain: {events[0].chain_id}") - print(f"Discovery method: {events[0].discovery_method}") - print(f"Original trigger: {events[0].url or events[0].keyword or 'Direct'}") - + print(f'Discovery chain: {events[0].chain_id}') + print(f'Discovery method: {events[0].discovery_method}') + print( + f'Original trigger: {events[0].url or events[0].keyword or "Direct"}' + ) + # Ask if user wants to re-ingest - reingest = input("\nDo you want to re-ingest this repository? (y/n): ").strip().lower() + reingest = ( + input('\nDo you want to re-ingest this repository? (y/n): ') + .strip() + .lower() + ) if reingest != 'y': continue - - token = input("Enter GitHub token (or press Enter to use the default token): ").strip() or GITHUB_TOKEN + + token = ( + input( + 'Enter GitHub token (or press Enter to use the default token): ' + ).strip() + or GITHUB_TOKEN + ) try: # Pass the repository URL as trigger_input. - repo = ingestion_service.ingest_repository(owner, repo_name, token, trigger_input=repo_url) + repo = ingestion_service.ingest_repository( + owner, repo_name, token, trigger_input=repo_url + ) print(f"Repository '{repo.full_name}' ingested successfully.") except Exception as e: - print(f"Error ingesting repository: {e}") + print(f'Error ingesting repository: {e}') continue - + # Capture post-ingestion counts and output the summary. 
post_counts = ingestion_service.get_ingestion_counts() print(ingestion_service.print_ingestion_summary(pre_counts, post_counts)) - - elif choice == "2": + + elif choice == '2': # Capture pre-ingestion counts. pre_counts = ingestion_service.get_ingestion_counts() - - keywords_input = input("Enter search keywords (comma-separated): ").strip() + + keywords_input = input('Enter search keywords (comma-separated): ').strip() if not keywords_input: - print("No keywords provided.") + print('No keywords provided.') continue - + # NEW: Convert the input to a list of keywords keyword_list = [k.strip() for k in keywords_input.split(',') if k.strip()] - + # NEW: Check which keywords have been used before from services.acf_framework import find_keyword_matches + keyword_matches = find_keyword_matches(keyword_list) - + # NEW: Display keyword status - print("\n=== Keyword Status ===") - + print('\n=== Keyword Status ===') + used_keywords = [] new_keywords = [] - + for keyword in keyword_list: if keyword in keyword_matches: used_keywords.append(keyword) else: new_keywords.append(keyword) - + if new_keywords: - print("New keywords:") + print('New keywords:') for kw in new_keywords: - print(f" - {kw}") - + print(f' - {kw}') + if used_keywords: - print("\nPreviously used keywords:") + print('\nPreviously used keywords:') for kw in used_keywords: stats = keyword_matches[kw] - last_run = stats['last_run'].strftime("%Y-%m-%d %H:%M") + last_run = stats['last_run'].strftime('%Y-%m-%d %H:%M') repo_count = stats['repository_count'] - print(f" - {kw} (Last run: {last_run}, Repositories found: {repo_count})") - + print( + f' - {kw} (Last run: {last_run}, Repositories found: {repo_count})' + ) + # NEW: Option to remove already used keywords if used_keywords: - remove_used = input("\nDo you want to remove already used keywords? (y/n): ").strip().lower() + remove_used = ( + input('\nDo you want to remove already used keywords? 
(y/n): ') + .strip() + .lower() + ) if remove_used == 'y': keyword_list = new_keywords - print(f"Kept {len(keyword_list)} new keywords.") - + print(f'Kept {len(keyword_list)} new keywords.') + # NEW: Option to modify the keyword list - modify = input("\nDo you want to modify the keyword list? (y/n): ").strip().lower() + modify = ( + input('\nDo you want to modify the keyword list? (y/n): ') + .strip() + .lower() + ) if modify == 'y': - print("Enter keywords one per line. Empty line to finish.") + print('Enter keywords one per line. Empty line to finish.') modified_keywords = [] while True: - keyword = input("> ").strip() + keyword = input('> ').strip() if not keyword: break modified_keywords.append(keyword) - + if modified_keywords: keyword_list = modified_keywords - + # NEW: Confirm keyword list if not keyword_list: - print("Keyword list is empty. Returning to main menu.") + print('Keyword list is empty. Returning to main menu.') continue - - print("\n=== Final Keyword List ===") + + print('\n=== Final Keyword List ===') for i, kw in enumerate(keyword_list, 1): - print(f"{i}. {kw}") - - confirm = input("\nProceed with these keywords? (y/n): ").strip().lower() + print(f'{i}. {kw}') + + confirm = input('\nProceed with these keywords? (y/n): ').strip().lower() if confirm != 'y': continue - + # Convert back to comma-separated string for existing function keywords = ','.join(keyword_list) - - token = input("Enter GitHub token (or press Enter to use the default token): ").strip() or GITHUB_TOKEN + + token = ( + input( + 'Enter GitHub token (or press Enter to use the default token): ' + ).strip() + or GITHUB_TOKEN + ) # Pass keywords as trigger_input. 
- repos = ingestion_service.search_and_ingest_repositories(token, keywords, trigger_input=keywords) + repos = ingestion_service.search_and_ingest_repositories( + token, keywords, trigger_input=keywords + ) print(f"Ingested {len(repos)} repositories matching '{keywords}'.") - + # Capture post-ingestion counts and output the summary. post_counts = ingestion_service.get_ingestion_counts() print(ingestion_service.print_ingestion_summary(pre_counts, post_counts)) - - elif choice == "3": + + elif choice == '3': # Launch the interactive query experience try: - import queries.interactive_query as interactive_query + import queries.interactive_query as interactive_query + interactive_query.interactive_query() except Exception as e: - print(f"Error running interactive query mode: {e}") - - elif choice == "4": + print(f'Error running interactive query mode: {e}') + + elif choice == '4': # Launch the institutional repository discovery try: # Updated import to use the new implementation - from queries.institution_analysis_query import institutional_repository_discovery + from queries.institution_analysis_query import ( + institutional_repository_discovery, + ) + institutional_repository_discovery() except Exception as e: - print(f"Error running institutional repository discovery: {e}") - - elif choice == "5": + print(f'Error running institutional repository discovery: {e}') + + elif choice == '5': # Launch the analysis history view try: from queries.analysis_history import main as analysis_history_main + analysis_history_main() except Exception as e: - print(f"Error viewing analysis history: {e}") - - elif choice == "0": - print("Exiting.") + print(f'Error viewing analysis history: {e}') + + elif choice == '0': + print('Exiting.') sys.exit(0) - + else: - print("Invalid choice. Please try again.") + print('Invalid choice. 
Please try again.') + -if __name__ == "__main__": +if __name__ == '__main__': init_db() - main_menu() \ No newline at end of file + main_menu() diff --git a/Older Experiments/scrappy-proof-of-concept/models/models.py b/Older Experiments/scrappy-proof-of-concept/models/models.py index fb65f3a..7abe92b 100644 --- a/Older Experiments/scrappy-proof-of-concept/models/models.py +++ b/Older Experiments/scrappy-proof-of-concept/models/models.py @@ -1,40 +1,62 @@ +from datetime import datetime, timezone + +from sqlalchemy import ( + Boolean, + Column, + DateTime, + Float, + ForeignKey, + Integer, + String, + Table, + Text, +) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, ForeignKey, Table, Float from sqlalchemy.orm import relationship from sqlalchemy_continuum import make_versioned -from datetime import datetime, timezone -import uuid make_versioned(user_cls=None) Base = declarative_base() + # --- Mixin for Ingestion Timestamp --- class IngestedAtMixin: - ingested_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), nullable=False) + ingested_at = Column( + DateTime, default=lambda: datetime.now(timezone.utc), nullable=False + ) + # --- New Audit Table for Discovery Events --- class DiscoveryEvent(Base): __tablename__ = 'discovery_events' id = Column(Integer, primary_key=True) - chain_id = Column(String, nullable=False) # Unique per ingestion session. - branch_id = Column(String, nullable=False) # Unique per discovery branch. - step_number = Column(Integer, nullable=False) # Depth relative to the trigger event. + chain_id = Column(String, nullable=False) # Unique per ingestion session. + branch_id = Column(String, nullable=False) # Unique per discovery branch. + step_number = Column( + Integer, nullable=False + ) # Depth relative to the trigger event. 
discovery_method = Column(String, nullable=False) details = Column(Text, nullable=False) - timestamp = Column(DateTime, default=lambda: datetime.now(timezone.utc), nullable=False) - ingestion_type = Column(String) # "direct ingestion" or "keyword ingestion" - url = Column(String) # Populated for direct ingestion. - keyword = Column(String) # Populated for keyword ingestion. - object_type = Column(String, nullable=False) # e.g. "Repository", "DOI", etc. - object_id = Column(String, nullable=False) # Stored as a string for flexibility. + timestamp = Column( + DateTime, default=lambda: datetime.now(timezone.utc), nullable=False + ) + ingestion_type = Column(String) # "direct ingestion" or "keyword ingestion" + url = Column(String) # Populated for direct ingestion. + keyword = Column(String) # Populated for keyword ingestion. + object_type = Column(String, nullable=False) # e.g. "Repository", "DOI", etc. + object_id = Column(String, nullable=False) # Stored as a string for flexibility. def __repr__(self): - return (f"") + return ( + f"" + ) + # --- GitHub Models --- + class User(IngestedAtMixin, Base): __tablename__ = 'users' __versioned__ = {} @@ -69,6 +91,7 @@ class User(IngestedAtMixin, Base): def __repr__(self): return f"" + class Organization(IngestedAtMixin, Base): __tablename__ = 'organizations' __versioned__ = {} @@ -81,6 +104,7 @@ class Organization(IngestedAtMixin, Base): def __repr__(self): return f"" + class Repository(IngestedAtMixin, Base): __tablename__ = 'repositories' __versioned__ = {} @@ -92,8 +116,8 @@ class Repository(IngestedAtMixin, Base): description = Column(Text) homepage = Column(String) language = Column(String) - topics = Column(Text) # Comma-separated list from GitHub topics - license = Column(Text) # JSON string or license name + topics = Column(Text) # Comma-separated list from GitHub topics + license = Column(Text) # JSON string or license name visibility = Column(String) default_branch = Column(String) archived = Column(Boolean) @@ 
-129,7 +153,12 @@ class Repository(IngestedAtMixin, Base): raw_data = Column(Text) # Relationships – explicitly tie the DOI relationship to this repository. - dois = relationship('DOI', back_populates='repository', cascade="all, delete-orphan", foreign_keys='DOI.repository_id') + dois = relationship( + 'DOI', + back_populates='repository', + cascade='all, delete-orphan', + foreign_keys='DOI.repository_id', + ) issues = relationship('Issue', back_populates='repository') pull_requests = relationship('PullRequest', back_populates='repository') branches = relationship('Branch', back_populates='repository') @@ -146,6 +175,7 @@ class Repository(IngestedAtMixin, Base): def __repr__(self): return f"" + class Branch(IngestedAtMixin, Base): __tablename__ = 'branches' id = Column(Integer, primary_key=True, autoincrement=True) @@ -157,6 +187,7 @@ class Branch(IngestedAtMixin, Base): def __repr__(self): return f"" + class Tag(IngestedAtMixin, Base): __tablename__ = 'tags' id = Column(Integer, primary_key=True, autoincrement=True) @@ -168,6 +199,7 @@ class Tag(IngestedAtMixin, Base): def __repr__(self): return f"" + class Commit(IngestedAtMixin, Base): __tablename__ = 'commits' sha = Column(String, primary_key=True) @@ -184,6 +216,7 @@ class Commit(IngestedAtMixin, Base): def __repr__(self): return f"" + class Issue(IngestedAtMixin, Base): __tablename__ = 'issues' id = Column(Integer, primary_key=True) # GitHub issue id @@ -205,6 +238,7 @@ class Issue(IngestedAtMixin, Base): def __repr__(self): return f"" + class PullRequest(IngestedAtMixin, Base): __tablename__ = 'pull_requests' id = Column(Integer, primary_key=True) # GitHub PR id @@ -227,6 +261,7 @@ class PullRequest(IngestedAtMixin, Base): def __repr__(self): return f"" + class IssueComment(IngestedAtMixin, Base): __tablename__ = 'issue_comments' id = Column(Integer, primary_key=True) # GitHub comment id @@ -241,7 +276,8 @@ class IssueComment(IngestedAtMixin, Base): issue = relationship('Issue', back_populates='comments') 
def __repr__(self): - return f"" + return f'' + class PRReviewComment(IngestedAtMixin, Base): __tablename__ = 'pr_review_comments' @@ -257,7 +293,8 @@ class PRReviewComment(IngestedAtMixin, Base): pull_request = relationship('PullRequest', back_populates='review_comments') def __repr__(self): - return f"" + return f'' + class PullRequestReview(IngestedAtMixin, Base): __tablename__ = 'pull_request_reviews' @@ -269,12 +306,13 @@ class PullRequestReview(IngestedAtMixin, Base): body = Column(Text) raw_data = Column(Text) - user = relationship("User", back_populates="pull_request_reviews") - pull_request = relationship("PullRequest", back_populates="reviews") + user = relationship('User', back_populates='pull_request_reviews') + pull_request = relationship('PullRequest', back_populates='reviews') def __repr__(self): return f"" + class Label(IngestedAtMixin, Base): __tablename__ = 'labels' id = Column(Integer, primary_key=True) @@ -289,6 +327,7 @@ class Label(IngestedAtMixin, Base): def __repr__(self): return f"" + class Milestone(IngestedAtMixin, Base): __tablename__ = 'milestones' id = Column(Integer, primary_key=True) @@ -304,6 +343,7 @@ class Milestone(IngestedAtMixin, Base): def __repr__(self): return f"" + class Release(IngestedAtMixin, Base): __tablename__ = 'releases' id = Column(Integer, primary_key=True) @@ -322,6 +362,7 @@ class Release(IngestedAtMixin, Base): def __repr__(self): return f"" + class Webhook(IngestedAtMixin, Base): __tablename__ = 'webhooks' id = Column(Integer, primary_key=True) @@ -337,6 +378,7 @@ class Webhook(IngestedAtMixin, Base): def __repr__(self): return f"" + class Event(IngestedAtMixin, Base): __tablename__ = 'events' id = Column(Integer, primary_key=True, autoincrement=True) @@ -350,6 +392,7 @@ class Event(IngestedAtMixin, Base): def __repr__(self): return f"" + class Workflow(IngestedAtMixin, Base): __tablename__ = 'workflows' id = Column(Integer, primary_key=True) @@ -363,6 +406,7 @@ class Workflow(IngestedAtMixin, Base): def 
__repr__(self): return f"" + class WorkflowRun(IngestedAtMixin, Base): __tablename__ = 'workflow_runs' id = Column(Integer, primary_key=True) @@ -377,48 +421,59 @@ class WorkflowRun(IngestedAtMixin, Base): repository = relationship('Repository', back_populates='workflow_runs') def __repr__(self): - return f"" + return f'' + class DOI(IngestedAtMixin, Base): __tablename__ = 'dois' __versioned__ = {} id = Column(Integer, primary_key=True, autoincrement=True) - repository_id = Column(Integer, ForeignKey('repositories.id', ondelete='CASCADE'), nullable=False) + repository_id = Column( + Integer, ForeignKey('repositories.id', ondelete='CASCADE'), nullable=False + ) doi = Column(String, index=True, nullable=False) source = Column(String, nullable=True) doi_metadata = Column(Text, nullable=True) - + repository = relationship('Repository', back_populates='dois') - + def __repr__(self): return f"" + # --- OpenAlex Models and Association Tables --- openalex_work_authors = Table( - 'openalex_work_authors', Base.metadata, + 'openalex_work_authors', + Base.metadata, Column('work_id', Integer, ForeignKey('openalex_works.id')), - Column('author_id', Integer, ForeignKey('openalex_authors.id')) + Column('author_id', Integer, ForeignKey('openalex_authors.id')), ) openalex_author_institutions = Table( - 'openalex_author_institutions', Base.metadata, + 'openalex_author_institutions', + Base.metadata, Column('author_id', Integer, ForeignKey('openalex_authors.id')), - Column('institution_id', Integer, ForeignKey('openalex_institutions.id')) + Column('institution_id', Integer, ForeignKey('openalex_institutions.id')), ) openalex_work_topics = Table( - 'openalex_work_topics', Base.metadata, + 'openalex_work_topics', + Base.metadata, Column('work_id', Integer, ForeignKey('openalex_works.id')), - Column('topic_id', Integer, ForeignKey('openalex_topics.id')) + Column('topic_id', Integer, ForeignKey('openalex_topics.id')), ) openalex_citations = Table( - 'openalex_citations', Base.metadata, - 
Column('citing_work_id', Integer, ForeignKey('openalex_works.id'), primary_key=True), - Column('cited_work_id', Integer, ForeignKey('openalex_works.id'), primary_key=True) + 'openalex_citations', + Base.metadata, + Column( + 'citing_work_id', Integer, ForeignKey('openalex_works.id'), primary_key=True + ), + Column('cited_work_id', Integer, ForeignKey('openalex_works.id'), primary_key=True), ) + class OpenAlexWork(IngestedAtMixin, Base): __tablename__ = 'openalex_works' id = Column(Integer, primary_key=True, autoincrement=True) @@ -433,21 +488,26 @@ class OpenAlexWork(IngestedAtMixin, Base): raw_data = Column(Text) venue_id = Column(Integer, ForeignKey('openalex_venues.id')) - venue = relationship("OpenAlexVenue", back_populates="works") + venue = relationship('OpenAlexVenue', back_populates='works') - authors = relationship("OpenAlexAuthor", secondary=openalex_work_authors, back_populates="works") - topics = relationship("OpenAlexTopic", secondary=openalex_work_topics, back_populates="works") + authors = relationship( + 'OpenAlexAuthor', secondary=openalex_work_authors, back_populates='works' + ) + topics = relationship( + 'OpenAlexTopic', secondary=openalex_work_topics, back_populates='works' + ) cited_works = relationship( - "OpenAlexWork", + 'OpenAlexWork', secondary=openalex_citations, primaryjoin=id == openalex_citations.c.citing_work_id, secondaryjoin=id == openalex_citations.c.cited_work_id, - backref="citing_works" + backref='citing_works', ) def __repr__(self): return f"" + class OpenAlexAuthor(IngestedAtMixin, Base): __tablename__ = 'openalex_authors' id = Column(Integer, primary_key=True, autoincrement=True) @@ -457,12 +517,19 @@ class OpenAlexAuthor(IngestedAtMixin, Base): works_count = Column(Integer) raw_data = Column(Text) - works = relationship("OpenAlexWork", secondary=openalex_work_authors, back_populates="authors") - institutions = relationship("OpenAlexInstitution", secondary=openalex_author_institutions, back_populates="authors") + works = 
relationship( + 'OpenAlexWork', secondary=openalex_work_authors, back_populates='authors' + ) + institutions = relationship( + 'OpenAlexInstitution', + secondary=openalex_author_institutions, + back_populates='authors', + ) def __repr__(self): return f"" + class OpenAlexVenue(IngestedAtMixin, Base): __tablename__ = 'openalex_venues' id = Column(Integer, primary_key=True, autoincrement=True) @@ -472,11 +539,12 @@ class OpenAlexVenue(IngestedAtMixin, Base): url = Column(String) raw_data = Column(Text) - works = relationship("OpenAlexWork", back_populates="venue") + works = relationship('OpenAlexWork', back_populates='venue') def __repr__(self): return f"" + class OpenAlexInstitution(IngestedAtMixin, Base): __tablename__ = 'openalex_institutions' id = Column(Integer, primary_key=True, autoincrement=True) @@ -486,11 +554,16 @@ class OpenAlexInstitution(IngestedAtMixin, Base): url = Column(String) raw_data = Column(Text) - authors = relationship("OpenAlexAuthor", secondary=openalex_author_institutions, back_populates="institutions") + authors = relationship( + 'OpenAlexAuthor', + secondary=openalex_author_institutions, + back_populates='institutions', + ) def __repr__(self): return f"" + class OpenAlexTopic(IngestedAtMixin, Base): __tablename__ = 'openalex_topics' id = Column(Integer, primary_key=True, autoincrement=True) @@ -507,16 +580,20 @@ class OpenAlexTopic(IngestedAtMixin, Base): works_count = Column(Integer) keywords = Column(Text) # Comma-separated keywords raw_data = Column(Text) - - works = relationship("OpenAlexWork", secondary=openalex_work_topics, back_populates="topics") + + works = relationship( + 'OpenAlexWork', secondary=openalex_work_topics, back_populates='topics' + ) def __repr__(self): return f"" - + + class RepositoryInstitutionAnalysis(IngestedAtMixin, Base): """Stores results from running Association Confidence Filters on repositories.""" + __tablename__ = 'repository_institution_analyses' - + id = Column(Integer, primary_key=True, 
autoincrement=True) repository_id = Column(Integer, ForeignKey('repositories.id'), nullable=False) institution_name = Column(String, nullable=False, index=True) @@ -524,35 +601,43 @@ class RepositoryInstitutionAnalysis(IngestedAtMixin, Base): confidence_score = Column(Float, nullable=False) evidence = Column(Text) # JSON string created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) - keywords_used = Column(Text) # Comma-separated list of keywords that led to this repository - + keywords_used = Column( + Text + ) # Comma-separated list of keywords that led to this repository + # Relationships - repository = relationship("Repository", backref="institution_analyses") - + repository = relationship('Repository', backref='institution_analyses') + def __repr__(self): return f"" + class AnalysisSession(Base): """Tracks a complete institution analysis session.""" + __tablename__ = 'analysis_sessions' - + id = Column(Integer, primary_key=True, autoincrement=True) session_id = Column(String, unique=True, nullable=False) # UUID for the session institution_name = Column(String, nullable=False) analysis_type = Column(String, nullable=False) # 'repository' or 'people' created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) last_updated = Column(DateTime, default=lambda: datetime.now(timezone.utc)) - status = Column(String, default='initiated') # 'initiated', 'surfacing', 'acf', 'analysis', 'completed' + status = Column( + String, default='initiated' + ) # 'initiated', 'surfacing', 'acf', 'analysis', 'completed' parameters = Column(Text) # JSON string of parameters used - + # Relationships - surfacing_results = relationship("SurfacingResult", back_populates="session") - acf_results = relationship("ACFResult", back_populates="session") + surfacing_results = relationship('SurfacingResult', back_populates='session') + acf_results = relationship('ACFResult', back_populates='session') + class SurfacingResult(Base): """Stores results of a surfacing 
operation.""" + __tablename__ = 'surfacing_results' - + id = Column(Integer, primary_key=True, autoincrement=True) session_id = Column(Integer, ForeignKey('analysis_sessions.id'), nullable=False) algorithm = Column(String, nullable=False) @@ -560,50 +645,58 @@ class SurfacingResult(Base): run_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) result_count = Column(Integer, default=0) result_summary = Column(Text) # JSON string summary of results - + # Relationships - session = relationship("AnalysisSession", back_populates="surfacing_results") - repositories = relationship("SurfacedRepository", back_populates="surfacing_result") - people = relationship("SurfacedPerson", back_populates="surfacing_result") + session = relationship('AnalysisSession', back_populates='surfacing_results') + repositories = relationship('SurfacedRepository', back_populates='surfacing_result') + people = relationship('SurfacedPerson', back_populates='surfacing_result') + class SurfacedRepository(Base): """A repository surfaced during institution analysis.""" + __tablename__ = 'surfaced_repositories' - + id = Column(Integer, primary_key=True, autoincrement=True) surfacing_id = Column(Integer, ForeignKey('surfacing_results.id'), nullable=False) repository_id = Column(Integer, ForeignKey('repositories.id'), nullable=False) discovery_method = Column(String, nullable=False) discovery_details = Column(Text) surface_score = Column(Float, default=0.0) # Initial relevance score - + # Relationships - surfacing_result = relationship("SurfacingResult", back_populates="repositories") - repository = relationship("Repository") + surfacing_result = relationship('SurfacingResult', back_populates='repositories') + repository = relationship('Repository') + class SurfacedPerson(Base): """A person surfaced during institution analysis.""" + __tablename__ = 'surfaced_people' - + id = Column(Integer, primary_key=True, autoincrement=True) surfacing_id = Column(Integer, 
ForeignKey('surfacing_results.id'), nullable=False) user_id = Column(Integer, ForeignKey('users.id'), nullable=True) - openalex_author_id = Column(Integer, ForeignKey('openalex_authors.id'), nullable=True) + openalex_author_id = Column( + Integer, ForeignKey('openalex_authors.id'), nullable=True + ) name = Column(String) email = Column(String) discovery_method = Column(String, nullable=False) discovery_details = Column(Text) surface_score = Column(Float, default=0.0) # Initial relevance score - + # Relationships - surfacing_result = relationship("SurfacingResult", back_populates="people") - user = relationship("User") - openalex_author = relationship("OpenAlexAuthor") + surfacing_result = relationship('SurfacingResult', back_populates='people') + user = relationship('User') + openalex_author = relationship('OpenAlexAuthor') + class ACFResult(Base): """Stores results of an ACF operation.""" + __tablename__ = 'acf_results' - + id = Column(Integer, primary_key=True, autoincrement=True) session_id = Column(Integer, ForeignKey('analysis_sessions.id'), nullable=False) surfacing_id = Column(Integer, ForeignKey('surfacing_results.id'), nullable=False) @@ -611,37 +704,45 @@ class ACFResult(Base): run_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) parameters = Column(Text) # JSON string of parameters used result_summary = Column(Text) # JSON string summary of results - + # Relationships - session = relationship("AnalysisSession", back_populates="acf_results") - surfacing_result = relationship("SurfacingResult") - repository_results = relationship("ACFRepositoryResult", back_populates="acf_result") - people_results = relationship("ACFPersonResult", back_populates="acf_result") + session = relationship('AnalysisSession', back_populates='acf_results') + surfacing_result = relationship('SurfacingResult') + repository_results = relationship( + 'ACFRepositoryResult', back_populates='acf_result' + ) + people_results = relationship('ACFPersonResult', 
back_populates='acf_result') + class ACFRepositoryResult(Base): """ACF result for a specific repository.""" + __tablename__ = 'acf_repository_results' - + id = Column(Integer, primary_key=True, autoincrement=True) acf_id = Column(Integer, ForeignKey('acf_results.id'), nullable=False) repository_id = Column(Integer, ForeignKey('repositories.id'), nullable=False) confidence_score = Column(Float, default=0.0) evidence = Column(Text) # JSON string of evidence - + # Relationships - acf_result = relationship("ACFResult", back_populates="repository_results") - repository = relationship("Repository") + acf_result = relationship('ACFResult', back_populates='repository_results') + repository = relationship('Repository') + class ACFPersonResult(Base): """ACF result for a specific person.""" + __tablename__ = 'acf_person_results' - + id = Column(Integer, primary_key=True, autoincrement=True) acf_id = Column(Integer, ForeignKey('acf_results.id'), nullable=False) - surfaced_person_id = Column(Integer, ForeignKey('surfaced_people.id'), nullable=False) + surfaced_person_id = Column( + Integer, ForeignKey('surfaced_people.id'), nullable=False + ) confidence_score = Column(Float, default=0.0) evidence = Column(Text) # JSON string of evidence - + # Relationships - acf_result = relationship("ACFResult", back_populates="people_results") - surfaced_person = relationship("SurfacedPerson") \ No newline at end of file + acf_result = relationship('ACFResult', back_populates='people_results') + surfaced_person = relationship('SurfacedPerson') diff --git a/Older Experiments/scrappy-proof-of-concept/queries/acf_query.py b/Older Experiments/scrappy-proof-of-concept/queries/acf_query.py index 4b40cee..fe2b01a 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/acf_query.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/acf_query.py @@ -1,25 +1,25 @@ # queries/acf_query.py (Improved version) """ -User interface for applying Association Confidence Filters (ACFs) to +User 
interface for applying Association Confidence Filters (ACFs) to discover repositories associated with an institution. """ -import logging import json -from datetime import datetime -from typing import List, Dict, Any +import logging +from typing import Dict, List from db.database import get_db_session -from models.models import Repository, DiscoveryEvent, RepositoryInstitutionAnalysis +from models.models import Repository, RepositoryInstitutionAnalysis from services.acf_framework import ( + apply_filter, find_keyword_matches, - get_repositories_from_keywords, get_available_filters, - apply_filter + get_repositories_from_keywords, ) logger = logging.getLogger(__name__) + def check_existing_analysis_for_repos(repositories, institution_name): """ Check if analysis results already exist for these repositories and this institution. @@ -27,525 +27,648 @@ def check_existing_analysis_for_repos(repositories, institution_name): """ results = {} repo_ids = [repo.id for repo in repositories] - + with get_db_session() as session: # Find analysis records for these repositories and this institution - analyses = session.query(RepositoryInstitutionAnalysis).filter( - RepositoryInstitutionAnalysis.repository_id.in_(repo_ids), - RepositoryInstitutionAnalysis.institution_name == institution_name - ).all() - + analyses = ( + session.query(RepositoryInstitutionAnalysis) + .filter( + RepositoryInstitutionAnalysis.repository_id.in_(repo_ids), + RepositoryInstitutionAnalysis.institution_name == institution_name, + ) + .all() + ) + if not analyses: return None - + # Group by filter for analysis in analyses: filter_name = analysis.filter_name if filter_name not in results: results[filter_name] = { - "total": 0, - "high_confidence": 0, - "medium_confidence": 0, - "low_confidence": 0, - "last_run": analysis.created_at + 'total': 0, + 'high_confidence': 0, + 'medium_confidence': 0, + 'low_confidence': 0, + 'last_run': analysis.created_at, } - + # Update counts based on confidence score - 
results[filter_name]["total"] += 1 - + results[filter_name]['total'] += 1 + if analysis.confidence_score >= 0.7: - results[filter_name]["high_confidence"] += 1 + results[filter_name]['high_confidence'] += 1 elif analysis.confidence_score >= 0.4: - results[filter_name]["medium_confidence"] += 1 + results[filter_name]['medium_confidence'] += 1 elif analysis.confidence_score > 0: - results[filter_name]["low_confidence"] += 1 - + results[filter_name]['low_confidence'] += 1 + # Update last run date if more recent - if analysis.created_at > results[filter_name]["last_run"]: - results[filter_name]["last_run"] = analysis.created_at - + if analysis.created_at > results[filter_name]['last_run']: + results[filter_name]['last_run'] = analysis.created_at + return results + def display_evidence(evidence, filter_name): """Display evidence highlights based on filter type.""" - if filter_name == "Comprehensive Filter": + if filter_name == 'Comprehensive Filter': display_comprehensive_evidence(evidence) - elif filter_name == "Name Match Filter": + elif filter_name == 'Name Match Filter': if 'owner_name_match' in evidence: - print(f" - Owner name contains institution name: {evidence['owner_name_match']['match']}") + print( + f' - Owner name contains institution name: {evidence["owner_name_match"]["match"]}' + ) if 'repo_name_match' in evidence: - print(f" - Repository name contains institution name: {evidence['repo_name_match']['match']}") + print( + f' - Repository name contains institution name: {evidence["repo_name_match"]["match"]}' + ) if 'description_match' in evidence: - print(" - Repository description mentions institution name") + print(' - Repository description mentions institution name') if 'topic_match' in evidence: - print(f" - Repository topic contains institution name: {evidence['topic_match']['match']}") - elif filter_name == "Email Domain Filter": + print( + f' - Repository topic contains institution name: {evidence["topic_match"]["match"]}' + ) + elif filter_name 
== 'Email Domain Filter': if 'matching_contributors' in evidence: - print(f" - {evidence['matching_contributors']} of {evidence['total_contributors']} contributors have institution email domains") + print( + f' - {evidence["matching_contributors"]} of {evidence["total_contributors"]} contributors have institution email domains' + ) if 'matching_logins' in evidence: - print(f" - Matching contributors include: {', '.join(evidence['matching_logins'])}") - elif filter_name == "OpenAlex Affiliation Filter": + print( + f' - Matching contributors include: {", ".join(evidence["matching_logins"])}' + ) + elif filter_name == 'OpenAlex Affiliation Filter': if 'matching_works' in evidence: - print(f" - {evidence['matching_works']} of {evidence['total_works']} works linked to this repository have authors affiliated with the institution") + print( + f' - {evidence["matching_works"]} of {evidence["total_works"]} works linked to this repository have authors affiliated with the institution' + ) if 'work_details' in evidence: for i, work in enumerate(evidence['work_details'], 1): - print(f" - Paper {i}: {work['title']}") - print(f" Authors: {', '.join(work['authors'])}") - elif filter_name == "Combined Filter": + print(f' - Paper {i}: {work["title"]}') + print(f' Authors: {", ".join(work["authors"])}') + elif filter_name == 'Combined Filter': if 'component_scores' in evidence: - print(" - Combined from multiple filters:") + print(' - Combined from multiple filters:') for filter_name, score in evidence['component_scores'].items(): - print(f" • {filter_name}: {score:.2f}") + print(f' • {filter_name}: {score:.2f}') + def view_detailed_results_for_repos(repositories, institution_name): """ Show detailed analysis results for specific repositories and institution. 
""" repo_ids = [repo.id for repo in repositories] - + # Get available filters that have been used available_filters = {} - + with get_db_session() as session: - filters = session.query(RepositoryInstitutionAnalysis.filter_name).filter( - RepositoryInstitutionAnalysis.repository_id.in_(repo_ids), - RepositoryInstitutionAnalysis.institution_name == institution_name - ).distinct().all() - + filters = ( + session.query(RepositoryInstitutionAnalysis.filter_name) + .filter( + RepositoryInstitutionAnalysis.repository_id.in_(repo_ids), + RepositoryInstitutionAnalysis.institution_name == institution_name, + ) + .distinct() + .all() + ) + for i, (filter_name,) in enumerate(filters, 1): available_filters[str(i)] = filter_name - + if not available_filters: - print("No filters have been applied to these repositories for this institution.") + print( + 'No filters have been applied to these repositories for this institution.' + ) return - + # Select filter - print("\n=== Available Filters ===") + print('\n=== Available Filters ===') for num, name in available_filters.items(): - print(f"{num}) {name}") - - selection = input("\nSelect a filter to view results (number): ").strip() + print(f'{num}) {name}') + + selection = input('\nSelect a filter to view results (number): ').strip() if selection not in available_filters: - print("Invalid selection.") + print('Invalid selection.') return - + selected_filter = available_filters[selection] - + # Get minimum confidence threshold - min_confidence = input("\nMinimum confidence threshold (0.0-1.0, default=0.3): ").strip() or "0.3" + min_confidence = ( + input('\nMinimum confidence threshold (0.0-1.0, default=0.3): ').strip() + or '0.3' + ) try: min_confidence = float(min_confidence) min_confidence = max(0.0, min(1.0, min_confidence)) except ValueError: - print("Invalid threshold, using default 0.3") + print('Invalid threshold, using default 0.3') min_confidence = 0.3 - + # Query database for results with get_db_session() as session: - 
analysis_results = session.query( - RepositoryInstitutionAnalysis, - Repository - ).join( - Repository, - Repository.id == RepositoryInstitutionAnalysis.repository_id - ).filter( - RepositoryInstitutionAnalysis.repository_id.in_(repo_ids), - RepositoryInstitutionAnalysis.institution_name == institution_name, - RepositoryInstitutionAnalysis.filter_name == selected_filter, - RepositoryInstitutionAnalysis.confidence_score >= min_confidence - ).order_by( - RepositoryInstitutionAnalysis.confidence_score.desc() - ).all() - + analysis_results = ( + session.query(RepositoryInstitutionAnalysis, Repository) + .join( + Repository, Repository.id == RepositoryInstitutionAnalysis.repository_id + ) + .filter( + RepositoryInstitutionAnalysis.repository_id.in_(repo_ids), + RepositoryInstitutionAnalysis.institution_name == institution_name, + RepositoryInstitutionAnalysis.filter_name == selected_filter, + RepositoryInstitutionAnalysis.confidence_score >= min_confidence, + ) + .order_by(RepositoryInstitutionAnalysis.confidence_score.desc()) + .all() + ) + # Display results if not analysis_results: - print(f"\nNo repositories met the confidence threshold of {min_confidence}.") + print(f'\nNo repositories met the confidence threshold of {min_confidence}.') return - - print(f"\n=== Repositories Associated with {institution_name} ===") - print(f"Found {len(analysis_results)} repositories with confidence ≥ {min_confidence}") - print(f"Filter: {selected_filter}") - + + print(f'\n=== Repositories Associated with {institution_name} ===') + print( + f'Found {len(analysis_results)} repositories with confidence ≥ {min_confidence}' + ) + print(f'Filter: {selected_filter}') + # Display the results for i, (analysis, repo) in enumerate(analysis_results, 1): - confidence_level = "HIGH" if analysis.confidence_score >= 0.7 else "MEDIUM" if analysis.confidence_score >= 0.4 else "LOW" - - print(f"\n{i}) {repo.full_name}") - print(f" Confidence: {analysis.confidence_score:.2f} ({confidence_level})") - 
print(f" URL: {repo.html_url}") - print(f" Description: {repo.description or 'None'}") - + confidence_level = ( + 'HIGH' + if analysis.confidence_score >= 0.7 + else 'MEDIUM' + if analysis.confidence_score >= 0.4 + else 'LOW' + ) + + print(f'\n{i}) {repo.full_name}') + print(f' Confidence: {analysis.confidence_score:.2f} ({confidence_level})') + print(f' URL: {repo.html_url}') + print(f' Description: {repo.description or "None"}') + # Display evidence highlights if analysis.evidence: try: evidence = json.loads(analysis.evidence) - print(" Evidence:") + print(' Evidence:') display_evidence(evidence, selected_filter) except json.JSONDecodeError: - print(" Evidence: Unable to parse evidence data") - + print(' Evidence: Unable to parse evidence data') + # Allow the user to select repositories for further analysis - print("\nWould you like to analyze specific repositories?") + print('\nWould you like to analyze specific repositories?') analyze = input("Enter 'y' to select repositories for analysis: ").strip().lower() - + if analyze == 'y': - selected_indices = input("Enter repository numbers to analyze (comma-separated): ").strip() + selected_indices = input( + 'Enter repository numbers to analyze (comma-separated): ' + ).strip() try: - indices = [int(idx.strip()) for idx in selected_indices.split(",") if idx.strip()] + indices = [ + int(idx.strip()) for idx in selected_indices.split(',') if idx.strip() + ] selected_repos = [] - + for idx in indices: if 1 <= idx <= len(analysis_results): - selected_repos.append(analysis_results[idx-1][1]) # Get the Repository object + selected_repos.append( + analysis_results[idx - 1][1] + ) # Get the Repository object else: - print(f"Invalid repository number: {idx}") - + print(f'Invalid repository number: {idx}') + if selected_repos: analyze_repositories(selected_repos) except ValueError: - print("Invalid input. Please enter comma-separated numbers.") + print('Invalid input. 
Please enter comma-separated numbers.') + def print_keyword_status(keywords: List[str]): """Print which keywords have been used before and when.""" matches = find_keyword_matches(keywords) - - print("\n=== Keyword Status ===") - print(f"You provided {len(keywords)} keywords.") - + + print('\n=== Keyword Status ===') + print(f'You provided {len(keywords)} keywords.') + if not matches: - print("None of these keywords have been used for repository discovery yet.") + print('None of these keywords have been used for repository discovery yet.') return False - - print(f"{len(matches)} of these keywords have been used for repository discovery:") - + + print(f'{len(matches)} of these keywords have been used for repository discovery:') + for keyword, stats in matches.items(): - last_run = stats['last_run'].strftime("%Y-%m-%d %H:%M:%S") + last_run = stats['last_run'].strftime('%Y-%m-%d %H:%M:%S') repo_count = stats['repository_count'] - print(f"- '{keyword}': Last run on {last_run}, discovered {repo_count} repositories") - + print( + f"- '{keyword}': Last run on {last_run}, discovered {repo_count} repositories" + ) + return True + def display_comprehensive_evidence(evidence: Dict): """Format and display evidence from the Comprehensive Filter.""" # Check for direct ownership (100% confidence) if 'direct_ownership' in evidence: ownership = evidence['direct_ownership'] - print(f" ✓ DIRECT OWNERSHIP (100% confidence):") - print(f" Repository is owned by institutional GitHub organization: {ownership['owner']}") - print(f" This is a verified {ownership['owner_type']} of your institution") + print(' ✓ DIRECT OWNERSHIP (100% confidence):') + print( + f' Repository is owned by institutional GitHub organization: {ownership["owner"]}' + ) + print( + f' This is a verified {ownership["owner_type"]} of your institution' + ) return - + # Check for core contributors (high confidence) - if 'core_contributors' in evidence and evidence['core_contributors'].get('score', 0) >= 0.7: + if ( + 
'core_contributors' in evidence + and evidence['core_contributors'].get('score', 0) >= 0.7 + ): core_ev = evidence['core_contributors'] - print(f" ✓ HIGH CONFIDENCE: Core Contributor Analysis ({core_ev['score']:.2f})") - print(f" {core_ev['matching_core_contributors']} of {core_ev['total_core_contributors']} core contributors are affiliated with your institution") - + print( + f' ✓ HIGH CONFIDENCE: Core Contributor Analysis ({core_ev["score"]:.2f})' + ) + print( + f' {core_ev["matching_core_contributors"]} of {core_ev["total_core_contributors"]} core contributors are affiliated with your institution' + ) + if 'contributors' in core_ev and core_ev['contributors']: - print(" Key contributors:") + print(' Key contributors:') for contrib in core_ev['contributors'][:3]: matches = [] if 'evidence' in contrib: ev = contrib['evidence'] - if ev.get('company_match'): matches.append("company") - if ev.get('location_match'): matches.append("location") - if ev.get('email_domain_match'): matches.append("email domain") - - print(f" - {contrib['login']} (matches: {', '.join(matches)})") - + if ev.get('company_match'): + matches.append('company') + if ev.get('location_match'): + matches.append('location') + if ev.get('email_domain_match'): + matches.append('email domain') + + print(f' - {contrib["login"]} (matches: {", ".join(matches)})') + if len(core_ev['contributors']) > 3: - print(f" ...and {len(core_ev['contributors'])-3} more") + print(f' ...and {len(core_ev["contributors"]) - 3} more') return - + # Check for high confidence factors high_confidence_found = False - + if 'email_domains' in evidence and evidence['email_domains'].get('score', 0) >= 0.7: high_confidence_found = True email_ev = evidence['email_domains'] - print(f" ✓ HIGH CONFIDENCE: Institutional Email Domains ({email_ev['score']:.2f})") - print(f" {email_ev['matching_count']} of {email_ev['total_contributors']} contributors have institutional email domains") + print( + f' ✓ HIGH CONFIDENCE: Institutional Email 
Domains ({email_ev["score"]:.2f})' + ) + print( + f' {email_ev["matching_count"]} of {email_ev["total_contributors"]} contributors have institutional email domains' + ) if 'matching_examples' in email_ev and email_ev['matching_examples']: - print(f" Contributors include: {', '.join(email_ev['matching_examples'][:3])}") + print( + f' Contributors include: {", ".join(email_ev["matching_examples"][:3])}' + ) if len(email_ev['matching_examples']) > 3: - print(f" ...and {len(email_ev['matching_examples'])-3} more") - - if 'openalex_affiliations' in evidence and evidence['openalex_affiliations'].get('score', 0) >= 0.7: + print(f' ...and {len(email_ev["matching_examples"]) - 3} more') + + if ( + 'openalex_affiliations' in evidence + and evidence['openalex_affiliations'].get('score', 0) >= 0.7 + ): high_confidence_found = True openalex_ev = evidence['openalex_affiliations'] - print(f" ✓ HIGH CONFIDENCE: OpenAlex Affiliations ({openalex_ev['score']:.2f})") - print(f" {openalex_ev['matching_works']} of {openalex_ev['total_works']} papers have authors affiliated with your institution") + print( + f' ✓ HIGH CONFIDENCE: OpenAlex Affiliations ({openalex_ev["score"]:.2f})' + ) + print( + f' {openalex_ev["matching_works"]} of {openalex_ev["total_works"]} papers have authors affiliated with your institution' + ) if 'matching_authors' in openalex_ev and openalex_ev['matching_authors']: - print(f" Authors include: {', '.join(openalex_ev['matching_authors'][:3])}") + print( + f' Authors include: {", ".join(openalex_ev["matching_authors"][:3])}' + ) if len(openalex_ev['matching_authors']) > 3: - print(f" ...and {len(openalex_ev['matching_authors'])-3} more") - + print(f' ...and {len(openalex_ev["matching_authors"]) - 3} more') + if 'combined_high_confidence' in evidence: high_confidence_found = True combined = evidence['combined_high_confidence'] - print(f" ✓ HIGH CONFIDENCE: Combined Factors ({combined['combined_score']:.2f})") - + print( + f' ✓ HIGH CONFIDENCE: Combined Factors 
({combined["combined_score"]:.2f})' + ) + if 'core_contributor_score' in combined: - print(f" Core Contributors: {combined['core_contributor_score']:.2f}") - + print(f' Core Contributors: {combined["core_contributor_score"]:.2f}') + if 'email_score' in combined: - print(f" Email Domains: {combined['email_score']:.2f}") - + print(f' Email Domains: {combined["email_score"]:.2f}') + if 'openalex_score' in combined: - print(f" OpenAlex Affiliations: {combined['openalex_score']:.2f}") - + print(f' OpenAlex Affiliations: {combined["openalex_score"]:.2f}') + # Medium confidence factors if not high_confidence_found and 'naming_references' in evidence: naming_ev = evidence['naming_references'] - print(f" ✓ MEDIUM CONFIDENCE: Name References ({naming_ev['score']:.2f})") - + print(f' ✓ MEDIUM CONFIDENCE: Name References ({naming_ev["score"]:.2f})') + if 'name_match' in naming_ev: - print(f" Repository name contains institution name: {naming_ev['name_match']['text']}") + print( + f' Repository name contains institution name: {naming_ev["name_match"]["text"]}' + ) elif 'fullname_match' in naming_ev: - print(f" Repository full name contains institution name: {naming_ev['fullname_match']['text']}") + print( + f' Repository full name contains institution name: {naming_ev["fullname_match"]["text"]}' + ) if 'description_match' in naming_ev: - print(" Repository description mentions institution name") - + print(' Repository description mentions institution name') + # Lower confidence factors if 'topic_matches' in evidence: topics_ev = evidence['topic_matches'] - print(f" ✓ LOWER CONFIDENCE: Topic Matches ({topics_ev['score']:.2f})") + print(f' ✓ LOWER CONFIDENCE: Topic Matches ({topics_ev["score"]:.2f})') if 'matching_topics' in topics_ev: - print(f" Matching topics: {', '.join(topics_ev['matching_topics'])}") - + print(f' Matching topics: {", ".join(topics_ev["matching_topics"])}') + # Show other factors if they weren't already shown as high confidence if not high_confidence_found: 
if 'core_contributors' in evidence: core_ev = evidence['core_contributors'] - print(f" ✓ Core Contributor Matches ({core_ev['score']:.2f})") - print(f" {core_ev['matching_core_contributors']} of {core_ev['total_core_contributors']} core contributors") - + print(f' ✓ Core Contributor Matches ({core_ev["score"]:.2f})') + print( + f' {core_ev["matching_core_contributors"]} of {core_ev["total_core_contributors"]} core contributors' + ) + if 'email_domains' in evidence: email_ev = evidence['email_domains'] - print(f" ✓ Email Domain Matches ({email_ev['score']:.2f})") - print(f" {email_ev['matching_count']} of {email_ev['total_contributors']} contributors") - + print(f' ✓ Email Domain Matches ({email_ev["score"]:.2f})') + print( + f' {email_ev["matching_count"]} of {email_ev["total_contributors"]} contributors' + ) + if 'openalex_affiliations' in evidence: openalex_ev = evidence['openalex_affiliations'] - print(f" ✓ OpenAlex Affiliations ({openalex_ev['score']:.2f})") - print(f" {openalex_ev['matching_works']} of {openalex_ev['total_works']} papers") - + print(f' ✓ OpenAlex Affiliations ({openalex_ev["score"]:.2f})') + print( + f' {openalex_ev["matching_works"]} of {openalex_ev["total_works"]} papers' + ) + # Multi-factor bonus if 'multi_factor_bonus' in evidence and evidence['multi_factor_bonus']: - print(" ✓ Multiple confidence factors found (score bonus applied)") + print(' ✓ Multiple confidence factors found (score bonus applied)') + def institutional_repository_discovery(): """ Interactive interface for discovering repositories associated with an institution using Association Confidence Filters. 
""" - print("\n=== Institutional Repository Discovery ===") - print("This tool helps you find repositories associated with your institution.") - + print('\n=== Institutional Repository Discovery ===') + print('This tool helps you find repositories associated with your institution.') + # Step 1: Collect institution information institution_name = input("Institution name (e.g., 'Stanford University'): ").strip() if not institution_name: - print("Institution name cannot be empty.") + print('Institution name cannot be empty.') return - - institution_domains = input("Email domains (comma-separated, e.g., 'stanford.edu,cs.stanford.edu'): ").strip() - domains = [d.strip() for d in institution_domains.split(",") if d.strip()] - - github_orgs = input("GitHub organization names (comma-separated, e.g., 'stanford,StanfordVL'): ").strip() - org_list = [org.strip() for org in github_orgs.split(",") if org.strip()] - + + institution_domains = input( + "Email domains (comma-separated, e.g., 'stanford.edu,cs.stanford.edu'): " + ).strip() + domains = [d.strip() for d in institution_domains.split(',') if d.strip()] + + github_orgs = input( + "GitHub organization names (comma-separated, e.g., 'stanford,StanfordVL'): " + ).strip() + org_list = [org.strip() for org in github_orgs.split(',') if org.strip()] + # Step 2: Collect keywords associated with the institution - print("\nEnter keywords associated with your institution (one per line).") - print("These could include research areas, lab names, project identifiers, etc.") - print("Press Enter on an empty line when finished.") - + print('\nEnter keywords associated with your institution (one per line).') + print('These could include research areas, lab names, project identifiers, etc.') + print('Press Enter on an empty line when finished.') + keywords = [] while True: - keyword = input("> ").strip() + keyword = input('> ').strip() if not keyword: break keywords.append(keyword) - + if not keywords: - print("You must provide at least one 
keyword.") + print('You must provide at least one keyword.') return - + # Step 3: Check which keywords have been used before keywords_exist = print_keyword_status(keywords) if not keywords_exist: - print("\nYou need to first ingest repositories using these keywords.") - print("Please use option 2 from the main menu to search for repositories.") + print('\nYou need to first ingest repositories using these keywords.') + print('Please use option 2 from the main menu to search for repositories.') return - + # Step 4: Get repositories discovered with these keywords repositories = get_repositories_from_keywords(keywords) if not repositories: - print("\nNo repositories were found using these keywords.") + print('\nNo repositories were found using these keywords.') return - - print(f"\nFound {len(repositories)} repositories discovered using these keywords.") - + + print(f'\nFound {len(repositories)} repositories discovered using these keywords.') + # NEW: Check if these repositories have been analyzed for this institution - existing_analysis = check_existing_analysis_for_repos(repositories, institution_name) - + existing_analysis = check_existing_analysis_for_repos( + repositories, institution_name + ) + if existing_analysis: - print("\n=== Existing Analysis Results ===") - print(f"Found existing analysis results for {institution_name} and these repositories:") - + print('\n=== Existing Analysis Results ===') + print( + f'Found existing analysis results for {institution_name} and these repositories:' + ) + for filter_name, stats in existing_analysis.items(): - last_run = stats["last_run"].strftime("%Y-%m-%d %H:%M:%S") - print(f"\nFilter: {filter_name} (last run: {last_run})") - print(f" Total repositories analyzed: {stats['total']}") - print(f" High confidence (≥0.7): {stats['high_confidence']}") - print(f" Medium confidence (≥0.4): {stats['medium_confidence']}") - print(f" Low confidence (>0.0): {stats['low_confidence']}") - + last_run = 
stats['last_run'].strftime('%Y-%m-%d %H:%M:%S') + print(f'\nFilter: {filter_name} (last run: {last_run})') + print(f' Total repositories analyzed: {stats["total"]}') + print(f' High confidence (≥0.7): {stats["high_confidence"]}') + print(f' Medium confidence (≥0.4): {stats["medium_confidence"]}') + print(f' Low confidence (>0.0): {stats["low_confidence"]}') + # Ask if they want to view detailed results or run a new analysis - choice = input("\nDo you want to [v]iew detailed results or [r]un a new analysis? (v/r) ").strip().lower() - + choice = ( + input( + '\nDo you want to [v]iew detailed results or [r]un a new analysis? (v/r) ' + ) + .strip() + .lower() + ) + if choice == 'v': # View detailed results for a specific filter view_detailed_results_for_repos(repositories, institution_name) return # If 'r' or any other input, continue with new analysis - + # Step 5: Select and apply an Association Confidence Filter available_filters = get_available_filters() - - print("\n=== Available Association Confidence Filters ===") + + print('\n=== Available Association Confidence Filters ===') filter_names = list(available_filters.keys()) for i, name in enumerate(filter_names, 1): filter_obj = available_filters[name] - print(f"{i}) {name}") - print(f" {filter_obj.description}") - + print(f'{i}) {name}') + print(f' {filter_obj.description}') + try: - selection = int(input("\nSelect a filter to apply (number): ").strip()) + selection = int(input('\nSelect a filter to apply (number): ').strip()) if selection < 1 or selection > len(filter_names): - print("Invalid selection.") + print('Invalid selection.') return - + selected_filter = filter_names[selection - 1] except ValueError: - print("Please enter a valid number.") + print('Please enter a valid number.') return - + # Step 6: Apply the selected filter institution_info = { 'name': institution_name, 'domains': domains, - 'github_orgs': org_list + 'github_orgs': org_list, } - - print(f"\nApplying {selected_filter} to 
{len(repositories)} repositories...") + + print(f'\nApplying {selected_filter} to {len(repositories)} repositories...') filtered_results = apply_filter( - selected_filter, - repositories, + selected_filter, + repositories, institution_info, store_results=True, - keywords=keywords + keywords=keywords, ) - + if not filtered_results: - print("\nNo repositories met the confidence threshold for association with your institution.") + print( + '\nNo repositories met the confidence threshold for association with your institution.' + ) return - + # Step 7: Display the results - min_confidence = input("\nMinimum confidence threshold (0.0-1.0, default=0.3): ").strip() or "0.3" + min_confidence = ( + input('\nMinimum confidence threshold (0.0-1.0, default=0.3): ').strip() + or '0.3' + ) try: min_confidence = float(min_confidence) min_confidence = max(0.0, min(1.0, min_confidence)) except ValueError: - print("Invalid threshold, using default 0.3") + print('Invalid threshold, using default 0.3') min_confidence = 0.3 - + # Filter by confidence threshold high_confidence_results = [r for r in filtered_results if r[1] >= min_confidence] - + if not high_confidence_results: - print(f"\nNo repositories met the confidence threshold of {min_confidence}.") + print(f'\nNo repositories met the confidence threshold of {min_confidence}.') return - - print(f"\n=== Repositories Associated with {institution_name} ===") - print(f"Found {len(high_confidence_results)} repositories with confidence ≥ {min_confidence}") - print(f"Analysis results have been stored in the database for historical tracking.") - + + print(f'\n=== Repositories Associated with {institution_name} ===') + print( + f'Found {len(high_confidence_results)} repositories with confidence ≥ {min_confidence}' + ) + print('Analysis results have been stored in the database for historical tracking.') + # Display the high confidence results for i, (repo, confidence, evidence) in enumerate(high_confidence_results, 1): - confidence_level = 
"HIGH" if confidence >= 0.7 else "MEDIUM" if confidence >= 0.4 else "LOW" - - print(f"\n{i}) {repo.full_name}") - print(f" Confidence: {confidence:.2f} ({confidence_level})") - print(f" URL: {repo.html_url}") - print(f" Description: {repo.description or 'None'}") - + confidence_level = ( + 'HIGH' if confidence >= 0.7 else 'MEDIUM' if confidence >= 0.4 else 'LOW' + ) + + print(f'\n{i}) {repo.full_name}') + print(f' Confidence: {confidence:.2f} ({confidence_level})') + print(f' URL: {repo.html_url}') + print(f' Description: {repo.description or "None"}') + # Display evidence highlights based on filter type - print(" Evidence:") + print(' Evidence:') display_evidence(evidence, selected_filter) - + # Step 8: Allow the user to select repositories for further analysis - print("\nWould you like to analyze specific repositories?") + print('\nWould you like to analyze specific repositories?') analyze = input("Enter 'y' to select repositories for analysis: ").strip().lower() - + if analyze == 'y': - selected_indices = input("Enter repository numbers to analyze (comma-separated): ").strip() + selected_indices = input( + 'Enter repository numbers to analyze (comma-separated): ' + ).strip() try: - indices = [int(idx.strip()) for idx in selected_indices.split(",") if idx.strip()] + indices = [ + int(idx.strip()) for idx in selected_indices.split(',') if idx.strip() + ] selected_repos = [] - + for idx in indices: if 1 <= idx <= len(high_confidence_results): - selected_repos.append(high_confidence_results[idx-1][0]) + selected_repos.append(high_confidence_results[idx - 1][0]) else: - print(f"Invalid repository number: {idx}") - + print(f'Invalid repository number: {idx}') + if selected_repos: analyze_repositories(selected_repos) except ValueError: - print("Invalid input. Please enter comma-separated numbers.") + print('Invalid input. 
Please enter comma-separated numbers.') + def analyze_repositories(repositories: List[Repository]): """Allow the user to run analysis queries on selected repositories.""" if not repositories: return - - print(f"\n=== Repository Analysis ===") - print(f"Selected {len(repositories)} repositories for analysis:") - + + print('\n=== Repository Analysis ===') + print(f'Selected {len(repositories)} repositories for analysis:') + for i, repo in enumerate(repositories, 1): - print(f"{i}) {repo.full_name}") - - print("\nWhat type of analysis would you like to perform?") - print("1) Top contributors") - print("2) External contributors analysis") - print("3) Citation analysis (requires DOIs)") - - choice = input("Enter your choice (1-3): ").strip() - - if choice == "1": + print(f'{i}) {repo.full_name}') + + print('\nWhat type of analysis would you like to perform?') + print('1) Top contributors') + print('2) External contributors analysis') + print('3) Citation analysis (requires DOIs)') + + choice = input('Enter your choice (1-3): ').strip() + + if choice == '1': for repo in repositories: - print(f"\nAnalyzing top contributors for {repo.full_name}:") + print(f'\nAnalyzing top contributors for {repo.full_name}:') from queries import top10 + top10.main(repo.id) - - elif choice == "2": + + elif choice == '2': for repo in repositories: - print(f"\nAnalyzing external contributors for {repo.full_name}:") + print(f'\nAnalyzing external contributors for {repo.full_name}:') from queries import externalcontributors + externalcontributors.main(repo.id) - - elif choice == "3": + + elif choice == '3': for repo in repositories: if not repo.dois: - print(f"\n{repo.full_name} has no associated DOIs, skipping citation analysis.") + print( + f'\n{repo.full_name} has no associated DOIs, skipping citation analysis.' 
+ ) continue - - print(f"\nAnalyzing citations for {repo.full_name}:") + + print(f'\nAnalyzing citations for {repo.full_name}:') from queries import top_topics + top_topics.main(repo.id) + def main(): institutional_repository_discovery() -if __name__ == "__main__": - main() \ No newline at end of file + +if __name__ == '__main__': + main() diff --git a/Older Experiments/scrappy-proof-of-concept/queries/analysis_history.py b/Older Experiments/scrappy-proof-of-concept/queries/analysis_history.py index 7fb368c..219bb9d 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/analysis_history.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/analysis_history.py @@ -5,18 +5,21 @@ import json from datetime import datetime, timedelta -from sqlalchemy import desc, func + from db.database import get_db_session -from models.models import RepositoryInstitutionAnalysis, Repository +from models.models import Repository, RepositoryInstitutionAnalysis +from sqlalchemy import desc, func + def format_datetime(dt): """Format a datetime for display.""" - return dt.strftime("%Y-%m-%d %H:%M") + return dt.strftime('%Y-%m-%d %H:%M') + def view_analysis_history(institution_name=None, days=30, min_score=0.0, limit=50): """ Display historical analysis results for a specific institution or all institutions. 
- + Args: institution_name: Filter by institution name (None for all) days: Number of days to look back @@ -25,139 +28,193 @@ def view_analysis_history(institution_name=None, days=30, min_score=0.0, limit=5 """ with get_db_session() as session: # Build query - query = session.query( - RepositoryInstitutionAnalysis, - Repository - ).join( - Repository, - Repository.id == RepositoryInstitutionAnalysis.repository_id - ).filter( - RepositoryInstitutionAnalysis.confidence_score >= min_score + query = ( + session.query(RepositoryInstitutionAnalysis, Repository) + .join( + Repository, Repository.id == RepositoryInstitutionAnalysis.repository_id + ) + .filter(RepositoryInstitutionAnalysis.confidence_score >= min_score) ) - + # Apply date filter if days > 0: cutoff_date = datetime.now() - timedelta(days=days) - query = query.filter(RepositoryInstitutionAnalysis.created_at >= cutoff_date) - + query = query.filter( + RepositoryInstitutionAnalysis.created_at >= cutoff_date + ) + # Apply institution filter if provided if institution_name: - query = query.filter(RepositoryInstitutionAnalysis.institution_name == institution_name) - + query = query.filter( + RepositoryInstitutionAnalysis.institution_name == institution_name + ) + # Get results ordered by most recent first - results = query.order_by( - desc(RepositoryInstitutionAnalysis.created_at) - ).limit(limit).all() - + results = ( + query.order_by(desc(RepositoryInstitutionAnalysis.created_at)) + .limit(limit) + .all() + ) + # Display results - print(f"\n=== Repository-Institution Analysis History ===") + print('\n=== Repository-Institution Analysis History ===') if institution_name: - print(f"Institution: {institution_name}") + print(f'Institution: {institution_name}') else: - print("All Institutions") - - print(f"Time range: Past {days} days (minimum score: {min_score})") - print(f"Found {len(results)} analysis results\n") - + print('All Institutions') + + print(f'Time range: Past {days} days (minimum score: {min_score})') 
+ print(f'Found {len(results)} analysis results\n') + for analysis, repo in results: - score_color = "\033[92m" if analysis.confidence_score >= 0.7 else \ - "\033[93m" if analysis.confidence_score >= 0.4 else "\033[0m" - - print(f"Date: {format_datetime(analysis.created_at)}") - print(f"Repository: {repo.full_name}") - print(f"Institution: {analysis.institution_name}") - print(f"Filter: {analysis.filter_name}") - print(f"Confidence: {score_color}{analysis.confidence_score:.2f}\033[0m") - + score_color = ( + '\033[92m' + if analysis.confidence_score >= 0.7 + else '\033[93m' + if analysis.confidence_score >= 0.4 + else '\033[0m' + ) + + print(f'Date: {format_datetime(analysis.created_at)}') + print(f'Repository: {repo.full_name}') + print(f'Institution: {analysis.institution_name}') + print(f'Filter: {analysis.filter_name}') + print(f'Confidence: {score_color}{analysis.confidence_score:.2f}\033[0m') + if analysis.keywords_used: - print(f"Keywords: {analysis.keywords_used}") - + print(f'Keywords: {analysis.keywords_used}') + # Display comprehensive evidence summary if analysis.evidence: try: evidence = json.loads(analysis.evidence) - print("Evidence Summary:") - + print('Evidence Summary:') + # Direct ownership (highest confidence) if 'direct_ownership' in evidence: owner_info = evidence['direct_ownership'] - print(f" - Direct ownership match (100% confidence): {owner_info.get('owner', 'Unknown')}") - + print( + f' - Direct ownership match (100% confidence): {owner_info.get("owner", "Unknown")}' + ) + # Core contributors (high confidence) if 'core_contributors' in evidence: core_ev = evidence['core_contributors'] - if 'matching_core_contributors' in core_ev and 'total_core_contributors' in core_ev: - print(f" - Core contributors: {core_ev['matching_core_contributors']}/{core_ev['total_core_contributors']} repository maintainers") + if ( + 'matching_core_contributors' in core_ev + and 'total_core_contributors' in core_ev + ): + print( + f' - Core contributors: 
{core_ev["matching_core_contributors"]}/{core_ev["total_core_contributors"]} repository maintainers' + ) if 'contributors' in core_ev and core_ev['contributors']: - print(f" Top contributor: {core_ev['contributors'][0]['login']}") - + print( + f' Top contributor: {core_ev["contributors"][0]["login"]}' + ) + # Combined high confidence factors if 'combined_high_confidence' in evidence: combined = evidence['combined_high_confidence'] - print(" - Multiple high-confidence factors combined:") + print(' - Multiple high-confidence factors combined:') if 'core_contributor_score' in combined: - print(f" • Core Contributors: {combined['core_contributor_score']:.2f}") + print( + f' • Core Contributors: {combined["core_contributor_score"]:.2f}' + ) if 'email_score' in combined: - print(f" • Email Domains: {combined['email_score']:.2f}") + print(f' • Email Domains: {combined["email_score"]:.2f}') if 'openalex_score' in combined: - print(f" • OpenAlex Affiliations: {combined['openalex_score']:.2f}") - + print( + f' • OpenAlex Affiliations: {combined["openalex_score"]:.2f}' + ) + # Email domains if 'email_domains' in evidence: email_ev = evidence['email_domains'] - if 'matching_count' in email_ev and 'total_contributors' in email_ev: - print(f" - Email domains: {email_ev['matching_count']}/{email_ev['total_contributors']} contributors") - if 'matching_examples' in email_ev and email_ev['matching_examples']: + if ( + 'matching_count' in email_ev + and 'total_contributors' in email_ev + ): + print( + f' - Email domains: {email_ev["matching_count"]}/{email_ev["total_contributors"]} contributors' + ) + if ( + 'matching_examples' in email_ev + and email_ev['matching_examples'] + ): examples = ', '.join(email_ev['matching_examples'][:2]) - print(f" Examples: {examples}") - + print(f' Examples: {examples}') + # OpenAlex affiliations if 'openalex_affiliations' in evidence: oa_ev = evidence['openalex_affiliations'] if 'matching_works' in oa_ev and 'total_works' in oa_ev: - print(f" - 
OpenAlex affiliations: {oa_ev['matching_works']}/{oa_ev['total_works']} works") - if 'matching_authors' in oa_ev and oa_ev['matching_authors']: + print( + f' - OpenAlex affiliations: {oa_ev["matching_works"]}/{oa_ev["total_works"]} works' + ) + if ( + 'matching_authors' in oa_ev + and oa_ev['matching_authors'] + ): authors = ', '.join(oa_ev['matching_authors'][:2]) - print(f" Authors: {authors}") - + print(f' Authors: {authors}') + # Name/description matches if 'naming_references' in evidence: naming_ev = evidence['naming_references'] - print(" - Name/description matches:") + print(' - Name/description matches:') if 'name_match' in naming_ev: - print(f" • Repository name: {naming_ev['name_match']['text']}") + print( + f' • Repository name: {naming_ev["name_match"]["text"]}' + ) elif 'fullname_match' in naming_ev: - print(f" • Repository full name: {naming_ev['fullname_match']['text']}") + print( + f' • Repository full name: {naming_ev["fullname_match"]["text"]}' + ) if 'description_match' in naming_ev: - print(" • Repository description contains institution name") - + print( + ' • Repository description contains institution name' + ) + # Topic matches if 'topic_matches' in evidence: topic_ev = evidence['topic_matches'] if 'matching_topics' in topic_ev: topics = ', '.join(topic_ev['matching_topics'][:3]) - print(f" - Topic matches: {topics}") - + print(f' - Topic matches: {topics}') + # Multi-factor bonus - if 'multi_factor_bonus' in evidence and evidence['multi_factor_bonus']: - print(" - Multiple confidence factors (score bonus applied)") - + if ( + 'multi_factor_bonus' in evidence + and evidence['multi_factor_bonus'] + ): + print(' - Multiple confidence factors (score bonus applied)') + # Check if no specific evidence was printed but we have a score - evidence_types = ['direct_ownership', 'core_contributors', 'combined_high_confidence', - 'email_domains', 'openalex_affiliations', 'naming_references', - 'topic_matches', 'multi_factor_bonus'] + evidence_types = [ + 
'direct_ownership', + 'core_contributors', + 'combined_high_confidence', + 'email_domains', + 'openalex_affiliations', + 'naming_references', + 'topic_matches', + 'multi_factor_bonus', + ] if not any(k in evidence for k in evidence_types): - print(" - Confidence score based on combination of repository attributes") - + print( + ' - Confidence score based on combination of repository attributes' + ) + except json.JSONDecodeError: - print(" - Evidence data could not be parsed") - - print("-" * 60) + print(' - Evidence data could not be parsed') + + print('-' * 60) + def view_institution_score_trends(institution_name, days=90, chart=False): """ View trends in confidence scores for a specific institution over time. - + Args: institution_name: Name of the institution to analyze days: Number of days to look back @@ -166,107 +223,115 @@ def view_institution_score_trends(institution_name, days=90, chart=False): with get_db_session() as session: # Filter date range cutoff_date = datetime.now() - timedelta(days=days) - + # Get average score per day - daily_scores = session.query( - func.date(RepositoryInstitutionAnalysis.created_at).label('date'), - func.avg(RepositoryInstitutionAnalysis.confidence_score).label('avg_score'), - func.count(RepositoryInstitutionAnalysis.id).label('count') - ).filter( - RepositoryInstitutionAnalysis.institution_name == institution_name, - RepositoryInstitutionAnalysis.created_at >= cutoff_date - ).group_by( - func.date(RepositoryInstitutionAnalysis.created_at) - ).order_by( - 'date' - ).all() - + daily_scores = ( + session.query( + func.date(RepositoryInstitutionAnalysis.created_at).label('date'), + func.avg(RepositoryInstitutionAnalysis.confidence_score).label( + 'avg_score' + ), + func.count(RepositoryInstitutionAnalysis.id).label('count'), + ) + .filter( + RepositoryInstitutionAnalysis.institution_name == institution_name, + RepositoryInstitutionAnalysis.created_at >= cutoff_date, + ) + 
.group_by(func.date(RepositoryInstitutionAnalysis.created_at)) + .order_by('date') + .all() + ) + # Display results - print(f"\n=== Confidence Score Trends for {institution_name} ===") - print(f"Time range: Past {days} days") - + print(f'\n=== Confidence Score Trends for {institution_name} ===') + print(f'Time range: Past {days} days') + if not daily_scores: - print("No analysis data found for this time period.") + print('No analysis data found for this time period.') return - - print("\nDaily Average Confidence Scores:") + + print('\nDaily Average Confidence Scores:') for date, avg_score, count in daily_scores: - print(f"{date}: {avg_score:.2f} (from {count} repositories)") - + print(f'{date}: {avg_score:.2f} (from {count} repositories)') + # Calculate overall statistics avg_scores = [score for _, score, _ in daily_scores] if avg_scores: overall_avg = sum(avg_scores) / len(avg_scores) - print(f"\nOverall average score: {overall_avg:.2f}") - + print(f'\nOverall average score: {overall_avg:.2f}') + # Trend analysis if len(avg_scores) >= 2: - first_week = avg_scores[:min(7, len(avg_scores))] - last_week = avg_scores[-min(7, len(avg_scores)):] - + first_week = avg_scores[: min(7, len(avg_scores))] + last_week = avg_scores[-min(7, len(avg_scores)) :] + first_week_avg = sum(first_week) / len(first_week) last_week_avg = sum(last_week) / len(last_week) - + if last_week_avg > first_week_avg: - print(f"Trend: Improving (+{(last_week_avg - first_week_avg):.2f})") + print(f'Trend: Improving (+{(last_week_avg - first_week_avg):.2f})') elif last_week_avg < first_week_avg: - print(f"Trend: Declining ({(last_week_avg - first_week_avg):.2f})") + print(f'Trend: Declining ({(last_week_avg - first_week_avg):.2f})') else: - print("Trend: Stable") + print('Trend: Stable') + def main(): """Interactive menu for analysis history queries.""" - print("\n=== Analysis History Queries ===") - print("1) View recent analysis results") - print("2) View analysis history for a specific 
institution") - print("3) View institution confidence score trends") - - choice = input("Enter your choice (1-3): ").strip() - - if choice == "1": - days = input("Number of days to look back (default: 30): ").strip() + print('\n=== Analysis History Queries ===') + print('1) View recent analysis results') + print('2) View analysis history for a specific institution') + print('3) View institution confidence score trends') + + choice = input('Enter your choice (1-3): ').strip() + + if choice == '1': + days = input('Number of days to look back (default: 30): ').strip() days = int(days) if days.isdigit() else 30 - - min_score = input("Minimum confidence score (0.0-1.0, default: 0.3): ").strip() + + min_score = input('Minimum confidence score (0.0-1.0, default: 0.3): ').strip() try: min_score = float(min_score) if min_score else 0.3 min_score = max(0.0, min(1.0, min_score)) except ValueError: min_score = 0.3 - + view_analysis_history(days=days, min_score=min_score) - - elif choice == "2": - institution = input("Institution name: ").strip() + + elif choice == '2': + institution = input('Institution name: ').strip() if not institution: - print("Institution name cannot be empty.") + print('Institution name cannot be empty.') return - - days = input("Number of days to look back (default: 30): ").strip() + + days = input('Number of days to look back (default: 30): ').strip() days = int(days) if days.isdigit() else 30 - - min_score = input("Minimum confidence score (0.0-1.0, default: 0.3): ").strip() + + min_score = input('Minimum confidence score (0.0-1.0, default: 0.3): ').strip() try: min_score = float(min_score) if min_score else 0.3 min_score = max(0.0, min(1.0, min_score)) except ValueError: min_score = 0.3 - - view_analysis_history(institution_name=institution, days=days, min_score=min_score) - - elif choice == "3": - institution = input("Institution name: ").strip() + + view_analysis_history( + institution_name=institution, days=days, min_score=min_score + ) + + elif 
choice == '3': + institution = input('Institution name: ').strip() if not institution: - print("Institution name cannot be empty.") + print('Institution name cannot be empty.') return - - days = input("Number of days to look back (default: 90): ").strip() + + days = input('Number of days to look back (default: 90): ').strip() days = int(days) if days.isdigit() else 90 - + view_institution_score_trends(institution, days=days) - + else: - print("Invalid choice.") + print('Invalid choice.') + -if __name__ == "__main__": - main() \ No newline at end of file +if __name__ == '__main__': + main() diff --git a/Older Experiments/scrappy-proof-of-concept/queries/citing_works.py b/Older Experiments/scrappy-proof-of-concept/queries/citing_works.py index 8ac6e1d..8859b55 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/citing_works.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/citing_works.py @@ -1,77 +1,108 @@ -from sqlalchemy.orm import joinedload from db.database import get_db_session -from models.models import Repository, OpenAlexWork +from models.models import OpenAlexWork, Repository +from sqlalchemy.orm import joinedload + def main(repo_id, doi_filter=None): with get_db_session() as session: - repo = session.query(Repository).options(joinedload(Repository.dois)).filter_by(id=repo_id).first() + repo = ( + session.query(Repository) + .options(joinedload(Repository.dois)) + .filter_by(id=repo_id) + .first() + ) if not repo: - print("Repository not found.") + print('Repository not found.') return if doi_filter: selected_doi = doi_filter else: if repo.dois: selected_doi = repo.dois[0].doi - print(f"No specific DOI selected; defaulting to first DOI: {selected_doi}") + print( + f'No specific DOI selected; defaulting to first DOI: {selected_doi}' + ) else: - print("No DOIs found for this repository.") + print('No DOIs found for this repository.') return - work = session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + work = ( + 
session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + ) if not work: - print(f"No OpenAlex work found with DOI: {selected_doi}") + print(f'No OpenAlex work found with DOI: {selected_doi}') return - print(f"\nInitiating Work: {work.title} (DB ID: {work.id})") - print(f"It is cited by {len(work.citing_works)} work(s).\n") + print(f'\nInitiating Work: {work.title} (DB ID: {work.id})') + print(f'It is cited by {len(work.citing_works)} work(s).\n') topics_count = {} subfields_count = {} fields_count = {} domains_count = {} for citing_work in work.citing_works: - print(f"Citing Work: {citing_work.title} (DB ID: {citing_work.id})") + print(f'Citing Work: {citing_work.title} (DB ID: {citing_work.id})') if citing_work.topics: for topic in citing_work.topics: - topic_name = topic.display_name if topic.display_name else "N/A" - subfield_name = topic.subfield_display_name if topic.subfield_display_name else "N/A" - field_name = topic.field_display_name if topic.field_display_name else "N/A" - domain_name = topic.domain_display_name if topic.domain_display_name else "N/A" - print(f" Topic: {topic_name}") - print(f" Domain: {domain_name}") - print(f" Field: {field_name}") - print(f" Subfield: {subfield_name}") + topic_name = topic.display_name if topic.display_name else 'N/A' + subfield_name = ( + topic.subfield_display_name + if topic.subfield_display_name + else 'N/A' + ) + field_name = ( + topic.field_display_name if topic.field_display_name else 'N/A' + ) + domain_name = ( + topic.domain_display_name + if topic.domain_display_name + else 'N/A' + ) + print(f' Topic: {topic_name}') + print(f' Domain: {domain_name}') + print(f' Field: {field_name}') + print(f' Subfield: {subfield_name}') topics_count[topic_name] = topics_count.get(topic_name, 0) + 1 - subfields_count[subfield_name] = subfields_count.get(subfield_name, 0) + 1 + subfields_count[subfield_name] = ( + subfields_count.get(subfield_name, 0) + 1 + ) fields_count[field_name] = 
fields_count.get(field_name, 0) + 1 domains_count[domain_name] = domains_count.get(domain_name, 0) + 1 else: - print(" Topics: None") - print("-" * 40) - print("\nAggregate Counts for Citing Works:") + print(' Topics: None') + print('-' * 40) + print('\nAggregate Counts for Citing Works:') if topics_count: - print("\nTopics:") - for topic, count in sorted(topics_count.items(), key=lambda x: x[1], reverse=True): - print(f" {topic}: {count}") + print('\nTopics:') + for topic, count in sorted( + topics_count.items(), key=lambda x: x[1], reverse=True + ): + print(f' {topic}: {count}') else: - print("\nNo topics found.") + print('\nNo topics found.') if subfields_count: - print("\nSubfields:") - for subfield, count in sorted(subfields_count.items(), key=lambda x: x[1], reverse=True): - print(f" {subfield}: {count}") + print('\nSubfields:') + for subfield, count in sorted( + subfields_count.items(), key=lambda x: x[1], reverse=True + ): + print(f' {subfield}: {count}') else: - print("\nNo subfields found.") + print('\nNo subfields found.') if fields_count: - print("\nFields:") - for field, count in sorted(fields_count.items(), key=lambda x: x[1], reverse=True): - print(f" {field}: {count}") + print('\nFields:') + for field, count in sorted( + fields_count.items(), key=lambda x: x[1], reverse=True + ): + print(f' {field}: {count}') else: - print("\nNo fields found.") + print('\nNo fields found.') if domains_count: - print("\nDomains:") - for domain, count in sorted(domains_count.items(), key=lambda x: x[1], reverse=True): - print(f" {domain}: {count}") + print('\nDomains:') + for domain, count in sorted( + domains_count.items(), key=lambda x: x[1], reverse=True + ): + print(f' {domain}: {count}') else: - print("\nNo domains found.") - print(f"It is cited by {len(work.citing_works)} work(s).\n") + print('\nNo domains found.') + print(f'It is cited by {len(work.citing_works)} work(s).\n') + if __name__ == '__main__': main() diff --git a/Older 
Experiments/scrappy-proof-of-concept/queries/externalcontributors.py b/Older Experiments/scrappy-proof-of-concept/queries/externalcontributors.py index 99ea337..5f3154f 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/externalcontributors.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/externalcontributors.py @@ -1,12 +1,20 @@ -from sqlalchemy import func -from models.models import User, Issue, IssueComment, PRReviewComment, PullRequest, Repository from db.database import get_db_session +from models.models import ( + Issue, + IssueComment, + PRReviewComment, + PullRequest, + Repository, + User, +) +from sqlalchemy import func from sqlalchemy.orm import Session + def get_engaged_non_pr_users(session: Session, repo_id: int): repo = session.query(Repository).filter_by(id=repo_id).first() if not repo: - print(f"Repository with id {repo_id} not found.") + print(f'Repository with id {repo_id} not found.') return [] engaged_users_subq = ( session.query(User.id) @@ -38,24 +46,23 @@ def get_engaged_non_pr_users(session: Session, repo_id: int): ) return users_never_pr + def main(repo_id): with get_db_session() as session: repo_obj = session.query(Repository).filter_by(id=repo_id).first() repo_name = repo_obj.full_name if repo_obj else str(repo_id) engaged_bystanders = get_engaged_non_pr_users(session, repo_id) - print(f"Users who engaged but never opened a PR for repository: {repo_name}") + print(f'Users who engaged but never opened a PR for repository: {repo_name}') for user in engaged_bystanders: - issue_count = session.query(func.count(Issue.id)).filter( - Issue.user_id == user.id, - Issue.repository_id == repo_id - ).scalar() + issue_count = ( + session.query(func.count(Issue.id)) + .filter(Issue.user_id == user.id, Issue.repository_id == repo_id) + .scalar() + ) issue_comment_count = ( session.query(func.count(IssueComment.id)) .join(Issue, IssueComment.issue_id == Issue.id) - .filter( - IssueComment.user_id == user.id, - Issue.repository_id 
== repo_id - ) + .filter(IssueComment.user_id == user.id, Issue.repository_id == repo_id) .scalar() ) pr_review_count = ( @@ -63,15 +70,16 @@ def main(repo_id): .join(PullRequest, PRReviewComment.pr_id == PullRequest.id) .filter( PRReviewComment.user_id == user.id, - PullRequest.repository_id == repo_id + PullRequest.repository_id == repo_id, ) .scalar() ) org_info = user.company if user.company else user.type print( - f"- {user.login} (User ID={user.id}), Issues={issue_count}, " - f"Comments={issue_comment_count}, PRReviews={pr_review_count}, Org={org_info}" + f'- {user.login} (User ID={user.id}), Issues={issue_count}, ' + f'Comments={issue_comment_count}, PRReviews={pr_review_count}, Org={org_info}' ) -if __name__ == "__main__": - print("This module is intended to be run from run_queries.py") + +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older Experiments/scrappy-proof-of-concept/queries/institution_analysis_query.py b/Older Experiments/scrappy-proof-of-concept/queries/institution_analysis_query.py index dea6639..502dce5 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/institution_analysis_query.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/institution_analysis_query.py @@ -6,342 +6,377 @@ import json import logging -import sys from datetime import datetime -from typing import Dict, List, Any, Optional, Union, Tuple +from typing import Any, Dict, List, Optional -from services.institution_analysis import InstitutionAnalysisManager -from services.institution_analysis_impl.surfacing import ( - KeywordRepositorySurfacing, DomainRepositorySurfacing, NameRepositorySurfacing, - DomainPeopleSurfacing, ProfilePeopleSurfacing, OpenAlexPeopleSurfacing -) -from services.institution_analysis_impl.person_acf import ( - EmailDomainPersonFilter, ProfilePersonFilter, OpenAlexPersonFilter, - CombinedPersonFilter +from db.database import get_db_session +from models.models import ( + 
ACFPersonResult, + ACFRepositoryResult, + ACFResult, + OpenAlexAuthor, + Repository, + SurfacedPerson, + SurfacedRepository, + SurfacingResult, + User, ) from services.acf_framework import ( - get_available_filters, get_filter_by_name, apply_filter + apply_filter, + get_available_filters, ) -from db.database import get_db_session -from models.models import ( - Repository, User, OpenAlexAuthor, AnalysisSession, SurfacingResult, - SurfacedRepository, SurfacedPerson, ACFResult, ACFRepositoryResult, - ACFPersonResult +from services.institution_analysis import InstitutionAnalysisManager +from services.institution_analysis_impl.person_acf import ( + CombinedPersonFilter, + EmailDomainPersonFilter, + OpenAlexPersonFilter, + ProfilePersonFilter, +) +from services.institution_analysis_impl.surfacing import ( + DomainPeopleSurfacing, + DomainRepositorySurfacing, + KeywordRepositorySurfacing, + NameRepositorySurfacing, + OpenAlexPeopleSurfacing, + ProfilePeopleSurfacing, ) logger = logging.getLogger(__name__) -def get_available_surfacing_algorithms(analysis_type: str = "repository") -> Dict[str, Any]: + +def get_available_surfacing_algorithms( + analysis_type: str = 'repository', +) -> Dict[str, Any]: """ Get available surfacing algorithms for the given analysis type. 
- + Args: analysis_type: Either "repository" or "people" - + Returns: Dictionary mapping algorithm keys to objects """ - if analysis_type == "repository": + if analysis_type == 'repository': return { - "1": KeywordRepositorySurfacing(), - "2": DomainRepositorySurfacing(), - "3": NameRepositorySurfacing() + '1': KeywordRepositorySurfacing(), + '2': DomainRepositorySurfacing(), + '3': NameRepositorySurfacing(), } else: # people return { - "1": DomainPeopleSurfacing(), - "2": ProfilePeopleSurfacing(), - "3": OpenAlexPeopleSurfacing() + '1': DomainPeopleSurfacing(), + '2': ProfilePeopleSurfacing(), + '3': OpenAlexPeopleSurfacing(), } + def get_available_person_filters() -> Dict[str, Any]: """ Get available person ACF filters. - + Returns: Dictionary mapping filter keys to objects """ return { - "1": EmailDomainPersonFilter(), - "2": ProfilePersonFilter(), - "3": OpenAlexPersonFilter(), - "4": CombinedPersonFilter() + '1': EmailDomainPersonFilter(), + '2': ProfilePersonFilter(), + '3': OpenAlexPersonFilter(), + '4': CombinedPersonFilter(), } + def print_institution_analysis_menu(): """Print the main institution analysis menu.""" - print("\n=== Institution Analysis Menu ===") - print("1) Repository Analysis") - print("2) People Analysis") - print("3) Return to Main Menu") + print('\n=== Institution Analysis Menu ===') + print('1) Repository Analysis') + print('2) People Analysis') + print('3) Return to Main Menu') + def collect_institution_info() -> Dict[str, Any]: """ Collect institution information from the user. 
- + Returns: Dictionary with institution data """ - print("\n=== Institution Information ===") + print('\n=== Institution Information ===') institution_name = input("Institution name (e.g., 'Stanford University'): ").strip() if not institution_name: - print("Institution name cannot be empty.") + print('Institution name cannot be empty.') return {} - - institution_domains = input("Email domains (comma-separated, e.g., 'stanford.edu,cs.stanford.edu'): ").strip() - domains = [d.strip() for d in institution_domains.split(",") if d.strip()] - - github_orgs = input("GitHub organization names (comma-separated, e.g., 'stanford,StanfordVL'): ").strip() - org_list = [org.strip() for org in github_orgs.split(",") if org.strip()] - - return { - "name": institution_name, - "domains": domains, - "github_orgs": org_list - } + + institution_domains = input( + "Email domains (comma-separated, e.g., 'stanford.edu,cs.stanford.edu'): " + ).strip() + domains = [d.strip() for d in institution_domains.split(',') if d.strip()] + + github_orgs = input( + "GitHub organization names (comma-separated, e.g., 'stanford,StanfordVL'): " + ).strip() + org_list = [org.strip() for org in github_orgs.split(',') if org.strip()] + + return {'name': institution_name, 'domains': domains, 'github_orgs': org_list} + def check_past_sessions(manager: InstitutionAnalysisManager) -> Optional[str]: """ Check for past analysis sessions and allow the user to choose one. 
- + Args: manager: The InstitutionAnalysisManager instance - + Returns: Session ID if a past session was chosen, None otherwise """ past_sessions = manager.get_past_sessions() - + if not past_sessions: - print("No past analyses found for this institution and analysis type.") + print('No past analyses found for this institution and analysis type.') return None - - print("\n=== Past Analyses ===") - print(f"Found {len(past_sessions)} past analyses for {manager.institution_name}:") - + + print('\n=== Past Analyses ===') + print(f'Found {len(past_sessions)} past analyses for {manager.institution_name}:') + for i, session in enumerate(past_sessions, 1): - status = session["status"].capitalize() - date = session["last_updated"].strftime("%Y-%m-%d %H:%M") - print(f"{i}) {date}: {status} (Surfacing: {session['surfacing_count']}, ACF: {session['acf_count']})") - - print("\nDo you want to:") - print("1) Continue with a past analysis") - print("2) Start a new analysis") - - choice = input("Enter your choice (1-2): ").strip() - - if choice == "1": - session_idx = input("Select a past analysis (number): ").strip() + status = session['status'].capitalize() + date = session['last_updated'].strftime('%Y-%m-%d %H:%M') + print( + f'{i}) {date}: {status} (Surfacing: {session["surfacing_count"]}, ACF: {session["acf_count"]})' + ) + + print('\nDo you want to:') + print('1) Continue with a past analysis') + print('2) Start a new analysis') + + choice = input('Enter your choice (1-2): ').strip() + + if choice == '1': + session_idx = input('Select a past analysis (number): ').strip() try: idx = int(session_idx) - 1 if 0 <= idx < len(past_sessions): - return past_sessions[idx]["session_id"] + return past_sessions[idx]['session_id'] else: - print("Invalid selection.") + print('Invalid selection.') except ValueError: - print("Invalid input.") - + print('Invalid input.') + return None + def repository_surfacing_phase(manager: InstitutionAnalysisManager) -> bool: """ Run the repository 
surfacing phase. - + Args: manager: The InstitutionAnalysisManager instance - + Returns: True if surfacing was successful, False otherwise """ - print("\n=== Repository Surfacing Phase ===") - manager.set_phase("surfacing") - + print('\n=== Repository Surfacing Phase ===') + manager.set_phase('surfacing') + # Check for past surfacing runs with get_db_session() as session: - past_runs = session.query(SurfacingResult).filter( - SurfacingResult.session_id == manager.db_session_id - ).order_by( - SurfacingResult.run_at.desc() - ).all() - + past_runs = ( + session.query(SurfacingResult) + .filter(SurfacingResult.session_id == manager.db_session_id) + .order_by(SurfacingResult.run_at.desc()) + .all() + ) + if past_runs: - print("\nPast surfacing runs for this session:") + print('\nPast surfacing runs for this session:') for i, run in enumerate(past_runs, 1): algorithm = run.algorithm - date = run.run_at.strftime("%Y-%m-%d %H:%M") + date = run.run_at.strftime('%Y-%m-%d %H:%M') count = run.result_count - print(f"{i}) {algorithm} ({date}): {count} repositories found") - - print("\nDo you want to:") - print("1) Use a past surfacing run") - print("2) Run a new surfacing algorithm") - - choice = input("Enter your choice (1-2): ").strip() - - if choice == "1": - run_idx = input("Select a surfacing run (number): ").strip() + print(f'{i}) {algorithm} ({date}): {count} repositories found') + + print('\nDo you want to:') + print('1) Use a past surfacing run') + print('2) Run a new surfacing algorithm') + + choice = input('Enter your choice (1-2): ').strip() + + if choice == '1': + run_idx = input('Select a surfacing run (number): ').strip() try: idx = int(run_idx) - 1 if 0 <= idx < len(past_runs): manager.surfacing_id = past_runs[idx].id - print(f"Using past surfacing run: {past_runs[idx].algorithm}") + print(f'Using past surfacing run: {past_runs[idx].algorithm}') return True else: - print("Invalid selection.") + print('Invalid selection.') except ValueError: - print("Invalid 
input.") - + print('Invalid input.') + # Get available surfacing algorithms - algorithms = get_available_surfacing_algorithms("repository") - - print("\n=== Available Surfacing Algorithms ===") + algorithms = get_available_surfacing_algorithms('repository') + + print('\n=== Available Surfacing Algorithms ===') for key, algorithm in algorithms.items(): - print(f"{key}) {algorithm.name}: {algorithm.description}") - - choice = input("\nSelect a surfacing algorithm (number): ").strip() - + print(f'{key}) {algorithm.name}: {algorithm.description}') + + choice = input('\nSelect a surfacing algorithm (number): ').strip() + if choice in algorithms: algorithm = algorithms[choice] - print(f"\nRunning {algorithm.name}...") - + print(f'\nRunning {algorithm.name}...') + # Collect algorithm-specific parameters parameters = {} - + if isinstance(algorithm, KeywordRepositorySurfacing): - print("\nEnter keywords associated with your institution (one per line).") - print("These could include research areas, lab names, project identifiers, etc.") - print("Press Enter on an empty line when finished.") - + print('\nEnter keywords associated with your institution (one per line).') + print( + 'These could include research areas, lab names, project identifiers, etc.' 
+ ) + print('Press Enter on an empty line when finished.') + keywords = [] while True: - keyword = input("> ").strip() + keyword = input('> ').strip() if not keyword: break keywords.append(keyword) - + if not keywords: - print("You must provide at least one keyword.") + print('You must provide at least one keyword.') return False - - parameters["keywords"] = keywords - - github_token = input("\nEnter GitHub token for searching (optional): ").strip() + + parameters['keywords'] = keywords + + github_token = input( + '\nEnter GitHub token for searching (optional): ' + ).strip() if github_token: - parameters["github_token"] = github_token - + parameters['github_token'] = github_token + # Run the algorithm try: surfacing_id = algorithm.run( - manager.db_session_id, - manager.institution_info, - parameters + manager.db_session_id, manager.institution_info, parameters ) - + manager.surfacing_id = surfacing_id - + # Show results with get_db_session() as session: - result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if result: - print(f"\nSurfacing complete. Found {result.result_count} repositories.") + print( + f'\nSurfacing complete. Found {result.result_count} repositories.' + ) return True except Exception as e: - logger.error(f"Error during surfacing: {e}") - print(f"Error during surfacing: {e}") + logger.error(f'Error during surfacing: {e}') + print(f'Error during surfacing: {e}') else: - print("Invalid selection.") - + print('Invalid selection.') + return False + def repository_acf_phase(manager: InstitutionAnalysisManager) -> bool: """ Run the repository ACF phase. - + Args: manager: The InstitutionAnalysisManager instance - + Returns: True if ACF was successful, False otherwise """ if not manager.surfacing_id: - print("No surfacing results available. Please complete the surfacing phase first.") + print( + 'No surfacing results available. 
Please complete the surfacing phase first.' + ) return False - - print("\n=== Repository ACF Phase ===") - manager.set_phase("acf") - + + print('\n=== Repository ACF Phase ===') + manager.set_phase('acf') + # Check for past ACF runs with get_db_session() as session: - past_runs = session.query(ACFResult).filter( - ACFResult.session_id == manager.db_session_id - ).order_by( - ACFResult.run_at.desc() - ).all() - + past_runs = ( + session.query(ACFResult) + .filter(ACFResult.session_id == manager.db_session_id) + .order_by(ACFResult.run_at.desc()) + .all() + ) + if past_runs: - print("\nPast ACF runs for this session:") + print('\nPast ACF runs for this session:') for i, run in enumerate(past_runs, 1): filter_name = run.filter_name - date = run.run_at.strftime("%Y-%m-%d %H:%M") - print(f"{i}) {filter_name} ({date})") - - print("\nDo you want to:") - print("1) Use a past ACF run") - print("2) Run a new ACF") - - choice = input("Enter your choice (1-2): ").strip() - - if choice == "1": - run_idx = input("Select an ACF run (number): ").strip() + date = run.run_at.strftime('%Y-%m-%d %H:%M') + print(f'{i}) {filter_name} ({date})') + + print('\nDo you want to:') + print('1) Use a past ACF run') + print('2) Run a new ACF') + + choice = input('Enter your choice (1-2): ').strip() + + if choice == '1': + run_idx = input('Select an ACF run (number): ').strip() try: idx = int(run_idx) - 1 if 0 <= idx < len(past_runs): manager.acf_id = past_runs[idx].id - print(f"Using past ACF run: {past_runs[idx].filter_name}") + print(f'Using past ACF run: {past_runs[idx].filter_name}') return True else: - print("Invalid selection.") + print('Invalid selection.') except ValueError: - print("Invalid input.") - + print('Invalid input.') + # Get available ACF filters filters = get_available_filters() - - print("\n=== Available Association Confidence Filters ===") + + print('\n=== Available Association Confidence Filters ===') filter_names = list(filters.keys()) for i, name in 
enumerate(filter_names, 1): filter_obj = filters[name] - print(f"{i}) {name}") - print(f" {filter_obj.description}") - + print(f'{i}) {name}') + print(f' {filter_obj.description}') + try: - selection = int(input("\nSelect a filter to apply (number): ").strip()) + selection = int(input('\nSelect a filter to apply (number): ').strip()) if selection < 1 or selection > len(filter_names): - print("Invalid selection.") + print('Invalid selection.') return False - + selected_filter = filter_names[selection - 1] except ValueError: - print("Please enter a valid number.") + print('Please enter a valid number.') return False - + # Get repositories from surfacing with get_db_session() as session: - surfaced_repos = session.query(SurfacedRepository).filter( - SurfacedRepository.surfacing_id == manager.surfacing_id - ).all() - + surfaced_repos = ( + session.query(SurfacedRepository) + .filter(SurfacedRepository.surfacing_id == manager.surfacing_id) + .all() + ) + if not surfaced_repos: - print("No repositories found from surfacing. Cannot apply ACF.") + print('No repositories found from surfacing. 
Cannot apply ACF.') return False - + repo_ids = [sr.repository_id for sr in surfaced_repos] - repositories = session.query(Repository).filter( - Repository.id.in_(repo_ids) - ).all() - + repositories = ( + session.query(Repository).filter(Repository.id.in_(repo_ids)).all() + ) + # Apply the selected filter - print(f"\nApplying {selected_filter} to {len(repositories)} repositories...") + print(f'\nApplying {selected_filter} to {len(repositories)} repositories...') try: # Create a new ACF result record with get_db_session() as session: @@ -350,20 +385,20 @@ def repository_acf_phase(manager: InstitutionAnalysisManager) -> bool: surfacing_id=manager.surfacing_id, filter_name=selected_filter, run_at=datetime.now(), - parameters=json.dumps(manager.institution_info) + parameters=json.dumps(manager.institution_info), ) session.add(acf_result) session.commit() acf_id = acf_result.id - + # Apply the filter filtered_results = apply_filter( selected_filter, repositories, manager.institution_info, - store_results=False # We'll store our own results + store_results=False, # We'll store our own results ) - + # Store the results with get_db_session() as session: for repo, confidence, evidence in filtered_results: @@ -371,380 +406,435 @@ def repository_acf_phase(manager: InstitutionAnalysisManager) -> bool: acf_id=acf_id, repository_id=repo.id, confidence_score=confidence, - evidence=json.dumps(evidence) + evidence=json.dumps(evidence), ) session.add(result) - + # Update the ACF result summary acf_result = session.query(ACFResult).filter_by(id=acf_id).first() if acf_result: result_count = len(filtered_results) - acf_result.result_summary = json.dumps({ - "count": result_count, - "high_confidence": len([r for r, c, _ in filtered_results if c >= 0.7]), - "medium_confidence": len([r for r, c, _ in filtered_results if 0.4 <= c < 0.7]), - "low_confidence": len([r for r, c, _ in filtered_results if c < 0.4]) - }) - + acf_result.result_summary = json.dumps( + { + 'count': result_count, + 
'high_confidence': len( + [r for r, c, _ in filtered_results if c >= 0.7] + ), + 'medium_confidence': len( + [r for r, c, _ in filtered_results if 0.4 <= c < 0.7] + ), + 'low_confidence': len( + [r for r, c, _ in filtered_results if c < 0.4] + ), + } + ) + manager.acf_id = acf_id - print(f"\nACF complete. Found {len(filtered_results)} repositories with confidence scores.") + print( + f'\nACF complete. Found {len(filtered_results)} repositories with confidence scores.' + ) return True except Exception as e: - logger.error(f"Error during ACF: {e}") - print(f"Error during ACF: {e}") - + logger.error(f'Error during ACF: {e}') + print(f'Error during ACF: {e}') + return False + def repository_analysis_phase(manager: InstitutionAnalysisManager) -> bool: """ Run the repository analysis phase. - + Args: manager: The InstitutionAnalysisManager instance - + Returns: True if analysis was successful, False otherwise """ if not manager.acf_id: - print("No ACF results available. Please complete the ACF phase first.") + print('No ACF results available. Please complete the ACF phase first.') return False - - print("\n=== Repository Analysis Phase ===") - manager.set_phase("analysis") - + + print('\n=== Repository Analysis Phase ===') + manager.set_phase('analysis') + # Get ACF results with get_db_session() as session: - acf_results = session.query(ACFRepositoryResult).filter( - ACFRepositoryResult.acf_id == manager.acf_id - ).order_by( - ACFRepositoryResult.confidence_score.desc() - ).all() - + acf_results = ( + session.query(ACFRepositoryResult) + .filter(ACFRepositoryResult.acf_id == manager.acf_id) + .order_by(ACFRepositoryResult.confidence_score.desc()) + .all() + ) + if not acf_results: - print("No repository ACF results found. Cannot perform analysis.") + print('No repository ACF results found. 
Cannot perform analysis.') return False - + # Ask for confidence threshold - min_confidence = input("\nMinimum confidence threshold (0.0-1.0, default=0.5): ").strip() or "0.5" + min_confidence = ( + input('\nMinimum confidence threshold (0.0-1.0, default=0.5): ').strip() + or '0.5' + ) try: min_confidence = float(min_confidence) min_confidence = max(0.0, min(1.0, min_confidence)) except ValueError: - print("Invalid threshold, using default 0.5") + print('Invalid threshold, using default 0.5') min_confidence = 0.5 - + # Filter by confidence threshold with get_db_session() as session: - filtered_results = session.query(ACFRepositoryResult, Repository).join( - Repository, Repository.id == ACFRepositoryResult.repository_id - ).filter( - ACFRepositoryResult.acf_id == manager.acf_id, - ACFRepositoryResult.confidence_score >= min_confidence - ).order_by( - ACFRepositoryResult.confidence_score.desc() - ).all() - + filtered_results = ( + session.query(ACFRepositoryResult, Repository) + .join(Repository, Repository.id == ACFRepositoryResult.repository_id) + .filter( + ACFRepositoryResult.acf_id == manager.acf_id, + ACFRepositoryResult.confidence_score >= min_confidence, + ) + .order_by(ACFRepositoryResult.confidence_score.desc()) + .all() + ) + if not filtered_results: - print(f"No repositories meet the confidence threshold of {min_confidence}.") + print(f'No repositories meet the confidence threshold of {min_confidence}.') return False - + # Display the results - print(f"\n=== Repositories Associated with {manager.institution_name} ===") - print(f"Found {len(filtered_results)} repositories with confidence ≥ {min_confidence}") - + print(f'\n=== Repositories Associated with {manager.institution_name} ===') + print( + f'Found {len(filtered_results)} repositories with confidence ≥ {min_confidence}' + ) + for i, (result, repo) in enumerate(filtered_results, 1): - confidence_level = "HIGH" if result.confidence_score >= 0.7 else "MEDIUM" if result.confidence_score >= 0.4 else 
"LOW" - print(f"\n{i}) {repo.full_name}") - print(f" Confidence: {result.confidence_score:.2f} ({confidence_level})") - print(f" URL: {repo.html_url}") - print(f" Description: {repo.description or 'None'}") - + confidence_level = ( + 'HIGH' + if result.confidence_score >= 0.7 + else 'MEDIUM' + if result.confidence_score >= 0.4 + else 'LOW' + ) + print(f'\n{i}) {repo.full_name}') + print(f' Confidence: {result.confidence_score:.2f} ({confidence_level})') + print(f' URL: {repo.html_url}') + print(f' Description: {repo.description or "None"}') + # Display evidence highlights if result.evidence: try: evidence = json.loads(result.evidence) - print(" Evidence Highlights:") + print(' Evidence Highlights:') display_evidence(evidence) except json.JSONDecodeError: pass - + # Ask if the user wants to analyze specific repositories - print("\nWould you like to analyze specific repositories?") + print('\nWould you like to analyze specific repositories?') analyze = input("Enter 'y' to select repositories for analysis: ").strip().lower() - + if analyze == 'y': - selected_indices = input("Enter repository numbers to analyze (comma-separated): ").strip() + selected_indices = input( + 'Enter repository numbers to analyze (comma-separated): ' + ).strip() try: - indices = [int(idx.strip()) for idx in selected_indices.split(",") if idx.strip()] + indices = [ + int(idx.strip()) for idx in selected_indices.split(',') if idx.strip() + ] selected_repos = [] - + for idx in indices: if 1 <= idx <= len(filtered_results): - selected_repos.append(filtered_results[idx-1][1]) # Get the Repository object + selected_repos.append( + filtered_results[idx - 1][1] + ) # Get the Repository object else: - print(f"Invalid repository number: {idx}") - + print(f'Invalid repository number: {idx}') + if selected_repos: analyze_repositories(selected_repos) - manager.set_phase("completed") + manager.set_phase('completed') return True except ValueError: - print("Invalid input. 
Please enter comma-separated numbers.") - - manager.set_phase("completed") + print('Invalid input. Please enter comma-separated numbers.') + + manager.set_phase('completed') return True + def display_evidence(evidence: Dict): """ Format and display evidence from ACF results. - + Args: evidence: Evidence dictionary from ACF """ # Display direct ownership (highest confidence) if 'direct_ownership' in evidence: ownership = evidence['direct_ownership'] - print(f" ✓ DIRECT OWNERSHIP: Repository is owned by {ownership.get('owner', 'Unknown')}") + print( + f' ✓ DIRECT OWNERSHIP: Repository is owned by {ownership.get("owner", "Unknown")}' + ) return - + # Display email domain matches if 'email_domains' in evidence and 'matching_count' in evidence['email_domains']: email_ev = evidence['email_domains'] - print(f" ✓ Email domains: {email_ev['matching_count']}/{email_ev['total_contributors']} contributors") + print( + f' ✓ Email domains: {email_ev["matching_count"]}/{email_ev["total_contributors"]} contributors' + ) if 'matching_examples' in email_ev and email_ev['matching_examples']: - print(f" Examples: {', '.join(email_ev['matching_examples'][:3])}") - + print(f' Examples: {", ".join(email_ev["matching_examples"][:3])}') + # Display OpenAlex affiliations - if 'openalex_affiliations' in evidence and 'matching_works' in evidence['openalex_affiliations']: + if ( + 'openalex_affiliations' in evidence + and 'matching_works' in evidence['openalex_affiliations'] + ): oa_ev = evidence['openalex_affiliations'] - print(f" ✓ OpenAlex: {oa_ev['matching_works']}/{oa_ev['total_works']} works") + print( + f' ✓ OpenAlex: {oa_ev["matching_works"]}/{oa_ev["total_works"]} works' + ) if 'matching_authors' in oa_ev and oa_ev['matching_authors']: - print(f" Authors: {', '.join(oa_ev['matching_authors'][:3])}") - + print(f' Authors: {", ".join(oa_ev["matching_authors"][:3])}') + # Display name matches if 'naming_references' in evidence: naming_ev = evidence['naming_references'] if 'name_match' in 
naming_ev: - print(f" ✓ Name match: {naming_ev['name_match']['text']}") + print(f' ✓ Name match: {naming_ev["name_match"]["text"]}') elif 'fullname_match' in naming_ev: - print(f" ✓ Full name match: {naming_ev['fullname_match']['text']}") + print(f' ✓ Full name match: {naming_ev["fullname_match"]["text"]}') if 'description_match' in naming_ev: - print(" ✓ Description mentions institution") - + print(' ✓ Description mentions institution') + # Display combined scores if 'component_scores' in evidence: - print(" ✓ Combined from multiple factors:") + print(' ✓ Combined from multiple factors:') for filter_name, score in evidence['component_scores'].items(): - print(f" • {filter_name}: {score:.2f}") + print(f' • {filter_name}: {score:.2f}') + def analyze_repositories(repositories: List[Repository]): """ Run analysis queries on selected repositories. - + Args: repositories: List of Repository objects to analyze """ if not repositories: return - - print(f"\n=== Repository Analysis ===") - print(f"Selected {len(repositories)} repositories for analysis:") - + + print('\n=== Repository Analysis ===') + print(f'Selected {len(repositories)} repositories for analysis:') + for i, repo in enumerate(repositories, 1): - print(f"{i}) {repo.full_name}") - - print("\nWhat type of analysis would you like to perform?") - print("1) Top contributors") - print("2) External contributors analysis") - print("3) Citation analysis (requires DOIs)") - - choice = input("Enter your choice (1-3): ").strip() - - if choice == "1": + print(f'{i}) {repo.full_name}') + + print('\nWhat type of analysis would you like to perform?') + print('1) Top contributors') + print('2) External contributors analysis') + print('3) Citation analysis (requires DOIs)') + + choice = input('Enter your choice (1-3): ').strip() + + if choice == '1': for repo in repositories: - print(f"\nAnalyzing top contributors for {repo.full_name}:") + print(f'\nAnalyzing top contributors for {repo.full_name}:') from queries import top10 + 
top10.main(repo.id) - - elif choice == "2": + + elif choice == '2': for repo in repositories: - print(f"\nAnalyzing external contributors for {repo.full_name}:") + print(f'\nAnalyzing external contributors for {repo.full_name}:') from queries import externalcontributors + externalcontributors.main(repo.id) - - elif choice == "3": + + elif choice == '3': for repo in repositories: if not repo.dois: - print(f"\n{repo.full_name} has no associated DOIs, skipping citation analysis.") + print( + f'\n{repo.full_name} has no associated DOIs, skipping citation analysis.' + ) continue - - print(f"\nAnalyzing citations for {repo.full_name}:") + + print(f'\nAnalyzing citations for {repo.full_name}:') from queries import top_topics + top_topics.main(repo.id) + def people_surfacing_phase(manager: InstitutionAnalysisManager) -> bool: """ Run the people surfacing phase. - + Args: manager: The InstitutionAnalysisManager instance - + Returns: True if surfacing was successful, False otherwise """ - print("\n=== People Surfacing Phase ===") - manager.set_phase("surfacing") - + print('\n=== People Surfacing Phase ===') + manager.set_phase('surfacing') + # Check for past surfacing runs with get_db_session() as session: - past_runs = session.query(SurfacingResult).filter( - SurfacingResult.session_id == manager.db_session_id - ).order_by( - SurfacingResult.run_at.desc() - ).all() - + past_runs = ( + session.query(SurfacingResult) + .filter(SurfacingResult.session_id == manager.db_session_id) + .order_by(SurfacingResult.run_at.desc()) + .all() + ) + if past_runs: - print("\nPast surfacing runs for this session:") + print('\nPast surfacing runs for this session:') for i, run in enumerate(past_runs, 1): algorithm = run.algorithm - date = run.run_at.strftime("%Y-%m-%d %H:%M") + date = run.run_at.strftime('%Y-%m-%d %H:%M') count = run.result_count - print(f"{i}) {algorithm} ({date}): {count} people found") - - print("\nDo you want to:") - print("1) Use a past surfacing run") - print("2) Run a 
new surfacing algorithm") - - choice = input("Enter your choice (1-2): ").strip() - - if choice == "1": - run_idx = input("Select a surfacing run (number): ").strip() + print(f'{i}) {algorithm} ({date}): {count} people found') + + print('\nDo you want to:') + print('1) Use a past surfacing run') + print('2) Run a new surfacing algorithm') + + choice = input('Enter your choice (1-2): ').strip() + + if choice == '1': + run_idx = input('Select a surfacing run (number): ').strip() try: idx = int(run_idx) - 1 if 0 <= idx < len(past_runs): manager.surfacing_id = past_runs[idx].id - print(f"Using past surfacing run: {past_runs[idx].algorithm}") + print(f'Using past surfacing run: {past_runs[idx].algorithm}') return True else: - print("Invalid selection.") + print('Invalid selection.') except ValueError: - print("Invalid input.") - + print('Invalid input.') + # Get available surfacing algorithms - algorithms = get_available_surfacing_algorithms("people") - - print("\n=== Available Surfacing Algorithms ===") + algorithms = get_available_surfacing_algorithms('people') + + print('\n=== Available Surfacing Algorithms ===') for key, algorithm in algorithms.items(): - print(f"{key}) {algorithm.name}: {algorithm.description}") - - choice = input("\nSelect a surfacing algorithm (number): ").strip() - + print(f'{key}) {algorithm.name}: {algorithm.description}') + + choice = input('\nSelect a surfacing algorithm (number): ').strip() + if choice in algorithms: algorithm = algorithms[choice] - print(f"\nRunning {algorithm.name}...") - + print(f'\nRunning {algorithm.name}...') + # Collect algorithm-specific parameters parameters = {} - + # Run the algorithm try: surfacing_id = algorithm.run( - manager.db_session_id, - manager.institution_info, - parameters + manager.db_session_id, manager.institution_info, parameters ) - + manager.surfacing_id = surfacing_id - + # Show results with get_db_session() as session: - result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() 
+ result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if result: - print(f"\nSurfacing complete. Found {result.result_count} people.") + print(f'\nSurfacing complete. Found {result.result_count} people.') return True except Exception as e: - logger.error(f"Error during surfacing: {e}") - print(f"Error during surfacing: {e}") + logger.error(f'Error during surfacing: {e}') + print(f'Error during surfacing: {e}') else: - print("Invalid selection.") - + print('Invalid selection.') + return False + def people_acf_phase(manager: InstitutionAnalysisManager) -> bool: """ Run the people ACF phase. - + Args: manager: The InstitutionAnalysisManager instance - + Returns: True if ACF was successful, False otherwise """ if not manager.surfacing_id: - print("No surfacing results available. Please complete the surfacing phase first.") + print( + 'No surfacing results available. Please complete the surfacing phase first.' + ) return False - - print("\n=== People ACF Phase ===") - manager.set_phase("acf") - + + print('\n=== People ACF Phase ===') + manager.set_phase('acf') + # Check for past ACF runs with get_db_session() as session: - past_runs = session.query(ACFResult).filter( - ACFResult.session_id == manager.db_session_id - ).order_by( - ACFResult.run_at.desc() - ).all() - + past_runs = ( + session.query(ACFResult) + .filter(ACFResult.session_id == manager.db_session_id) + .order_by(ACFResult.run_at.desc()) + .all() + ) + if past_runs: - print("\nPast ACF runs for this session:") + print('\nPast ACF runs for this session:') for i, run in enumerate(past_runs, 1): filter_name = run.filter_name - date = run.run_at.strftime("%Y-%m-%d %H:%M") - print(f"{i}) {filter_name} ({date})") - - print("\nDo you want to:") - print("1) Use a past ACF run") - print("2) Run a new ACF") - - choice = input("Enter your choice (1-2): ").strip() - - if choice == "1": - run_idx = input("Select an ACF run (number): ").strip() + date = run.run_at.strftime('%Y-%m-%d %H:%M') 
+ print(f'{i}) {filter_name} ({date})') + + print('\nDo you want to:') + print('1) Use a past ACF run') + print('2) Run a new ACF') + + choice = input('Enter your choice (1-2): ').strip() + + if choice == '1': + run_idx = input('Select an ACF run (number): ').strip() try: idx = int(run_idx) - 1 if 0 <= idx < len(past_runs): manager.acf_id = past_runs[idx].id - print(f"Using past ACF run: {past_runs[idx].filter_name}") + print(f'Using past ACF run: {past_runs[idx].filter_name}') return True else: - print("Invalid selection.") + print('Invalid selection.') except ValueError: - print("Invalid input.") - + print('Invalid input.') + # Get available person ACF filters filters = get_available_person_filters() - - print("\n=== Available Person Confidence Filters ===") + + print('\n=== Available Person Confidence Filters ===') for key, filter_obj in filters.items(): - print(f"{key}) {filter_obj.name}") - print(f" {filter_obj.description}") - - choice = input("\nSelect a filter to apply (number): ").strip() - + print(f'{key}) {filter_obj.name}') + print(f' {filter_obj.description}') + + choice = input('\nSelect a filter to apply (number): ').strip() + if choice in filters: filter_obj = filters[choice] - print(f"\nApplying {filter_obj.name}...") - + print(f'\nApplying {filter_obj.name}...') + # Get people from surfacing with get_db_session() as session: - surfaced_people = session.query(SurfacedPerson).filter( - SurfacedPerson.surfacing_id == manager.surfacing_id - ).all() - + surfaced_people = ( + session.query(SurfacedPerson) + .filter(SurfacedPerson.surfacing_id == manager.surfacing_id) + .all() + ) + if not surfaced_people: - print("No people found from surfacing. Cannot apply ACF.") + print('No people found from surfacing. 
Cannot apply ACF.') return False - + # Create a new ACF result record with get_db_session() as session: acf_result = ACFResult( @@ -752,31 +842,33 @@ def people_acf_phase(manager: InstitutionAnalysisManager) -> bool: surfacing_id=manager.surfacing_id, filter_name=filter_obj.name, run_at=datetime.now(), - parameters=json.dumps(manager.institution_info) + parameters=json.dumps(manager.institution_info), ) session.add(acf_result) session.commit() acf_id = acf_result.id - + # Apply the filter to each person with get_db_session() as session: high_confidence = 0 medium_confidence = 0 low_confidence = 0 - + for person in surfaced_people: - confidence, evidence = filter_obj.calculate_confidence(person, manager.institution_info) - + confidence, evidence = filter_obj.calculate_confidence( + person, manager.institution_info + ) + if confidence > 0: # Store the result result = ACFPersonResult( acf_id=acf_id, surfaced_person_id=person.id, confidence_score=confidence, - evidence=json.dumps(evidence) + evidence=json.dumps(evidence), ) session.add(result) - + # Count by confidence level if confidence >= 0.7: high_confidence += 1 @@ -784,275 +876,309 @@ def people_acf_phase(manager: InstitutionAnalysisManager) -> bool: medium_confidence += 1 else: low_confidence += 1 - + # Update the ACF result summary acf_result = session.query(ACFResult).filter_by(id=acf_id).first() if acf_result: result_count = high_confidence + medium_confidence + low_confidence - acf_result.result_summary = json.dumps({ - "count": result_count, - "high_confidence": high_confidence, - "medium_confidence": medium_confidence, - "low_confidence": low_confidence - }) - + acf_result.result_summary = json.dumps( + { + 'count': result_count, + 'high_confidence': high_confidence, + 'medium_confidence': medium_confidence, + 'low_confidence': low_confidence, + } + ) + manager.acf_id = acf_id total_results = high_confidence + medium_confidence + low_confidence - print(f"\nACF complete. 
Found {total_results} people with confidence scores.") - print(f" High confidence (≥0.7): {high_confidence}") - print(f" Medium confidence (≥0.4): {medium_confidence}") - print(f" Low confidence (>0.0): {low_confidence}") - + print(f'\nACF complete. Found {total_results} people with confidence scores.') + print(f' High confidence (≥0.7): {high_confidence}') + print(f' Medium confidence (≥0.4): {medium_confidence}') + print(f' Low confidence (>0.0): {low_confidence}') + return True else: - print("Invalid selection.") - + print('Invalid selection.') + return False + def people_analysis_phase(manager: InstitutionAnalysisManager) -> bool: """ Run the people analysis phase. - + Args: manager: The InstitutionAnalysisManager instance - + Returns: True if analysis was successful, False otherwise """ if not manager.acf_id: - print("No ACF results available. Please complete the ACF phase first.") + print('No ACF results available. Please complete the ACF phase first.') return False - - print("\n=== People Analysis Phase ===") - manager.set_phase("analysis") - + + print('\n=== People Analysis Phase ===') + manager.set_phase('analysis') + # Get ACF results with get_db_session() as session: - acf_results = session.query(ACFPersonResult).filter( - ACFPersonResult.acf_id == manager.acf_id - ).order_by( - ACFPersonResult.confidence_score.desc() - ).all() - + acf_results = ( + session.query(ACFPersonResult) + .filter(ACFPersonResult.acf_id == manager.acf_id) + .order_by(ACFPersonResult.confidence_score.desc()) + .all() + ) + if not acf_results: - print("No person ACF results found. Cannot perform analysis.") + print('No person ACF results found. 
Cannot perform analysis.') return False - + # Ask for confidence threshold - min_confidence = input("\nMinimum confidence threshold (0.0-1.0, default=0.5): ").strip() or "0.5" + min_confidence = ( + input('\nMinimum confidence threshold (0.0-1.0, default=0.5): ').strip() + or '0.5' + ) try: min_confidence = float(min_confidence) min_confidence = max(0.0, min(1.0, min_confidence)) except ValueError: - print("Invalid threshold, using default 0.5") + print('Invalid threshold, using default 0.5') min_confidence = 0.5 - + # Filter by confidence threshold and collect person details with get_db_session() as session: - filtered_results = session.query( - ACFPersonResult, SurfacedPerson - ).join( - SurfacedPerson, SurfacedPerson.id == ACFPersonResult.surfaced_person_id - ).filter( - ACFPersonResult.acf_id == manager.acf_id, - ACFPersonResult.confidence_score >= min_confidence - ).order_by( - ACFPersonResult.confidence_score.desc() - ).all() - + filtered_results = ( + session.query(ACFPersonResult, SurfacedPerson) + .join( + SurfacedPerson, SurfacedPerson.id == ACFPersonResult.surfaced_person_id + ) + .filter( + ACFPersonResult.acf_id == manager.acf_id, + ACFPersonResult.confidence_score >= min_confidence, + ) + .order_by(ACFPersonResult.confidence_score.desc()) + .all() + ) + if not filtered_results: - print(f"No people meet the confidence threshold of {min_confidence}.") + print(f'No people meet the confidence threshold of {min_confidence}.') return False - + # Display the results - print(f"\n=== People Associated with {manager.institution_name} ===") - print(f"Found {len(filtered_results)} people with confidence ≥ {min_confidence}") - + print(f'\n=== People Associated with {manager.institution_name} ===') + print( + f'Found {len(filtered_results)} people with confidence ≥ {min_confidence}' + ) + for i, (result, person) in enumerate(filtered_results, 1): - confidence_level = "HIGH" if result.confidence_score >= 0.7 else "MEDIUM" if result.confidence_score >= 0.4 else "LOW" 
- + confidence_level = ( + 'HIGH' + if result.confidence_score >= 0.7 + else 'MEDIUM' + if result.confidence_score >= 0.4 + else 'LOW' + ) + # Get person details details = [] if person.name: - details.append(f"Name: {person.name}") + details.append(f'Name: {person.name}') if person.email: - details.append(f"Email: {person.email}") - + details.append(f'Email: {person.email}') + # Get user or author details if available user = None author = None - + if person.user_id: user = session.query(User).filter_by(id=person.user_id).first() if user: - details.append(f"GitHub: {user.login}") + details.append(f'GitHub: {user.login}') if user.company: - details.append(f"Company: {user.company}") - + details.append(f'Company: {user.company}') + if person.openalex_author_id: - author = session.query(OpenAlexAuthor).filter_by(id=person.openalex_author_id).first() + author = ( + session.query(OpenAlexAuthor) + .filter_by(id=person.openalex_author_id) + .first() + ) if author: - details.append(f"OpenAlex ID: {author.openalex_id}") - details.append(f"Works: {author.works_count or 'Unknown'}") - - print(f"\n{i}) {person.name or 'Unknown'}") - print(f" Confidence: {result.confidence_score:.2f} ({confidence_level})") + details.append(f'OpenAlex ID: {author.openalex_id}') + details.append(f'Works: {author.works_count or "Unknown"}') + + print(f'\n{i}) {person.name or "Unknown"}') + print(f' Confidence: {result.confidence_score:.2f} ({confidence_level})') for detail in details: - print(f" {detail}") - + print(f' {detail}') + # Display evidence highlights if result.evidence: try: evidence = json.loads(result.evidence) - print(" Evidence Highlights:") + print(' Evidence Highlights:') display_person_evidence(evidence) except json.JSONDecodeError: pass - + # Future expansion: Add person-specific analysis options here - - manager.set_phase("completed") + + manager.set_phase('completed') return True + def display_person_evidence(evidence: Dict): """ Format and display evidence from Person ACF 
results. - + Args: evidence: Evidence dictionary from ACF """ if 'email_match' in evidence: email_info = evidence['email_match'] - print(f" ✓ Email domain match: {email_info['email']}") - + print(f' ✓ Email domain match: {email_info["email"]}') + if 'subdomain_match' in evidence: subdomain_info = evidence['subdomain_match'] - print(f" ✓ Subdomain match: {subdomain_info['user_domain']} (institution: {subdomain_info['institution_domain']})") - + print( + f' ✓ Subdomain match: {subdomain_info["user_domain"]} (institution: {subdomain_info["institution_domain"]})' + ) + if 'company_match' in evidence: company_info = evidence['company_match'] - print(f" ✓ Company/organization match: {company_info.get('company', 'Institution mentioned')}") - + print( + f' ✓ Company/organization match: {company_info.get("company", "Institution mentioned")}' + ) + if 'bio_match' in evidence: bio_info = evidence['bio_match'] - print(f" ✓ Bio mentions institution: {bio_info.get('bio_excerpt', '')}") - + print(f' ✓ Bio mentions institution: {bio_info.get("bio_excerpt", "")}') + if 'location_match' in evidence: location_info = evidence['location_match'] - print(f" ✓ Location match: {location_info.get('location', '')}") - + print(f' ✓ Location match: {location_info.get("location", "")}') + if 'institution_affiliation' in evidence: affiliation = evidence['institution_affiliation'] - print(f" ✓ OpenAlex institutional affiliation: {affiliation['institution']}") - + print( + f' ✓ OpenAlex institutional affiliation: {affiliation["institution"]}' + ) + if 'coauthor_affiliations' in evidence: coauthor_info = evidence['coauthor_affiliations'] if 'matching_works' in coauthor_info: - print(f" ✓ Co-authored with institution affiliates:") + print(' ✓ Co-authored with institution affiliates:') for i, work in enumerate(coauthor_info['matching_works'][:2], 1): - print(f" {i}. {work.get('title', 'Unknown')} ({work.get('year', 'Unknown')})") - + print( + f' {i}. 
{work.get("title", "Unknown")} ({work.get("year", "Unknown")})' + ) + if 'component_scores' in evidence: - print(" ✓ Combined from multiple factors:") + print(' ✓ Combined from multiple factors:') for filter_name, score in evidence['component_scores'].items(): - print(f" • {filter_name}: {score:.2f}") + print(f' • {filter_name}: {score:.2f}') + def repository_analysis_workflow(manager: InstitutionAnalysisManager) -> None: """ Run the complete repository analysis workflow. - + Args: manager: The InstitutionAnalysisManager instance """ # Phase 1: Surfacing - if manager.current_phase in ["initiated", "surfacing"]: + if manager.current_phase in ['initiated', 'surfacing']: if not repository_surfacing_phase(manager): - print("Repository surfacing failed. Cannot continue.") + print('Repository surfacing failed. Cannot continue.') return - + # Phase 2: ACF - if manager.current_phase in ["surfacing", "acf"]: + if manager.current_phase in ['surfacing', 'acf']: if not repository_acf_phase(manager): - print("Repository ACF failed. Cannot continue.") + print('Repository ACF failed. Cannot continue.') return - + # Phase 3: Analysis - if manager.current_phase in ["acf", "analysis"]: + if manager.current_phase in ['acf', 'analysis']: if not repository_analysis_phase(manager): - print("Repository analysis failed.") + print('Repository analysis failed.') return + def people_analysis_workflow(manager: InstitutionAnalysisManager) -> None: """ Run the complete people analysis workflow. - + Args: manager: The InstitutionAnalysisManager instance """ # Phase 1: Surfacing - if manager.current_phase in ["initiated", "surfacing"]: + if manager.current_phase in ['initiated', 'surfacing']: if not people_surfacing_phase(manager): - print("People surfacing failed. Cannot continue.") + print('People surfacing failed. 
Cannot continue.') return - + # Phase 2: ACF - if manager.current_phase in ["surfacing", "acf"]: + if manager.current_phase in ['surfacing', 'acf']: if not people_acf_phase(manager): - print("People ACF failed. Cannot continue.") + print('People ACF failed. Cannot continue.') return - + # Phase 3: Analysis - if manager.current_phase in ["acf", "analysis"]: + if manager.current_phase in ['acf', 'analysis']: if not people_analysis_phase(manager): - print("People analysis failed.") + print('People analysis failed.') return + def institutional_repository_discovery(): """Main entry point for the institution analysis interactive mode.""" while True: print_institution_analysis_menu() - choice = input("Enter your choice (1-3): ").strip() - - if choice == "3": - print("Returning to main menu.") + choice = input('Enter your choice (1-3): ').strip() + + if choice == '3': + print('Returning to main menu.') return - - if choice not in ["1", "2"]: - print("Invalid choice. Please try again.") + + if choice not in ['1', '2']: + print('Invalid choice. 
Please try again.') continue - - analysis_type = "repository" if choice == "1" else "people" - + + analysis_type = 'repository' if choice == '1' else 'people' + # Collect institution information institution_info = collect_institution_info() if not institution_info: continue - + # Initialize the analysis manager manager = InstitutionAnalysisManager( - institution_name=institution_info["name"], - analysis_type=analysis_type + institution_name=institution_info['name'], analysis_type=analysis_type ) - + # Set additional institution information manager.set_institution_info( - domains=institution_info["domains"], - github_orgs=institution_info["github_orgs"] + domains=institution_info['domains'], + github_orgs=institution_info['github_orgs'], ) - + # Check for past sessions past_session_id = check_past_sessions(manager) if past_session_id: manager.load_session(past_session_id) - + # Run the appropriate workflow - if analysis_type == "repository": + if analysis_type == 'repository': repository_analysis_workflow(manager) else: # people people_analysis_workflow(manager) + def main(): institutional_repository_discovery() -if __name__ == "__main__": - main() \ No newline at end of file + +if __name__ == '__main__': + main() diff --git a/Older Experiments/scrappy-proof-of-concept/queries/interactive_query.py b/Older Experiments/scrappy-proof-of-concept/queries/interactive_query.py index ad5c8e2..7035679 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/interactive_query.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/interactive_query.py @@ -1,33 +1,35 @@ -import sys import logging -from sqlalchemy.orm import joinedload +import sys + from db.database import SessionLocal from models.models import Repository -import re +from sqlalchemy.orm import joinedload # Set up logging with both file and stream handlers logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - 
logging.FileHandler("query_results.log"), - logging.StreamHandler() - ] + handlers=[logging.FileHandler('query_results.log'), logging.StreamHandler()], ) + # Optional: Redirect stdout to logging so all prints are captured class LoggerWriter: def __init__(self, level): self.level = level + def write(self, message): message = message.strip() if message: self.level(message) + def flush(self): pass + sys.stdout = LoggerWriter(logging.info) + def select_repository_custom(): """ Allow the user to iteratively apply filters to the repository list. @@ -39,70 +41,77 @@ def select_repository_custom(): filter_stack = [] # Keep track of applied filters # Import available filter functions from utils/filters.py - from utils.filters import filter_has_doi, filter_has_stars, filter_has_contributors, filter_has_forks + from utils.filters import ( + filter_has_contributors, + filter_has_doi, + filter_has_forks, + filter_has_stars, + ) + available_filters = { - "1": ("Has DOI", filter_has_doi), - "2": ("Has Stars", filter_has_stars), - "3": ("Has Contributor", filter_has_contributors), - "4": ("Has Fork", filter_has_forks) + '1': ('Has DOI', filter_has_doi), + '2': ('Has Stars', filter_has_stars), + '3': ('Has Contributor', filter_has_contributors), + '4': ('Has Fork', filter_has_forks), } while True: repos = current_query.all() - print("\nCurrent Repositories:") + print('\nCurrent Repositories:') if repos: for i, repo in enumerate(repos, start=1): - print(f"{i}) {repo.full_name}") + print(f'{i}) {repo.full_name}') else: - print("No repositories match the current filters.") + print('No repositories match the current filters.') - print("\nOptions:") - print("A) Add a new filter") + print('\nOptions:') + print('A) Add a new filter') if filter_stack: - print("B) Undo last filter") - print("R) Reset all filters") - print("S) Select a repository from the list") - option = input("Enter your choice (A/B/R/S): ").strip().upper() + print('B) Undo last filter') + print('R) Reset all filters') + 
print('S) Select a repository from the list') + option = input('Enter your choice (A/B/R/S): ').strip().upper() - if option == "A": - print("\nAvailable Filters:") + if option == 'A': + print('\nAvailable Filters:') for key, (desc, _) in available_filters.items(): - print(f"{key}) {desc}") - chosen = input("Enter the filter number to apply: ").strip() + print(f'{key}) {desc}') + chosen = input('Enter the filter number to apply: ').strip() if chosen in available_filters: _, filter_func = available_filters[chosen] filter_stack.append((chosen, filter_func)) current_query = filter_func(current_query) else: - print("Invalid filter selection. Try again.") - elif option == "B" and filter_stack: + print('Invalid filter selection. Try again.') + elif option == 'B' and filter_stack: removed_filter = filter_stack.pop() - print(f"Removed filter: {available_filters[removed_filter[0]][0]}") + print(f'Removed filter: {available_filters[removed_filter[0]][0]}') # Rebuild the current query from the base query using remaining filters. current_query = base_query for _, func in filter_stack: current_query = func(current_query) - elif option == "R": + elif option == 'R': filter_stack = [] current_query = base_query - print("All filters have been reset.") - elif option == "S": + print('All filters have been reset.') + elif option == 'S': if not repos: - print("No repositories available to select. Please adjust filters.") + print('No repositories available to select. Please adjust filters.') continue try: - selection = int(input("Enter the number of the repository: ").strip()) + selection = int(input('Enter the number of the repository: ').strip()) if 1 <= selection <= len(repos): selected_repo = repos[selection - 1] - print(f"Selected repository: {selected_repo.full_name}") + print(f'Selected repository: {selected_repo.full_name}') session.close() return selected_repo else: - print("Invalid repository number. Try again.") + print('Invalid repository number. 
Try again.') except ValueError: - print("Please enter a valid number.") + print('Please enter a valid number.') else: - print("Invalid option. Please try again.") + print('Invalid option. Please try again.') + def select_doi(repository): """ @@ -111,78 +120,89 @@ def select_doi(repository): """ dois = repository.dois if not dois: - print("No DOIs found for this repository. Defaulting to all associated DOIs.") + print('No DOIs found for this repository. Defaulting to all associated DOIs.') return None - print("\nSelect a DOI to analyze:") - print("0) All Associated DOIs") + print('\nSelect a DOI to analyze:') + print('0) All Associated DOIs') for i, doi_obj in enumerate(dois, start=1): - print(f"{i}) {doi_obj.doi} (Source: {doi_obj.source})") + print(f'{i}) {doi_obj.doi} (Source: {doi_obj.source})') while True: - choice = input("Enter the number of your choice: ").strip() + choice = input('Enter the number of your choice: ').strip() try: idx = int(choice) if idx == 0: - print("Selected: All Associated DOIs") + print('Selected: All Associated DOIs') return None elif 1 <= idx <= len(dois): selected_doi = dois[idx - 1].doi - print(f"Selected DOI: {selected_doi}") + print(f'Selected DOI: {selected_doi}') return selected_doi else: - print("Invalid number. Please try again.") + print('Invalid number. 
Please try again.') except ValueError: - print("Please enter a valid number.") + print('Please enter a valid number.') + def print_query_menu(): - print("\nSelect a query to run:") - print("1) Institutions with Works Matching the DOI (usage query)") - print("2) Top 10 contributors by merged PRs (top10 query)") - print("3) Engaged but Non-PR Users (external contributors query)") - print("4) Top Topics of Works that Cite the DOI") - print("5) Top Subfields of Works that Cite the DOI") - print("6) Top Fields of Works that Cite the DOI") - print("7) Top Domains of Works that Cite the DOI") - print("8) Citing Works") - print("0) Exit") + print('\nSelect a query to run:') + print('1) Institutions with Works Matching the DOI (usage query)') + print('2) Top 10 contributors by merged PRs (top10 query)') + print('3) Engaged but Non-PR Users (external contributors query)') + print('4) Top Topics of Works that Cite the DOI') + print('5) Top Subfields of Works that Cite the DOI') + print('6) Top Fields of Works that Cite the DOI') + print('7) Top Domains of Works that Cite the DOI') + print('8) Citing Works') + print('0) Exit') + def interactive_query(): repo = select_repository_custom() if not repo: - sys.exit("No repository selected. Exiting.") + sys.exit('No repository selected. 
Exiting.') selected_doi = select_doi(repo) repo_id = repo.id while True: print_query_menu() - choice = input("Enter your choice: ").strip() - if choice == "1": + choice = input('Enter your choice: ').strip() + if choice == '1': from queries import usage + usage.main(repo_id, doi_filter=selected_doi) - elif choice == "2": + elif choice == '2': from queries import top10 + top10.main(repo_id) - elif choice == "3": + elif choice == '3': from queries import externalcontributors + externalcontributors.main(repo_id) - elif choice == "4": + elif choice == '4': from queries import top_topics + top_topics.main(repo_id, doi_filter=selected_doi) - elif choice == "5": + elif choice == '5': from queries import top_subfields + top_subfields.main(repo_id, doi_filter=selected_doi) - elif choice == "6": + elif choice == '6': from queries import top_fields + top_fields.main(repo_id, doi_filter=selected_doi) - elif choice == "7": + elif choice == '7': from queries import top_domains + top_domains.main(repo_id, doi_filter=selected_doi) - elif choice == "8": + elif choice == '8': from queries import citing_works + citing_works.main(repo_id, doi_filter=selected_doi) - elif choice == "0": - print("Exiting interactive query mode.") + elif choice == '0': + print('Exiting interactive query mode.') sys.exit(0) else: - print("Invalid choice, please try again.") + print('Invalid choice, please try again.') + -if __name__ == "__main__": +if __name__ == '__main__': interactive_query() diff --git a/Older Experiments/scrappy-proof-of-concept/queries/top10.py b/Older Experiments/scrappy-proof-of-concept/queries/top10.py index b6b549f..0af5aab 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/top10.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/top10.py @@ -1,31 +1,34 @@ -from sqlalchemy import desc, func -from models.models import User, PullRequest, Repository from db.database import get_db_session +from models.models import PullRequest, Repository, User +from sqlalchemy 
import desc, func + def top_merged_pr_contributors(session, repo_id, limit=10): results = ( session.query( - User.login.label("user_login"), - func.count(PullRequest.id).label("merged_count") + User.login.label('user_login'), + func.count(PullRequest.id).label('merged_count'), ) .join(PullRequest, PullRequest.user_id == User.id) .filter(PullRequest.merged_at.isnot(None)) .filter(PullRequest.repository_id == repo_id) .group_by(User.login) - .order_by(desc("merged_count")) + .order_by(desc('merged_count')) .limit(limit) .all() ) return results + def main(repo_id): with get_db_session() as session: repo_obj = session.query(Repository).filter_by(id=repo_id).first() repo_name = repo_obj.full_name if repo_obj else str(repo_id) contributors = top_merged_pr_contributors(session, repo_id, limit=10) - print(f"Top 10 contributors by merged PRs for repository: {repo_name}") + print(f'Top 10 contributors by merged PRs for repository: {repo_name}') for user_login, merged_count in contributors: - print(f"{user_login}: {merged_count} merged PRs") + print(f'{user_login}: {merged_count} merged PRs') + -if __name__ == "__main__": - print("This module is intended to be run from run_queries.py") +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older Experiments/scrappy-proof-of-concept/queries/top_domains.py b/Older Experiments/scrappy-proof-of-concept/queries/top_domains.py index 01eab63..0040200 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/top_domains.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/top_domains.py @@ -1,34 +1,44 @@ from db.database import get_db_session from models.models import OpenAlexWork, Repository + def main(repo_id, doi_filter=None): with get_db_session() as session: repo = session.query(Repository).filter_by(id=repo_id).first() if not repo: - print("Repository not found.") + print('Repository not found.') return if doi_filter: selected_doi = doi_filter else: if repo.dois: 
selected_doi = repo.dois[0].doi - print(f"No specific DOI selected; defaulting to first DOI: {selected_doi}") + print( + f'No specific DOI selected; defaulting to first DOI: {selected_doi}' + ) else: - print("No DOIs found for this repository.") + print('No DOIs found for this repository.') return - work = session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + work = ( + session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + ) if not work: - print(f"No OpenAlex work found with DOI: {selected_doi}") + print(f'No OpenAlex work found with DOI: {selected_doi}') return domain_counts = {} for citing_work in work.citing_works: if citing_work.topics: for topic in citing_work.topics: - domain = topic.domain_display_name or "N/A" + domain = topic.domain_display_name or 'N/A' domain_counts[domain] = domain_counts.get(domain, 0) + 1 - print(f"\nAggregate Top Domains for works citing the work with DOI: {selected_doi}") - for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True): - print(f" {domain}: {count}") + print( + f'\nAggregate Top Domains for works citing the work with DOI: {selected_doi}' + ) + for domain, count in sorted( + domain_counts.items(), key=lambda x: x[1], reverse=True + ): + print(f' {domain}: {count}') + -if __name__ == "__main__": - print("This module is intended to be run from run_queries.py") +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older Experiments/scrappy-proof-of-concept/queries/top_fields.py b/Older Experiments/scrappy-proof-of-concept/queries/top_fields.py index 24e6174..f2a9932 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/top_fields.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/top_fields.py @@ -1,34 +1,44 @@ from db.database import get_db_session from models.models import OpenAlexWork, Repository + def main(repo_id, doi_filter=None): with get_db_session() as session: repo = 
session.query(Repository).filter_by(id=repo_id).first() if not repo: - print("Repository not found.") + print('Repository not found.') return if doi_filter: selected_doi = doi_filter else: if repo.dois: selected_doi = repo.dois[0].doi - print(f"No specific DOI selected; defaulting to first DOI: {selected_doi}") + print( + f'No specific DOI selected; defaulting to first DOI: {selected_doi}' + ) else: - print("No DOIs found for this repository.") + print('No DOIs found for this repository.') return - work = session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + work = ( + session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + ) if not work: - print(f"No OpenAlex work found with DOI: {selected_doi}") + print(f'No OpenAlex work found with DOI: {selected_doi}') return field_counts = {} for citing_work in work.citing_works: if citing_work.topics: for topic in citing_work.topics: - field = topic.field_display_name or "N/A" + field = topic.field_display_name or 'N/A' field_counts[field] = field_counts.get(field, 0) + 1 - print(f"\nAggregate Top Fields for works citing the work with DOI: {selected_doi}") - for field, count in sorted(field_counts.items(), key=lambda x: x[1], reverse=True): - print(f" {field}: {count}") + print( + f'\nAggregate Top Fields for works citing the work with DOI: {selected_doi}' + ) + for field, count in sorted( + field_counts.items(), key=lambda x: x[1], reverse=True + ): + print(f' {field}: {count}') + -if __name__ == "__main__": - print("This module is intended to be run from run_queries.py") +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older Experiments/scrappy-proof-of-concept/queries/top_subfields.py b/Older Experiments/scrappy-proof-of-concept/queries/top_subfields.py index dac1ace..7d895bb 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/top_subfields.py +++ b/Older 
Experiments/scrappy-proof-of-concept/queries/top_subfields.py @@ -1,34 +1,44 @@ from db.database import get_db_session from models.models import OpenAlexWork, Repository + def main(repo_id, doi_filter=None): with get_db_session() as session: repo = session.query(Repository).filter_by(id=repo_id).first() if not repo: - print("Repository not found.") + print('Repository not found.') return if doi_filter: selected_doi = doi_filter else: if repo.dois: selected_doi = repo.dois[0].doi - print(f"No specific DOI selected; defaulting to first DOI: {selected_doi}") + print( + f'No specific DOI selected; defaulting to first DOI: {selected_doi}' + ) else: - print("No DOIs found for this repository.") + print('No DOIs found for this repository.') return - work = session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + work = ( + session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + ) if not work: - print(f"No OpenAlex work found with DOI: {selected_doi}") + print(f'No OpenAlex work found with DOI: {selected_doi}') return subfield_counts = {} for citing_work in work.citing_works: if citing_work.topics: for topic in citing_work.topics: - subfield = topic.subfield_display_name or "N/A" + subfield = topic.subfield_display_name or 'N/A' subfield_counts[subfield] = subfield_counts.get(subfield, 0) + 1 - print(f"\nAggregate Top Subfields for works citing the work with DOI: {selected_doi}") - for subfield, count in sorted(subfield_counts.items(), key=lambda x: x[1], reverse=True): - print(f" {subfield}: {count}") + print( + f'\nAggregate Top Subfields for works citing the work with DOI: {selected_doi}' + ) + for subfield, count in sorted( + subfield_counts.items(), key=lambda x: x[1], reverse=True + ): + print(f' {subfield}: {count}') + -if __name__ == "__main__": - print("This module is intended to be run from run_queries.py") +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older 
Experiments/scrappy-proof-of-concept/queries/top_topics.py b/Older Experiments/scrappy-proof-of-concept/queries/top_topics.py index 3efd084..51d3e46 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/top_topics.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/top_topics.py @@ -1,34 +1,44 @@ from db.database import get_db_session from models.models import OpenAlexWork, Repository + def main(repo_id, doi_filter=None): with get_db_session() as session: repo = session.query(Repository).filter_by(id=repo_id).first() if not repo: - print("Repository not found.") + print('Repository not found.') return if doi_filter: selected_doi = doi_filter else: if repo.dois: selected_doi = repo.dois[0].doi - print(f"No specific DOI selected; defaulting to first DOI: {selected_doi}") + print( + f'No specific DOI selected; defaulting to first DOI: {selected_doi}' + ) else: - print("No DOIs found for this repository.") + print('No DOIs found for this repository.') return - work = session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + work = ( + session.query(OpenAlexWork).filter(OpenAlexWork.doi == selected_doi).first() + ) if not work: - print(f"No OpenAlex work found with DOI: {selected_doi}") + print(f'No OpenAlex work found with DOI: {selected_doi}') return topic_counts = {} for citing_work in work.citing_works: if citing_work.topics: for topic in citing_work.topics: - topic_name = topic.display_name or "N/A" + topic_name = topic.display_name or 'N/A' topic_counts[topic_name] = topic_counts.get(topic_name, 0) + 1 - print(f"\nAggregate Top Topics for works citing the work with DOI: {selected_doi}") - for topic, count in sorted(topic_counts.items(), key=lambda x: x[1], reverse=True): - print(f" {topic}: {count}") + print( + f'\nAggregate Top Topics for works citing the work with DOI: {selected_doi}' + ) + for topic, count in sorted( + topic_counts.items(), key=lambda x: x[1], reverse=True + ): + print(f' {topic}: {count}') + -if __name__ 
== "__main__": - print("This module is intended to be run from run_queries.py") +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older Experiments/scrappy-proof-of-concept/queries/usage.py b/Older Experiments/scrappy-proof-of-concept/queries/usage.py index 29a99f3..c4659e4 100644 --- a/Older Experiments/scrappy-proof-of-concept/queries/usage.py +++ b/Older Experiments/scrappy-proof-of-concept/queries/usage.py @@ -1,34 +1,56 @@ -from sqlalchemy import func, select from db.database import get_db_session -from models.models import DOI, OpenAlexWork, OpenAlexInstitution, OpenAlexAuthor, Repository -from models.models import openalex_author_institutions, openalex_work_authors +from models.models import ( + DOI, + OpenAlexAuthor, + OpenAlexInstitution, + OpenAlexWork, + Repository, + openalex_author_institutions, + openalex_work_authors, +) +from sqlalchemy import func, select + def main(repo_id, doi_filter=None): with get_db_session() as session: repository = session.query(Repository).filter_by(id=repo_id).first() if not repository: - print(f"Repository with id {repo_id} not found in the database.") + print(f'Repository with id {repo_id} not found in the database.') return repository_id = repository.id - doi_subquery = session.query(DOI.doi).filter(DOI.repository_id == repository_id).subquery() + doi_subquery = ( + session.query(DOI.doi).filter(DOI.repository_id == repository_id).subquery() + ) institutions_query_with_doi = ( session.query( OpenAlexInstitution.display_name, - func.count(func.distinct(OpenAlexAuthor.id)).label("author_count") + func.count(func.distinct(OpenAlexAuthor.id)).label('author_count'), + ) + .join( + openalex_author_institutions, + OpenAlexInstitution.id == openalex_author_institutions.c.institution_id, + ) + .join( + OpenAlexAuthor, + OpenAlexAuthor.id == openalex_author_institutions.c.author_id, + ) + .join( + openalex_work_authors, + OpenAlexAuthor.id == 
openalex_work_authors.c.author_id, ) - .join(openalex_author_institutions, OpenAlexInstitution.id == openalex_author_institutions.c.institution_id) - .join(OpenAlexAuthor, OpenAlexAuthor.id == openalex_author_institutions.c.author_id) - .join(openalex_work_authors, OpenAlexAuthor.id == openalex_work_authors.c.author_id) .join(OpenAlexWork, OpenAlexWork.id == openalex_work_authors.c.work_id) .filter( - func.replace(OpenAlexWork.doi, 'https://doi.org/', '').in_(select(doi_subquery.c.doi)) + func.replace(OpenAlexWork.doi, 'https://doi.org/', '').in_( + select(doi_subquery.c.doi) + ) ) .group_by(OpenAlexInstitution.id) .all() ) print("\n=== Institutions with Works Matching the Repository's DOIs ===") for institution_name, author_count in institutions_query_with_doi: - print(f"Institution: {institution_name} — {author_count} distinct authors") + print(f'Institution: {institution_name} — {author_count} distinct authors') + -if __name__ == "__main__": - print("This module is intended to be run from run_queries.py") +if __name__ == '__main__': + print('This module is intended to be run from run_queries.py') diff --git a/Older Experiments/scrappy-proof-of-concept/services/acf_base.py b/Older Experiments/scrappy-proof-of-concept/services/acf_base.py index 675fe99..494765b 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/acf_base.py +++ b/Older Experiments/scrappy-proof-of-concept/services/acf_base.py @@ -4,37 +4,40 @@ """ from abc import ABC, abstractmethod -from typing import Dict, Tuple, Any +from typing import Any, Dict, Tuple from models.models import Repository + class AssociationConfidenceFilter(ABC): """Base class for all Association Confidence Filters.""" - + @property @abstractmethod def name(self) -> str: """Return the name of the filter.""" pass - + @property @abstractmethod def description(self) -> str: """Return a description of how the filter works.""" pass - + @abstractmethod - def calculate_confidence(self, repository: Repository, 
institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + def calculate_confidence( + self, repository: Repository, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: """ Calculate a confidence score (0.0-1.0) that a repository is associated with the institution. - + Args: repository: The Repository object to analyze institution_info: Dictionary containing institution data (name, domains, etc.) - + Returns: Tuple of (confidence_score, evidence_dict) - confidence_score: Float from 0.0 to 1.0 - evidence_dict: Dictionary explaining the reasoning """ - pass \ No newline at end of file + pass diff --git a/Older Experiments/scrappy-proof-of-concept/services/acf_filters/__init__.py b/Older Experiments/scrappy-proof-of-concept/services/acf_filters/__init__.py index 281d748..355b2ff 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/acf_filters/__init__.py +++ b/Older Experiments/scrappy-proof-of-concept/services/acf_filters/__init__.py @@ -9,4 +9,4 @@ from services.acf_filters.comprehensive_filter import ComprehensiveFilter # Export the filter classes -__all__ = ['AssociationConfidenceFilter', 'ComprehensiveFilter'] \ No newline at end of file +__all__ = ['AssociationConfidenceFilter', 'ComprehensiveFilter'] diff --git a/Older Experiments/scrappy-proof-of-concept/services/acf_filters/comprehensive_filter.py b/Older Experiments/scrappy-proof-of-concept/services/acf_filters/comprehensive_filter.py index 7104fbb..61d7047 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/acf_filters/comprehensive_filter.py +++ b/Older Experiments/scrappy-proof-of-concept/services/acf_filters/comprehensive_filter.py @@ -1,113 +1,134 @@ # services/acf_filters/comprehensive_filter.py -import json import logging -from typing import Dict, Tuple, Any, List +from typing import Any, Dict, List, Tuple -from sqlalchemy.orm import joinedload from db.database import get_db_session -from models.models import Repository, User, Organization, OpenAlexWork, OpenAlexAuthor 
-from models.models import PullRequest, Issue, IssueComment, Commit -from services.acf_base import AssociationConfidenceFilter # Import from base file instead +from models.models import ( + Issue, + OpenAlexAuthor, + OpenAlexWork, + Organization, + PullRequest, + Repository, + User, +) +from services.acf_base import ( + AssociationConfidenceFilter, # Import from base file instead +) +from sqlalchemy.orm import joinedload logger = logging.getLogger(__name__) + class ComprehensiveFilter(AssociationConfidenceFilter): """ A comprehensive filter that implements a hierarchical confidence scoring system for determining if a repository is associated with an institution. """ - + @property def name(self) -> str: - return "Comprehensive Filter" - + return 'Comprehensive Filter' + @property def description(self) -> str: return ( - "Applies a hierarchical confidence scoring system with multiple factors:\n" - "- Direct ownership (100% confidence): Repository owned by institution GitHub org\n" - "- Core contributors (up to 90%): Repository maintainers affiliated with institution\n" - "- High confidence (up to 90%): Email domains match, OpenAlex affiliations\n" - "- Medium confidence (up to 60%): Institution name in repo name/description\n" - "- Lower confidence: Topic matches and indirect references" + 'Applies a hierarchical confidence scoring system with multiple factors:\n' + '- Direct ownership (100% confidence): Repository owned by institution GitHub org\n' + '- Core contributors (up to 90%): Repository maintainers affiliated with institution\n' + '- High confidence (up to 90%): Email domains match, OpenAlex affiliations\n' + '- Medium confidence (up to 60%): Institution name in repo name/description\n' + '- Lower confidence: Topic matches and indirect references' ) - - def calculate_confidence(self, repository: Repository, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + + def calculate_confidence( + self, repository: Repository, institution_info: Dict[str, Any] + 
) -> Tuple[float, Dict]: """ Calculate confidence using a hierarchical approach, checking highest confidence factors first and returning as soon as a match is found. """ evidence = {} - + # Get basic institution info institution_name = institution_info.get('name', '') domains = institution_info.get('domains', []) github_orgs = institution_info.get('github_orgs', []) - + if not institution_name: return 0.0, {} - + # LEVEL 1: Direct ownership (100% confidence) direct_ownership = self._check_direct_ownership(repository, github_orgs) if direct_ownership: evidence['direct_ownership'] = direct_ownership return 1.0, evidence - + # LEVEL 1.5: Core contributors (high confidence, up to 90%) - core_contributors = self._check_core_contributors(repository, institution_name, domains) + core_contributors = self._check_core_contributors( + repository, institution_name, domains + ) if core_contributors and core_contributors.get('score', 0) >= 0.8: evidence['core_contributors'] = core_contributors return core_contributors.get('score', 0), evidence - + # LEVEL 2: High confidence factors (up to 90%) email_evidence = self._check_email_domains(repository, domains) if email_evidence and email_evidence.get('score', 0) >= 0.7: evidence['email_domains'] = email_evidence return email_evidence.get('score', 0), evidence - - openalex_evidence = self._check_openalex_affiliations(repository, institution_name) + + openalex_evidence = self._check_openalex_affiliations( + repository, institution_name + ) if openalex_evidence and openalex_evidence.get('score', 0) >= 0.7: evidence['openalex_affiliations'] = openalex_evidence return openalex_evidence.get('score', 0), evidence - + # If we have core contributors and another high factor, combine them if core_contributors and core_contributors.get('score', 0) >= 0.5: if email_evidence or openalex_evidence: evidence['core_contributors'] = core_contributors - + if email_evidence: evidence['email_domains'] = email_evidence - combined_score = min(0.9, 
(core_contributors.get('score', 0) * 0.6) + - (email_evidence.get('score', 0) * 0.4)) - + combined_score = min( + 0.9, + (core_contributors.get('score', 0) * 0.6) + + (email_evidence.get('score', 0) * 0.4), + ) + evidence['combined_high_confidence'] = { 'core_contributor_score': core_contributors.get('score', 0), 'email_score': email_evidence.get('score', 0), - 'combined_score': combined_score + 'combined_score': combined_score, } - + if combined_score >= 0.7: return combined_score, evidence - + if openalex_evidence: evidence['openalex_affiliations'] = openalex_evidence - combined_score = min(0.9, (core_contributors.get('score', 0) * 0.6) + - (openalex_evidence.get('score', 0) * 0.4)) - + combined_score = min( + 0.9, + (core_contributors.get('score', 0) * 0.6) + + (openalex_evidence.get('score', 0) * 0.4), + ) + evidence['combined_high_confidence'] = { 'core_contributor_score': core_contributors.get('score', 0), 'openalex_score': openalex_evidence.get('score', 0), - 'combined_score': combined_score + 'combined_score': combined_score, } - + if combined_score >= 0.7: return combined_score, evidence - + # Continue with existing code... 
if email_evidence and openalex_evidence: email_score = email_evidence.get('score', 0) openalex_score = openalex_evidence.get('score', 0) - + if email_score > 0 and openalex_score > 0: combined_score = min(0.9, (email_score * 0.6) + (openalex_score * 0.4)) if combined_score >= 0.7: @@ -116,15 +137,15 @@ def calculate_confidence(self, repository: Repository, institution_info: Dict[st evidence['combined_high_confidence'] = { 'email_score': email_score, 'openalex_score': openalex_score, - 'combined_score': combined_score + 'combined_score': combined_score, } return combined_score, evidence - + # LEVEL 3: Medium confidence factors (up to 60%) naming_evidence = self._check_naming_references(repository, institution_name) if naming_evidence and naming_evidence.get('score', 0) >= 0.4: evidence['naming_references'] = naming_evidence - + # Include any high confidence factors we found (even if they weren't high enough alone) if core_contributors: evidence['core_contributors'] = core_contributors @@ -132,70 +153,74 @@ def calculate_confidence(self, repository: Repository, institution_info: Dict[st evidence['email_domains'] = email_evidence if openalex_evidence: evidence['openalex_affiliations'] = openalex_evidence - + return naming_evidence.get('score', 0), evidence - + # LEVEL 4: Lower confidence factors topic_evidence = self._check_topic_matches(repository, institution_name) - + # Combine all evidence found for a final score combined_score = 0.0 factors_found = 0 - + if core_contributors: combined_score += core_contributors.get('score', 0) * 0.4 # Strong weight evidence['core_contributors'] = core_contributors factors_found += 1 - + if email_evidence: combined_score += email_evidence.get('score', 0) * 0.3 evidence['email_domains'] = email_evidence factors_found += 1 - + if openalex_evidence: combined_score += openalex_evidence.get('score', 0) * 0.3 evidence['openalex_affiliations'] = openalex_evidence factors_found += 1 - + if naming_evidence: combined_score += 
naming_evidence.get('score', 0) * 0.25 evidence['naming_references'] = naming_evidence factors_found += 1 - + if topic_evidence: combined_score += topic_evidence.get('score', 0) * 0.15 evidence['topic_matches'] = topic_evidence factors_found += 1 - + # Only return a score if we found at least one factor if factors_found > 0: # Adjust for number of factors - more factors = higher confidence if factors_found >= 3: combined_score *= 1.2 evidence['multi_factor_bonus'] = True - - final_score = min(0.7, combined_score) # Cap at 0.7 for combined low confidence + + final_score = min( + 0.7, combined_score + ) # Cap at 0.7 for combined low confidence return final_score, evidence - + return 0.0, {} - - def _check_direct_ownership(self, repository: Repository, github_orgs: List[str]) -> Dict: + + def _check_direct_ownership( + self, repository: Repository, github_orgs: List[str] + ) -> Dict: """Check if the repository is owned by a known institution GitHub organization.""" with get_db_session() as session: owner = None org = session.query(Organization).filter_by(id=repository.owner_id).first() - + if org: owner_login = org.login - owner_type = "Organization" + owner_type = 'Organization' else: user = session.query(User).filter_by(id=repository.owner_id).first() if user: owner_login = user.login - owner_type = "User" + owner_type = 'User' else: return None - + # Check against provided GitHub orgs for org_name in github_orgs: if org_name and owner_login and org_name.lower() == owner_login.lower(): @@ -203,152 +228,178 @@ def _check_direct_ownership(self, repository: Repository, github_orgs: List[str] 'match_type': 'exact_match', 'owner_type': owner_type, 'owner': owner_login, - 'matched_org': org_name + 'matched_org': org_name, } - + return None - - def _check_core_contributors(self, repository: Repository, institution_name: str, institution_domains: List[str] = None) -> Dict: + + def _check_core_contributors( + self, + repository: Repository, + institution_name: str, + 
institution_domains: List[str] = None, + ) -> Dict: """ Analyze core contributors to determine institutional affiliation. - Core contributors are identified by their commit volume, PR activity, + Core contributors are identified by their commit volume, PR activity, and other engagement metrics. - + Returns higher confidence scores for repositories where core contributors have institutional affiliations. """ with get_db_session() as session: - from sqlalchemy import func, select, or_, desc - + from sqlalchemy import desc, func + # Get repository with eager loading repo_id = repository.id - + # First, identify core contributors by activity level # Count PRs per user try: # Get PR authors for this repository - pr_authors = session.query( - User, - func.count(PullRequest.id).label('pr_count') - ).join( - PullRequest, PullRequest.user_id == User.id - ).filter( - PullRequest.repository_id == repo_id - ).group_by( - User.id - ).order_by( - desc('pr_count') - ).limit(10).all() - + pr_authors = ( + session.query(User, func.count(PullRequest.id).label('pr_count')) + .join(PullRequest, PullRequest.user_id == User.id) + .filter(PullRequest.repository_id == repo_id) + .group_by(User.id) + .order_by(desc('pr_count')) + .limit(10) + .all() + ) + if not pr_authors: return None - + # Analyze core contributors for institutional affiliation matching_contributors = [] total_score = 0.0 - + for user, pr_count in pr_authors: # Calculate "coreness" factor - higher for more active contributors activity_level = pr_count coreness = min(1.0, activity_level / 5) # Cap at 1.0 - + contributor_evidence = {} contributor_score = 0.0 - + # Check profile data - if user.company and institution_name.lower() in user.company.lower(): + if ( + user.company + and institution_name.lower() in user.company.lower() + ): contributor_score += 0.6 - contributor_evidence["company_match"] = True - - if user.location and institution_name.lower() in user.location.lower(): + contributor_evidence['company_match'] = 
True + + if ( + user.location + and institution_name.lower() in user.location.lower() + ): contributor_score += 0.3 - contributor_evidence["location_match"] = True - + contributor_evidence['location_match'] = True + # Check email domains if available if user.email and institution_domains: - if any(domain.lower() in user.email.lower() for domain in institution_domains): + if any( + domain.lower() in user.email.lower() + for domain in institution_domains + ): contributor_score += 0.8 - contributor_evidence["email_domain_match"] = True - + contributor_evidence['email_domain_match'] = True + # If we have some evidence, consider this a matching contributor if contributor_score > 0: # Weight by coreness - core contributors count more weighted_score = contributor_score * coreness - - matching_contributors.append({ - "login": user.login, - "coreness": coreness, - "evidence": contributor_evidence, - "score": weighted_score - }) - + + matching_contributors.append( + { + 'login': user.login, + 'coreness': coreness, + 'evidence': contributor_evidence, + 'score': weighted_score, + } + ) + total_score += weighted_score - + # Return results if we found matches if matching_contributors: # Scale based on proportion of core contributors that match proportion = len(matching_contributors) / len(pr_authors) - final_score = min(0.9, (total_score / len(pr_authors)) * (1 + proportion)) - + final_score = min( + 0.9, (total_score / len(pr_authors)) * (1 + proportion) + ) + return { - "matching_core_contributors": len(matching_contributors), - "total_core_contributors": len(pr_authors), - "contributors": matching_contributors[:5], # Return top 5 for display - "score": final_score + 'matching_core_contributors': len(matching_contributors), + 'total_core_contributors': len(pr_authors), + 'contributors': matching_contributors[ + :5 + ], # Return top 5 for display + 'score': final_score, } - + except Exception as e: - logger.error(f"Error in core contributor analysis: {e}") + 
logger.error(f'Error in core contributor analysis: {e}') return None - + return None - - def _check_email_domains(self, repository: Repository, institution_domains: List[str]) -> Dict: + + def _check_email_domains( + self, repository: Repository, institution_domains: List[str] + ) -> Dict: """Check email domains of contributors for matches with institution domains.""" if not institution_domains: return None - + with get_db_session() as session: # Get all contributors with email information - from sqlalchemy import or_, select - + # Create subqueries properly try: # Use a simpler approach that's less likely to cause errors - pr_users = session.query(User).join( - PullRequest, PullRequest.user_id == User.id - ).filter( - PullRequest.repository_id == repository.id, - User.email.isnot(None) - ).all() - - issue_users = session.query(User).join( - Issue, Issue.user_id == User.id - ).filter( - Issue.repository_id == repository.id, - User.email.isnot(None) - ).all() - + pr_users = ( + session.query(User) + .join(PullRequest, PullRequest.user_id == User.id) + .filter( + PullRequest.repository_id == repository.id, + User.email.isnot(None), + ) + .all() + ) + + issue_users = ( + session.query(User) + .join(Issue, Issue.user_id == User.id) + .filter( + Issue.repository_id == repository.id, User.email.isnot(None) + ) + .all() + ) + # Combine all contributors contributors = list(set(pr_users + issue_users)) - + total_contributors = len(contributors) if total_contributors == 0: return None - + # Count contributors with matching domains matching_contributors = [] for contributor in contributors: - if any(domain.lower() in contributor.email.lower() for domain in institution_domains): + if any( + domain.lower() in contributor.email.lower() + for domain in institution_domains + ): matching_contributors.append(contributor.login) - + matching_count = len(matching_contributors) if matching_count == 0: return None - + # Calculate score based on ratio and absolute numbers ratio = 
matching_count / total_contributors - + # Base score calculation if matching_count >= 5 and ratio >= 0.5: # Strong signal: 5+ contributors and 50%+ have matching domains @@ -359,51 +410,63 @@ def _check_email_domains(self, repository: Repository, institution_domains: List else: # Weaker signal score = 0.3 + (ratio * 0.3) - + return { 'matching_count': matching_count, 'total_contributors': total_contributors, 'ratio': ratio, 'matching_examples': matching_contributors[:5], - 'score': score + 'score': score, } except Exception as e: - logger.error(f"Error in email domain check: {e}") + logger.error(f'Error in email domain check: {e}') return None - - def _check_openalex_affiliations(self, repository: Repository, institution_name: str) -> Dict: + + def _check_openalex_affiliations( + self, repository: Repository, institution_name: str + ) -> Dict: """Check OpenAlex data for authors affiliated with the institution.""" with get_db_session() as session: # Don't rely on lazy loading - get repository with dois explicitly - repo = session.query(Repository).options( - joinedload(Repository.dois) - ).filter(Repository.id == repository.id).first() - + repo = ( + session.query(Repository) + .options(joinedload(Repository.dois)) + .filter(Repository.id == repository.id) + .first() + ) + if not repo or not repo.dois: return None - + # Get DOIs for this repository doi_strings = [doi.doi for doi in repo.dois] - + # Find OpenAlex works with these DOIs - works = session.query(OpenAlexWork).options( - joinedload(OpenAlexWork.authors).joinedload(OpenAlexAuthor.institutions) - ).filter(OpenAlexWork.doi.in_(doi_strings)).all() - + works = ( + session.query(OpenAlexWork) + .options( + joinedload(OpenAlexWork.authors).joinedload( + OpenAlexAuthor.institutions + ) + ) + .filter(OpenAlexWork.doi.in_(doi_strings)) + .all() + ) + if not works: return None - + total_works = len(works) matching_works = 0 matching_authors = set() - + for work in works: work_matches = False - + # Check all authors 
of this work for author in work.authors: author_matches = False - + # Check all institutions this author is affiliated with for institution in author.institutions: if institution_name.lower() in institution.display_name.lower(): @@ -411,19 +474,19 @@ def _check_openalex_affiliations(self, repository: Repository, institution_name: work_matches = True matching_authors.add(author.display_name) break - + if author_matches: break - + if work_matches: matching_works += 1 - + if matching_works == 0: return None - + # Calculate score based on ratio and absolute numbers ratio = matching_works / total_works - + # Base score calculation if matching_works >= 2 and ratio == 1.0: # All works have institution affiliation and we have 2+ works @@ -434,72 +497,74 @@ def _check_openalex_affiliations(self, repository: Repository, institution_name: else: # Some works have institution affiliation score = 0.5 + (ratio * 0.2) - + return { 'matching_works': matching_works, 'total_works': total_works, 'ratio': ratio, 'matching_authors': list(matching_authors)[:5], - 'score': score + 'score': score, } - - def _check_naming_references(self, repository: Repository, institution_name: str) -> Dict: + + def _check_naming_references( + self, repository: Repository, institution_name: str + ) -> Dict: """Check if repository name, description, or README mentions the institution.""" evidence = {} total_score = 0.0 - + # Check repository name (higher confidence) if repository.name and institution_name.lower() in repository.name.lower(): name_score = 0.5 total_score += name_score - evidence['name_match'] = { - 'text': repository.name, - 'score': name_score - } - + evidence['name_match'] = {'text': repository.name, 'score': name_score} + # Check repository full name (could include organization) - elif repository.full_name and institution_name.lower() in repository.full_name.lower(): + elif ( + repository.full_name + and institution_name.lower() in repository.full_name.lower() + ): fullname_score = 0.4 
total_score += fullname_score evidence['fullname_match'] = { 'text': repository.full_name, - 'score': fullname_score + 'score': fullname_score, } - + # Check repository description - if repository.description and institution_name.lower() in repository.description.lower(): + if ( + repository.description + and institution_name.lower() in repository.description.lower() + ): desc_score = 0.3 total_score += desc_score - evidence['description_match'] = { - 'score': desc_score - } - + evidence['description_match'] = {'score': desc_score} + # Cap at 0.6 for naming references final_score = min(0.6, total_score) - + if final_score > 0: evidence['score'] = final_score return evidence - + return None - - def _check_topic_matches(self, repository: Repository, institution_name: str) -> Dict: + + def _check_topic_matches( + self, repository: Repository, institution_name: str + ) -> Dict: """Check for topic matches and other indirect references.""" if not repository.topics: return None - + topics = repository.topics.split(',') matching_topics = [] - + for topic in topics: if institution_name.lower() in topic.lower(): matching_topics.append(topic) - + if matching_topics: score = min(0.3, 0.1 + (len(matching_topics) * 0.05)) - return { - 'matching_topics': matching_topics, - 'score': score - } - - return None \ No newline at end of file + return {'matching_topics': matching_topics, 'score': score} + + return None diff --git a/Older Experiments/scrappy-proof-of-concept/services/acf_framework.py b/Older Experiments/scrappy-proof-of-concept/services/acf_framework.py index 97f4766..8ca2a05 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/acf_framework.py +++ b/Older Experiments/scrappy-proof-of-concept/services/acf_framework.py @@ -9,43 +9,47 @@ import json import logging -import re -from typing import List, Dict, Tuple, Any +from typing import Any, Dict, List, Tuple -from sqlalchemy.orm import joinedload from db.database import get_db_session -from models.models import 
Repository, DiscoveryEvent +from models.models import DiscoveryEvent, Repository from services.acf_base import AssociationConfidenceFilter +from sqlalchemy.orm import joinedload logger = logging.getLogger(__name__) # Import filter classes after importing the base class from services.acf_filters.comprehensive_filter import ComprehensiveFilter + class NameMatchFilter(AssociationConfidenceFilter): """Filter that checks if repository name, description, or README mentions the institution.""" - + @property def name(self) -> str: - return "Name Match Filter" - + return 'Name Match Filter' + @property def description(self) -> str: - return ("Checks if the repository name, description, or README mentions the institution name. " - "Higher confidence if the match is in the name or owner.") - - def calculate_confidence(self, repository: Repository, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + return ( + 'Checks if the repository name, description, or README mentions the institution name. ' + 'Higher confidence if the match is in the name or owner.' 
+ ) + + def calculate_confidence( + self, repository: Repository, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: institution_name = institution_info.get('name', '') if not institution_name: return 0.0, {} - + evidence = {} total_score = 0.0 - + # Check owner (organization or user) with get_db_session() as session: - from models.models import User, Organization - + from models.models import Organization, User + owner = None org = session.query(Organization).filter_by(id=repository.owner_id).first() if org: @@ -56,33 +60,27 @@ def calculate_confidence(self, repository: Repository, institution_info: Dict[st if user: owner = user evidence['owner_type'] = 'User' - + if owner and institution_name.lower() in owner.login.lower(): score = 0.9 total_score += score - evidence['owner_name_match'] = { - 'match': owner.login, - 'score': score - } - + evidence['owner_name_match'] = {'match': owner.login, 'score': score} + # Check repository name if repository.name and institution_name.lower() in repository.name.lower(): score = 0.7 total_score += score - evidence['repo_name_match'] = { - 'match': repository.name, - 'score': score - } - + evidence['repo_name_match'] = {'match': repository.name, 'score': score} + # Check repository description - if repository.description and institution_name.lower() in repository.description.lower(): + if ( + repository.description + and institution_name.lower() in repository.description.lower() + ): score = 0.3 total_score += score - evidence['description_match'] = { - 'match': True, - 'score': score - } - + evidence['description_match'] = {'match': True, 'score': score} + # Check repository topics if repository.topics: topics_list = repository.topics.split(',') @@ -90,85 +88,95 @@ def calculate_confidence(self, repository: Repository, institution_info: Dict[st if institution_name.lower() in topic.lower(): score = 0.2 total_score += score - evidence['topic_match'] = { - 'match': topic, - 'score': score - } + evidence['topic_match'] = 
{'match': topic, 'score': score} break - + # Cap the total score at 1.0 final_score = min(1.0, total_score) - + return final_score, evidence class EmailDomainFilter(AssociationConfidenceFilter): """Filter that checks the email domains of contributors against institution domains.""" - + @property def name(self) -> str: - return "Email Domain Filter" - + return 'Email Domain Filter' + @property def description(self) -> str: - return ("Analyzes contributor email addresses to identify institutional domains. " - "Higher confidence with more contributors having matching domains.") - - def calculate_confidence(self, repository: Repository, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + return ( + 'Analyzes contributor email addresses to identify institutional domains. ' + 'Higher confidence with more contributors having matching domains.' + ) + + def calculate_confidence( + self, repository: Repository, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: domains = institution_info.get('domains', []) if not domains: return 0.0, {} - + evidence = {} - + with get_db_session() as session: # Get all contributors with email information - from models.models import User, PullRequest, Issue, IssueComment + from models.models import Issue, IssueComment, PullRequest, User from sqlalchemy import or_ - - contributors_query = ( - session.query(User) - .filter(User.email.isnot(None)) - ) - + + contributors_query = session.query(User).filter(User.email.isnot(None)) + # Find users with PRs, issues, or comments on this repo - pr_users = session.query(User.id).join(PullRequest, PullRequest.user_id == User.id).filter( - PullRequest.repository_id == repository.id - ).subquery() - - issue_users = session.query(User.id).join(Issue, Issue.user_id == User.id).filter( - Issue.repository_id == repository.id - ).subquery() - - comment_users = session.query(User.id).join(IssueComment, IssueComment.user_id == User.id).join( - Issue, IssueComment.issue_id == Issue.id - 
).filter(Issue.repository_id == repository.id).subquery() - + pr_users = ( + session.query(User.id) + .join(PullRequest, PullRequest.user_id == User.id) + .filter(PullRequest.repository_id == repository.id) + .subquery() + ) + + issue_users = ( + session.query(User.id) + .join(Issue, Issue.user_id == User.id) + .filter(Issue.repository_id == repository.id) + .subquery() + ) + + comment_users = ( + session.query(User.id) + .join(IssueComment, IssueComment.user_id == User.id) + .join(Issue, IssueComment.issue_id == Issue.id) + .filter(Issue.repository_id == repository.id) + .subquery() + ) + contributors = contributors_query.filter( or_( User.id.in_(pr_users), User.id.in_(issue_users), - User.id.in_(comment_users) + User.id.in_(comment_users), ) ).all() - + total_contributors = len(contributors) if total_contributors == 0: return 0.0, {} - + # Count contributors with matching domains matching_contributors = [] for contributor in contributors: - if any(domain.lower() in contributor.email.lower() for domain in domains): + if any( + domain.lower() in contributor.email.lower() for domain in domains + ): matching_contributors.append(contributor.login) - + matching_count = len(matching_contributors) if matching_count == 0: return 0.0, {} - + # Calculate score based on ratio of matching contributors ratio = matching_count / total_contributors - + # Adjust score based on total contributors if total_contributors >= 10: # More contributors = more confidence in the ratio @@ -177,62 +185,73 @@ def calculate_confidence(self, repository: Repository, institution_info: Dict[st base_score = ratio * 0.9 else: base_score = ratio * 0.8 - + # Higher absolute number of matching contributors increases confidence if matching_count >= 5: # Scale up to 0.95 max final_score = min(0.95, base_score * 1.2) else: final_score = base_score - + evidence = { 'matching_contributors': matching_count, 'total_contributors': total_contributors, 'matching_ratio': ratio, - 'matching_logins': 
matching_contributors[:5] # Include first 5 for display + 'matching_logins': matching_contributors[ + :5 + ], # Include first 5 for display } - + return final_score, evidence class OpenAlexAffiliationFilter(AssociationConfidenceFilter): """Filter that uses OpenAlex data to check for institution affiliations.""" - + @property def name(self) -> str: - return "OpenAlex Affiliation Filter" - + return 'OpenAlex Affiliation Filter' + @property def description(self) -> str: - return ("Uses OpenAlex data to identify repositories linked to papers with authors " - "affiliated with the institution.") - - def calculate_confidence(self, repository: Repository, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + return ( + 'Uses OpenAlex data to identify repositories linked to papers with authors ' + 'affiliated with the institution.' + ) + + def calculate_confidence( + self, repository: Repository, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: institution_name = institution_info.get('name', '') if not institution_name or not repository.dois: return 0.0, {} - + evidence = {} - + with get_db_session() as session: # Get DOIs for this repository doi_strings = [doi.doi for doi in repository.dois] - + # Find OpenAlex works with these DOIs from models.models import OpenAlexWork - works = session.query(OpenAlexWork).filter(OpenAlexWork.doi.in_(doi_strings)).all() - + + works = ( + session.query(OpenAlexWork) + .filter(OpenAlexWork.doi.in_(doi_strings)) + .all() + ) + if not works: return 0.0, {} - + total_works = len(works) matching_works = 0 matching_details = [] - + for work in works: work_matches = False work_authors = [] - + # Check all authors of this work for author in work.authors: author_matches = False @@ -243,108 +262,113 @@ def calculate_confidence(self, repository: Repository, institution_info: Dict[st work_matches = True work_authors.append(author.display_name) break - + if author_matches: break - + if work_matches: matching_works += 1 - 
matching_details.append({ - 'title': work.title, - 'doi': work.doi, - 'authors': work_authors[:3] # First 3 matching authors - }) - + matching_details.append( + { + 'title': work.title, + 'doi': work.doi, + 'authors': work_authors[:3], # First 3 matching authors + } + ) + if matching_works == 0: return 0.0, {} - + # Calculate score based on ratio of matching works ratio = matching_works / total_works - + # Adjust score based on number of works if total_works >= 3: # More works = more confidence base_score = ratio else: base_score = ratio * 0.8 - + # Cap at 0.95 final_score = min(0.95, base_score) - + evidence = { 'matching_works': matching_works, 'total_works': total_works, 'matching_ratio': ratio, - 'work_details': matching_details[:3] # Include first 3 for display + 'work_details': matching_details[:3], # Include first 3 for display } - + return final_score, evidence class CombinedFilter(AssociationConfidenceFilter): """Filter that combines multiple methods for a comprehensive score.""" - + @property def name(self) -> str: - return "Combined Filter" - + return 'Combined Filter' + @property def description(self) -> str: - return ("Combines multiple filtering methods: name matching, email domains, " - "and OpenAlex affiliations for a comprehensive score.") - - def calculate_confidence(self, repository: Repository, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: - filters = [ - NameMatchFilter(), - EmailDomainFilter(), - OpenAlexAffiliationFilter() - ] - + return ( + 'Combines multiple filtering methods: name matching, email domains, ' + 'and OpenAlex affiliations for a comprehensive score.' 
+ ) + + def calculate_confidence( + self, repository: Repository, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: + filters = [NameMatchFilter(), EmailDomainFilter(), OpenAlexAffiliationFilter()] + scores = [] evidence = {} - + for filter_obj in filters: - score, filter_evidence = filter_obj.calculate_confidence(repository, institution_info) + score, filter_evidence = filter_obj.calculate_confidence( + repository, institution_info + ) if score > 0: filter_name = filter_obj.name scores.append((filter_name, score)) evidence[filter_name] = filter_evidence - + if not scores: return 0.0, {} - + # Calculate weighted combined score # Weight OpenAlex higher than email domains, which are weighted higher than name matching weights = { - "Name Match Filter": 0.3, - "Email Domain Filter": 0.35, - "OpenAlex Affiliation Filter": 0.45 + 'Name Match Filter': 0.3, + 'Email Domain Filter': 0.35, + 'OpenAlex Affiliation Filter': 0.45, } - + weighted_sum = 0 weight_total = 0 - + for filter_name, score in scores: weight = weights.get(filter_name, 0.3) weighted_sum += score * weight weight_total += weight - + if weight_total == 0: return 0.0, {} - + # Normalize the final score final_score = min(1.0, weighted_sum / weight_total) - + # Add individual scores to evidence - evidence["component_scores"] = {name: score for name, score in scores} - evidence["final_score"] = final_score - + evidence['component_scores'] = {name: score for name, score in scores} + evidence['final_score'] = final_score + return final_score, evidence + def get_available_filters() -> Dict[str, AssociationConfidenceFilter]: """Return a dictionary of all available ACF implementations.""" filters = {} - + # Add all filter implementations for filter_class in [ NameMatchFilter, @@ -355,101 +379,116 @@ def get_available_filters() -> Dict[str, AssociationConfidenceFilter]: ]: filter_instance = filter_class() filters[filter_instance.name] = filter_instance - + return filters + def get_filter_by_name(name: str) -> 
AssociationConfidenceFilter: """Get a specific filter by name.""" filters = get_available_filters() return filters.get(name) + def find_keyword_matches(keywords: List[str]) -> Dict[str, Dict]: """ Find which keywords from the provided list have been used in discovery events. - + Args: keywords: List of keywords to check - + Returns: Dictionary mapping each found keyword to its discovery statistics """ results = {} - + with get_db_session() as session: for keyword in keywords: # Find discovery events that used this keyword - events = session.query(DiscoveryEvent).filter( - DiscoveryEvent.keyword == keyword - ).all() - + events = ( + session.query(DiscoveryEvent) + .filter(DiscoveryEvent.keyword == keyword) + .all() + ) + if events: # Get list of unique repository IDs discovered with this keyword repo_event_ids = [ - event.object_id for event in events + event.object_id + for event in events if event.object_type == 'Repository' ] - + # Get the most recent discovery date latest_event = max(events, key=lambda e: e.timestamp) - + results[keyword] = { 'last_run': latest_event.timestamp, 'repository_count': len(set(repo_event_ids)), - 'repository_ids': list(set(repo_event_ids)) + 'repository_ids': list(set(repo_event_ids)), } - + return results + def get_repositories_from_keywords(keywords: List[str]) -> List[Repository]: """ Get all repositories that were discovered using any of the provided keywords. 
- + Args: keywords: List of keywords to check - + Returns: List of Repository objects """ repo_ids = set() - + with get_db_session() as session: for keyword in keywords: # Find discovery events for this keyword - events = session.query(DiscoveryEvent).filter( - DiscoveryEvent.keyword == keyword, - DiscoveryEvent.object_type == 'Repository' - ).all() - + events = ( + session.query(DiscoveryEvent) + .filter( + DiscoveryEvent.keyword == keyword, + DiscoveryEvent.object_type == 'Repository', + ) + .all() + ) + # Add repository IDs to the set for event in events: repo_ids.add(event.object_id) - + if not repo_ids: return [] - + # Get the actual Repository objects with eager loading of dois relationship - repositories = session.query(Repository).options( - joinedload(Repository.dois) - ).filter( - Repository.id.in_(list(repo_ids)) - ).all() - + repositories = ( + session.query(Repository) + .options(joinedload(Repository.dois)) + .filter(Repository.id.in_(list(repo_ids))) + .all() + ) + return repositories -def apply_filter(filter_name: str, repositories: List[Repository], - institution_info: Dict[str, Any], - store_results: bool = True, - keywords: List[str] = None) -> List[Tuple[Repository, float, Dict]]: + +def apply_filter( + filter_name: str, + repositories: List[Repository], + institution_info: Dict[str, Any], + store_results: bool = True, + keywords: List[str] = None, +) -> List[Tuple[Repository, float, Dict]]: """ Apply a specific ACF to a list of repositories. 
- + Args: filter_name: Name of the filter to apply repositories: List of Repository objects to filter institution_info: Dictionary with institution information store_results: Whether to store the analysis results in the database keywords: List of keywords that led to these repositories - + Returns: List of tuples (repository, confidence_score, evidence_dict) sorted by confidence score (highest first) @@ -457,7 +496,7 @@ def apply_filter(filter_name: str, repositories: List[Repository], filter_instance = get_filter_by_name(filter_name) if not filter_instance: raise ValueError(f"Filter '{filter_name}' not found") - + # Use a session context for calculating confidence scores results = [] with get_db_session() as session: @@ -465,35 +504,43 @@ def apply_filter(filter_name: str, repositories: List[Repository], repo_ids = [repo.id for repo in repositories] if not repo_ids: return [] - - fresh_repos = session.query(Repository).options( - joinedload(Repository.dois) - ).filter( - Repository.id.in_(repo_ids) - ).all() - + + fresh_repos = ( + session.query(Repository) + .options(joinedload(Repository.dois)) + .filter(Repository.id.in_(repo_ids)) + .all() + ) + for repo in fresh_repos: - confidence, evidence = filter_instance.calculate_confidence(repo, institution_info) + confidence, evidence = filter_instance.calculate_confidence( + repo, institution_info + ) if confidence > 0: results.append((repo, confidence, evidence)) - + # Sort by confidence score (highest first) sorted_results = sorted(results, key=lambda x: x[1], reverse=True) - + # Store the analysis results if requested if store_results: - store_analysis_results(repositories, filter_name, institution_info, results, keywords) - + store_analysis_results( + repositories, filter_name, institution_info, results, keywords + ) + return sorted_results -def store_analysis_results(repositories: List[Repository], - filter_name: str, - institution_info: Dict[str, Any], - results: List[Tuple[Repository, float, Dict]], - 
keywords: List[str] = None): + +def store_analysis_results( + repositories: List[Repository], + filter_name: str, + institution_info: Dict[str, Any], + results: List[Tuple[Repository, float, Dict]], + keywords: List[str] = None, +): """ Store repository-institution confidence analysis results in the database. - + Args: repositories: List of all repositories that were analyzed filter_name: Name of the filter that was applied @@ -502,13 +549,13 @@ def store_analysis_results(repositories: List[Repository], keywords: List of keywords that led to these repositories """ from models.models import RepositoryInstitutionAnalysis - + institution_name = institution_info.get('name', 'Unknown Institution') - keywords_str = ",".join(keywords) if keywords else None - + keywords_str = ','.join(keywords) if keywords else None + # Create a dictionary for quick lookup of results result_dict = {repo.id: (score, evidence) for repo, score, evidence in results} - + with get_db_session() as session: # Process each repository that was analyzed for repo in repositories: @@ -518,7 +565,7 @@ def store_analysis_results(repositories: List[Repository], else: # For repositories that didn't meet the threshold, store a 0 score score, evidence = 0.0, {} - + # Create a new analysis record analysis = RepositoryInstitutionAnalysis( repository_id=repo.id, @@ -526,14 +573,16 @@ def store_analysis_results(repositories: List[Repository], filter_name=filter_name, confidence_score=score, evidence=json.dumps(evidence) if evidence else None, - keywords_used=keywords_str + keywords_used=keywords_str, ) - + session.add(analysis) - + session.commit() - - logger.info(f"Stored analysis results for {len(repositories)} repositories against {institution_name}") + + logger.info( + f'Stored analysis results for {len(repositories)} repositories against {institution_name}' + ) # Sort by confidence score (highest first) - return sorted(results, key=lambda x: x[1], reverse=True) \ No newline at end of file + return 
sorted(results, key=lambda x: x[1], reverse=True) diff --git a/Older Experiments/scrappy-proof-of-concept/services/discovery.py b/Older Experiments/scrappy-proof-of-concept/services/discovery.py index 2b2e5a3..24000c9 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/discovery.py +++ b/Older Experiments/scrappy-proof-of-concept/services/discovery.py @@ -1,48 +1,61 @@ # services/discovery.py -import uuid import logging +import uuid from datetime import datetime, timezone + from db.database import get_db_session from models.models import DiscoveryEvent logger = logging.getLogger(__name__) + def start_new_chain(): """ Start a new discovery chain by generating a new UUID. Returns the new chain id. """ new_chain_id = str(uuid.uuid4()) - logger.info(f"Started new discovery chain: {new_chain_id}") + logger.info(f'Started new discovery chain: {new_chain_id}') return new_chain_id -def record_discovery(record, method, details, trigger_input=None, keyword=None, chain_id=None, branch_id=None, step=1): + +def record_discovery( + record, + method, + details, + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Record a discovery event into the audit table using an explicit step number. This function adds a DiscoveryEvent to the session for the given record. 
""" from sqlalchemy.orm import object_session + session = object_session(record) if session is None: session = get_db_session().__enter__() - + ingestion_type = None if trigger_input: - ingestion_type = "keyword ingestion" if keyword else "direct ingestion" + ingestion_type = 'keyword ingestion' if keyword else 'direct ingestion' object_type = record.__class__.__name__ - object_id = getattr(record, "id", None) - if object_id is None and hasattr(record, "sha"): + object_id = getattr(record, 'id', None) + if object_id is None and hasattr(record, 'sha'): object_id = record.sha if object_id is None: - object_id = "unknown" + object_id = 'unknown' if branch_id is None: branch_id = str(uuid.uuid4()) - + if chain_id is None: - chain_id = "unknown" - + chain_id = 'unknown' + event = DiscoveryEvent( chain_id=chain_id, branch_id=branch_id, @@ -51,12 +64,12 @@ def record_discovery(record, method, details, trigger_input=None, keyword=None, details=details, timestamp=datetime.now(timezone.utc), ingestion_type=ingestion_type, - url=trigger_input if ingestion_type == "direct ingestion" else None, - keyword=keyword if ingestion_type == "keyword ingestion" else None, + url=trigger_input if ingestion_type == 'direct ingestion' else None, + keyword=keyword if ingestion_type == 'keyword ingestion' else None, object_type=object_type, - object_id=str(object_id) + object_id=str(object_id), ) - + session.add(event) - logger.info(f"Queued discovery event: {event}") - # Do not commit here; rely on the outer session \ No newline at end of file + logger.info(f'Queued discovery event: {event}') + # Do not commit here; rely on the outer session diff --git a/Older Experiments/scrappy-proof-of-concept/services/entity_service.py b/Older Experiments/scrappy-proof-of-concept/services/entity_service.py index e52aba6..151809c 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/entity_service.py +++ b/Older Experiments/scrappy-proof-of-concept/services/entity_service.py @@ -1,292 +1,387 @@ 
# services/entity_service.py import logging -from datetime import datetime, timezone -from utils.common import save_json_field, parse_datetime, get_current_time -from models.models import User, Organization, Repository, DOI +from models.models import DOI, Organization, Repository, User from services.discovery import record_discovery +from utils.common import get_current_time, parse_datetime, save_json_field logger = logging.getLogger(__name__) -def update_or_create_user(session, client, user_data, discovery_method="direct_ingestion", - discovery_details=None, trigger_input=None, keyword=None, - chain_id=None, branch_id=None, step=1): + +def update_or_create_user( + session, + client, + user_data, + discovery_method='direct_ingestion', + discovery_details=None, + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Create or update a User record. """ if not user_data: - logger.warning("No user data provided; skipping user creation.") + logger.warning('No user data provided; skipping user creation.') return None - + login = user_data.get('login', 'Unknown') if discovery_details is None: discovery_details = f"User '{login}' discovered during repository ingestion." 
- - logger.info(f"Updating or creating user: {login}") - user = session.query(User).filter_by(id=user_data["id"]).first() + + logger.info(f'Updating or creating user: {login}') + user = session.query(User).filter_by(id=user_data['id']).first() detailed_data = client.get_user(login) - + if detailed_data: if user: - user.login = detailed_data.get("login") - user.name = detailed_data.get("name") - user.bio = detailed_data.get("bio") - user.avatar_url = detailed_data.get("avatar_url") - user.html_url = detailed_data.get("html_url") - user.type = detailed_data.get("type", "User") - user.site_admin = detailed_data.get("site_admin", False) - user.created_at = parse_datetime(detailed_data.get("created_at")) - user.updated_at = parse_datetime(detailed_data.get("updated_at")) - user.public_repos = detailed_data.get("public_repos") - user.public_gists = detailed_data.get("public_gists") - user.followers = detailed_data.get("followers") - user.following = detailed_data.get("following") - user.email = detailed_data.get("email") - user.blog = detailed_data.get("blog") - user.company = detailed_data.get("company") - user.location = detailed_data.get("location") - user.twitter_username = detailed_data.get("twitter_username") + user.login = detailed_data.get('login') + user.name = detailed_data.get('name') + user.bio = detailed_data.get('bio') + user.avatar_url = detailed_data.get('avatar_url') + user.html_url = detailed_data.get('html_url') + user.type = detailed_data.get('type', 'User') + user.site_admin = detailed_data.get('site_admin', False) + user.created_at = parse_datetime(detailed_data.get('created_at')) + user.updated_at = parse_datetime(detailed_data.get('updated_at')) + user.public_repos = detailed_data.get('public_repos') + user.public_gists = detailed_data.get('public_gists') + user.followers = detailed_data.get('followers') + user.following = detailed_data.get('following') + user.email = detailed_data.get('email') + user.blog = detailed_data.get('blog') + 
user.company = detailed_data.get('company') + user.location = detailed_data.get('location') + user.twitter_username = detailed_data.get('twitter_username') user.raw_data = save_json_field(detailed_data) user.ingested_at = get_current_time() else: user = User( - id=detailed_data["id"], - login=detailed_data["login"], - name=detailed_data.get("name"), - bio=detailed_data.get("bio"), - avatar_url=detailed_data.get("avatar_url"), - html_url=detailed_data.get("html_url"), - type=detailed_data.get("type", "User"), - site_admin=detailed_data.get("site_admin", False), - created_at=parse_datetime(detailed_data.get("created_at")), - updated_at=parse_datetime(detailed_data.get("updated_at")), - public_repos=detailed_data.get("public_repos"), - public_gists=detailed_data.get("public_gists"), - followers=detailed_data.get("followers"), - following=detailed_data.get("following"), - email=detailed_data.get("email"), - blog=detailed_data.get("blog"), - company=detailed_data.get("company"), - location=detailed_data.get("location"), - twitter_username=detailed_data.get("twitter_username"), - raw_data=save_json_field(detailed_data) + id=detailed_data['id'], + login=detailed_data['login'], + name=detailed_data.get('name'), + bio=detailed_data.get('bio'), + avatar_url=detailed_data.get('avatar_url'), + html_url=detailed_data.get('html_url'), + type=detailed_data.get('type', 'User'), + site_admin=detailed_data.get('site_admin', False), + created_at=parse_datetime(detailed_data.get('created_at')), + updated_at=parse_datetime(detailed_data.get('updated_at')), + public_repos=detailed_data.get('public_repos'), + public_gists=detailed_data.get('public_gists'), + followers=detailed_data.get('followers'), + following=detailed_data.get('following'), + email=detailed_data.get('email'), + blog=detailed_data.get('blog'), + company=detailed_data.get('company'), + location=detailed_data.get('location'), + twitter_username=detailed_data.get('twitter_username'), + 
raw_data=save_json_field(detailed_data), ) user.ingested_at = get_current_time() session.add(user) session.commit() - record_discovery(user, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - chain_id=chain_id, branch_id=branch_id, step=step) + record_discovery( + user, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, + ) return user - + if not user: user = User( - id=user_data["id"], - login=user_data["login"], - raw_data=save_json_field(user_data) + id=user_data['id'], + login=user_data['login'], + raw_data=save_json_field(user_data), ) user.ingested_at = get_current_time() session.add(user) session.commit() - record_discovery(user, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - chain_id=chain_id, branch_id=branch_id, step=step) + record_discovery( + user, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, + ) return user -def update_or_create_org(session, client, org_data, discovery_method="direct_ingestion", - discovery_details="Organization discovered during repository ingestion", - trigger_input=None, keyword=None, chain_id=None, branch_id=None, step=1): + +def update_or_create_org( + session, + client, + org_data, + discovery_method='direct_ingestion', + discovery_details='Organization discovered during repository ingestion', + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Create or update an Organization record. 
""" - logger.info(f"Updating or creating organization: {org_data['login']}") - org = session.query(Organization).filter_by(id=org_data["id"]).first() - detailed_data = client.get_organization(org_data["login"]) - + logger.info(f'Updating or creating organization: {org_data["login"]}') + org = session.query(Organization).filter_by(id=org_data['id']).first() + detailed_data = client.get_organization(org_data['login']) + if detailed_data: if org: - org.login = detailed_data.get("login") - org.name = detailed_data.get("name") - org.description = detailed_data.get("description") + org.login = detailed_data.get('login') + org.name = detailed_data.get('name') + org.description = detailed_data.get('description') org.raw_data = save_json_field(detailed_data) org.ingested_at = get_current_time() else: org = Organization( - id=detailed_data["id"], - login=detailed_data.get("login"), - name=detailed_data.get("name"), - description=detailed_data.get("description"), - raw_data=save_json_field(detailed_data) + id=detailed_data['id'], + login=detailed_data.get('login'), + name=detailed_data.get('name'), + description=detailed_data.get('description'), + raw_data=save_json_field(detailed_data), ) org.ingested_at = get_current_time() session.add(org) session.commit() - record_discovery(org, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - chain_id=chain_id, branch_id=branch_id, step=step) + record_discovery( + org, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, + ) return org - + if not org: org = Organization( - id=org_data["id"], - login=org_data["login"], - raw_data=save_json_field(org_data) + id=org_data['id'], + login=org_data['login'], + raw_data=save_json_field(org_data), ) org.ingested_at = get_current_time() session.add(org) session.commit() - record_discovery(org, discovery_method, discovery_details, - trigger_input=trigger_input, 
keyword=keyword, - chain_id=chain_id, branch_id=branch_id, step=step) + record_discovery( + org, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, + ) return org -def update_or_create_repository(session, client, repo_data, discovery_method="direct_ingestion", - discovery_details=None, trigger_input=None, keyword=None, - chain_id=None, branch_id=None, step=1): + +def update_or_create_repository( + session, + client, + repo_data, + discovery_method='direct_ingestion', + discovery_details=None, + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Create or update a Repository record. """ - repo_id = repo_data["id"] + repo_id = repo_data['id'] full_name = repo_data.get('full_name') - logger.info(f"Updating or creating repository id={repo_id}, full_name={full_name}") - + logger.info(f'Updating or creating repository id={repo_id}, full_name={full_name}') + if discovery_details is None: - discovery_details = f"Repository {full_name} discovered during ingestion" - - topics = ",".join(repo_data.get("topics", [])) + discovery_details = f'Repository {full_name} discovered during ingestion' + + topics = ','.join(repo_data.get('topics', [])) repository = session.query(Repository).filter_by(id=repo_id).first() - + if repository: - repository.name = repo_data.get("name") - repository.full_name = repo_data.get("full_name") - repository.owner_id = repo_data["owner"]["id"] - repository.private = repo_data.get("private", False) - repository.description = repo_data.get("description") - repository.homepage = repo_data.get("homepage") - repository.language = repo_data.get("language") + repository.name = repo_data.get('name') + repository.full_name = repo_data.get('full_name') + repository.owner_id = repo_data['owner']['id'] + repository.private = repo_data.get('private', False) + repository.description = repo_data.get('description') + 
repository.homepage = repo_data.get('homepage') + repository.language = repo_data.get('language') repository.topics = topics - repository.license = save_json_field(repo_data.get("license")) - repository.visibility = repo_data.get("visibility") - repository.default_branch = repo_data.get("default_branch") - repository.archived = repo_data.get("archived", False) - repository.disabled = repo_data.get("disabled", False) - repository.fork = repo_data.get("fork", False) - repository.forks_count = repo_data.get("forks_count") - repository.network_count = repo_data.get("network_count") - repository.watchers_count = repo_data.get("watchers_count") - repository.stargazers_count = repo_data.get("stargazers_count") - repository.subscribers_count = repo_data.get("subscribers_count") - repository.html_url = repo_data.get("html_url") - repository.clone_url = repo_data.get("clone_url") - repository.ssh_url = repo_data.get("ssh_url") - repository.svn_url = repo_data.get("svn_url") - repository.git_url = repo_data.get("git_url") - repository.mirror_url = repo_data.get("mirror_url") - repository.issues_url = repo_data.get("issues_url") - repository.pulls_url = repo_data.get("pulls_url") - repository.commits_url = repo_data.get("commits_url") - repository.branches_url = repo_data.get("branches_url") - repository.tags_url = repo_data.get("tags_url") - repository.contributors_url = repo_data.get("contributors_url") - repository.collaborators_url = repo_data.get("collaborators_url") - repository.downloads_url = repo_data.get("downloads_url") - repository.size = repo_data.get("size") - repository.open_issues_count = repo_data.get("open_issues_count") - repository.has_issues = repo_data.get("has_issues", False) - repository.has_wiki = repo_data.get("has_wiki", False) - repository.has_downloads = repo_data.get("has_downloads", False) - repository.has_projects = repo_data.get("has_projects", False) - repository.has_pages = repo_data.get("has_pages", False) - repository.is_template = 
repo_data.get("is_template", False) + repository.license = save_json_field(repo_data.get('license')) + repository.visibility = repo_data.get('visibility') + repository.default_branch = repo_data.get('default_branch') + repository.archived = repo_data.get('archived', False) + repository.disabled = repo_data.get('disabled', False) + repository.fork = repo_data.get('fork', False) + repository.forks_count = repo_data.get('forks_count') + repository.network_count = repo_data.get('network_count') + repository.watchers_count = repo_data.get('watchers_count') + repository.stargazers_count = repo_data.get('stargazers_count') + repository.subscribers_count = repo_data.get('subscribers_count') + repository.html_url = repo_data.get('html_url') + repository.clone_url = repo_data.get('clone_url') + repository.ssh_url = repo_data.get('ssh_url') + repository.svn_url = repo_data.get('svn_url') + repository.git_url = repo_data.get('git_url') + repository.mirror_url = repo_data.get('mirror_url') + repository.issues_url = repo_data.get('issues_url') + repository.pulls_url = repo_data.get('pulls_url') + repository.commits_url = repo_data.get('commits_url') + repository.branches_url = repo_data.get('branches_url') + repository.tags_url = repo_data.get('tags_url') + repository.contributors_url = repo_data.get('contributors_url') + repository.collaborators_url = repo_data.get('collaborators_url') + repository.downloads_url = repo_data.get('downloads_url') + repository.size = repo_data.get('size') + repository.open_issues_count = repo_data.get('open_issues_count') + repository.has_issues = repo_data.get('has_issues', False) + repository.has_wiki = repo_data.get('has_wiki', False) + repository.has_downloads = repo_data.get('has_downloads', False) + repository.has_projects = repo_data.get('has_projects', False) + repository.has_pages = repo_data.get('has_pages', False) + repository.is_template = repo_data.get('is_template', False) repository.raw_data = save_json_field(repo_data) 
repository.ingested_at = get_current_time() else: repository = Repository( - id=repo_data["id"], - name=repo_data.get("name"), - full_name=repo_data.get("full_name"), - owner_id=repo_data["owner"]["id"], - private=repo_data.get("private", False), - description=repo_data.get("description"), - homepage=repo_data.get("homepage"), - language=repo_data.get("language"), + id=repo_data['id'], + name=repo_data.get('name'), + full_name=repo_data.get('full_name'), + owner_id=repo_data['owner']['id'], + private=repo_data.get('private', False), + description=repo_data.get('description'), + homepage=repo_data.get('homepage'), + language=repo_data.get('language'), topics=topics, - license=save_json_field(repo_data.get("license")), - visibility=repo_data.get("visibility"), - default_branch=repo_data.get("default_branch"), - archived=repo_data.get("archived", False), - disabled=repo_data.get("disabled", False), - fork=repo_data.get("fork", False), - forks_count=repo_data.get("forks_count"), - network_count=repo_data.get("network_count"), - watchers_count=repo_data.get("watchers_count"), - stargazers_count=repo_data.get("stargazers_count"), - subscribers_count=repo_data.get("subscribers_count"), - html_url=repo_data.get("html_url"), - clone_url=repo_data.get("clone_url"), - ssh_url=repo_data.get("ssh_url"), - svn_url=repo_data.get("svn_url"), - git_url=repo_data.get("git_url"), - mirror_url=repo_data.get("mirror_url"), - issues_url=repo_data.get("issues_url"), - pulls_url=repo_data.get("pulls_url"), - commits_url=repo_data.get("commits_url"), - branches_url=repo_data.get("branches_url"), - tags_url=repo_data.get("tags_url"), - contributors_url=repo_data.get("contributors_url"), - collaborators_url=repo_data.get("collaborators_url"), - downloads_url=repo_data.get("downloads_url"), - size=repo_data.get("size"), - open_issues_count=repo_data.get("open_issues_count"), - has_issues=repo_data.get("has_issues", False), - has_wiki=repo_data.get("has_wiki", False), - 
has_downloads=repo_data.get("has_downloads", False), - has_projects=repo_data.get("has_projects", False), - has_pages=repo_data.get("has_pages", False), - is_template=repo_data.get("is_template", False), - raw_data=save_json_field(repo_data) + license=save_json_field(repo_data.get('license')), + visibility=repo_data.get('visibility'), + default_branch=repo_data.get('default_branch'), + archived=repo_data.get('archived', False), + disabled=repo_data.get('disabled', False), + fork=repo_data.get('fork', False), + forks_count=repo_data.get('forks_count'), + network_count=repo_data.get('network_count'), + watchers_count=repo_data.get('watchers_count'), + stargazers_count=repo_data.get('stargazers_count'), + subscribers_count=repo_data.get('subscribers_count'), + html_url=repo_data.get('html_url'), + clone_url=repo_data.get('clone_url'), + ssh_url=repo_data.get('ssh_url'), + svn_url=repo_data.get('svn_url'), + git_url=repo_data.get('git_url'), + mirror_url=repo_data.get('mirror_url'), + issues_url=repo_data.get('issues_url'), + pulls_url=repo_data.get('pulls_url'), + commits_url=repo_data.get('commits_url'), + branches_url=repo_data.get('branches_url'), + tags_url=repo_data.get('tags_url'), + contributors_url=repo_data.get('contributors_url'), + collaborators_url=repo_data.get('collaborators_url'), + downloads_url=repo_data.get('downloads_url'), + size=repo_data.get('size'), + open_issues_count=repo_data.get('open_issues_count'), + has_issues=repo_data.get('has_issues', False), + has_wiki=repo_data.get('has_wiki', False), + has_downloads=repo_data.get('has_downloads', False), + has_projects=repo_data.get('has_projects', False), + has_pages=repo_data.get('has_pages', False), + is_template=repo_data.get('is_template', False), + raw_data=save_json_field(repo_data), ) repository.ingested_at = get_current_time() session.add(repository) - + session.commit() - record_discovery(repository, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - 
chain_id=chain_id, branch_id=branch_id, step=step) + record_discovery( + repository, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, + ) return repository -def store_doi(session, repository_id, doi_string, source="UNKNOWN", doi_metadata=None, - discovery_method="direct_ingestion", discovery_details=None, - trigger_input=None, keyword=None, chain_id=None, branch_id=None, step=1): + +def store_doi( + session, + repository_id, + doi_string, + source='UNKNOWN', + doi_metadata=None, + discovery_method='direct_ingestion', + discovery_details=None, + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Create or update a DOI record. """ from utils.common import clean_doi - + doi_string = clean_doi(doi_string) - + if discovery_details is None: repository = session.query(Repository).filter_by(id=repository_id).first() - repo_name = repository.full_name if repository else f"repository ID {repository_id}" - discovery_details = f"DOI '{doi_string}' discovered from {source} in {repo_name}" - - existing = session.query(DOI).filter_by(repository_id=repository_id, doi=doi_string).first() + repo_name = ( + repository.full_name if repository else f'repository ID {repository_id}' + ) + discovery_details = ( + f"DOI '{doi_string}' discovered from {source} in {repo_name}" + ) + + existing = ( + session.query(DOI) + .filter_by(repository_id=repository_id, doi=doi_string) + .first() + ) if not existing: new_doi = DOI( repository_id=repository_id, doi=doi_string, source=source, - doi_metadata=doi_metadata + doi_metadata=doi_metadata, ) new_doi.ingested_at = get_current_time() session.add(new_doi) session.commit() - logger.info(f"Stored new DOI '{doi_string}' for repo={repository_id} from {source}") - - record_discovery(new_doi, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - chain_id=chain_id, branch_id=branch_id, 
step=step) + logger.info( + f"Stored new DOI '{doi_string}' for repo={repository_id} from {source}" + ) + + record_discovery( + new_doi, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, + ) return new_doi else: - logger.info(f"DOI '{doi_string}' already exists for repo={repository_id}; skipping.") - return existing \ No newline at end of file + logger.info( + f"DOI '{doi_string}' already exists for repo={repository_id}; skipping." + ) + return existing diff --git a/Older Experiments/scrappy-proof-of-concept/services/github_ingestion.py b/Older Experiments/scrappy-proof-of-concept/services/github_ingestion.py index 11a39bf..8b04956 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/github_ingestion.py +++ b/Older Experiments/scrappy-proof-of-concept/services/github_ingestion.py @@ -1,58 +1,104 @@ # services/github_ingestion.py import base64 -import yaml import logging -from datetime import datetime, timezone import uuid -from db.database import get_db_session +import yaml from clients.github_client import GitHubClient -from utils.common import parse_datetime, save_json_field, extract_dois_from_text, get_current_time -from services.discovery import record_discovery, start_new_chain -from services.entity_service import update_or_create_repository, update_or_create_org, update_or_create_user, store_doi from models.models import ( - Repository, Branch, Tag, Commit, Label, Milestone, Release, Webhook, - Event, Workflow, WorkflowRun, Issue, IssueComment, PullRequest, PRReviewComment, - PullRequestReview, DiscoveryEvent, DOI + Branch, + Commit, + Event, + Issue, + IssueComment, + Label, + Milestone, + PRReviewComment, + PullRequest, + PullRequestReview, + Release, + Tag, + Webhook, + Workflow, + WorkflowRun, +) +from services.discovery import record_discovery, start_new_chain +from services.entity_service import ( + store_doi, + update_or_create_org, + 
update_or_create_repository, + update_or_create_user, +) +from utils.common import ( + extract_dois_from_text, + get_current_time, + parse_datetime, + save_json_field, ) logger = logging.getLogger(__name__) -def parse_citation_cff(session, client, owner, repo_name, repository, chain_id=None, branch_id=None, trigger_input=None, keyword=None): + +def parse_citation_cff( + session, + client, + owner, + repo_name, + repository, + chain_id=None, + branch_id=None, + trigger_input=None, + keyword=None, +): """ Parse CITATION.cff file from a repository and extract DOI information. """ cff_json = client.get_citation_cff(owner, repo_name) - if not cff_json or "content" not in cff_json: - logger.info(f"No CITATION.cff found or content is missing for {owner}/{repo_name}.") + if not cff_json or 'content' not in cff_json: + logger.info( + f'No CITATION.cff found or content is missing for {owner}/{repo_name}.' + ) return None - + try: - cff_decoded = base64.b64decode(cff_json["content"]).decode("utf-8", errors="ignore") + cff_decoded = base64.b64decode(cff_json['content']).decode( + 'utf-8', errors='ignore' + ) cff_data = yaml.safe_load(cff_decoded) - if "doi" in cff_data: - doi_str = cff_data["doi"] + if 'doi' in cff_data: + doi_str = cff_data['doi'] discovery_details = f"DOI discovered from CITATION.cff in repository '{repository.full_name}'" doi_obj = store_doi( - session, repository.id, doi_str, source="CITATION.cff", - discovery_method="citation_doi_ingestion", + session, + repository.id, + doi_str, + source='CITATION.cff', + discovery_method='citation_doi_ingestion', discovery_details=discovery_details, trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=2 + step=2, ) return doi_str except Exception as e: - logger.warning(f"Error parsing CITATION.cff for {owner}/{repo_name}: {e}") - + logger.warning(f'Error parsing CITATION.cff for {owner}/{repo_name}: {e}') + return None -def ingest_github_repository(session, owner: str, repo_name: 
str, token: str = None, - discovery_method: str = "direct_ingestion", - discovery_details: str = None, trigger_input: str = None, - chain_id: str = None, keyword: str = None): + +def ingest_github_repository( + session, + owner: str, + repo_name: str, + token: str = None, + discovery_method: str = 'direct_ingestion', + discovery_details: str = None, + trigger_input: str = None, + chain_id: str = None, + keyword: str = None, +): """ Ingest a repository from GitHub and record its discovery events. This function performs all GitHub-specific ingestion (repository data, branches, @@ -62,577 +108,620 @@ def ingest_github_repository(session, owner: str, repo_name: str, token: str = N client = GitHubClient(token=token, default_timeout=30) repo_data = client.get_repository(owner, repo_name) if not repo_data: - raise ValueError(f"Failed to fetch repository data for {owner}/{repo_name}.") - + raise ValueError(f'Failed to fetch repository data for {owner}/{repo_name}.') + # Start a new discovery chain for this ingestion session if not provided. if chain_id is None: chain_id = start_new_chain() - + # Generate base branch ID for this repository ingestion base_branch_id = str(uuid.uuid4()) - + # Record the repository (generation event) as step 1. if discovery_details is None: - discovery_details = f"Repository URL: https://github.com/{owner}/{repo_name}" - + discovery_details = f'Repository URL: https://github.com/{owner}/{repo_name}' + repository = update_or_create_repository( - session, client, repo_data, + session, + client, + repo_data, discovery_method=discovery_method, discovery_details=discovery_details, trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=1 + step=1, ) - + # Record the repository owner as step 2. 
- if repo_data["owner"]["type"] == "Organization": + if repo_data['owner']['type'] == 'Organization': org = update_or_create_org( - session, - client, - repo_data["owner"], - discovery_method="repository_owner_ingestion", + session, + client, + repo_data['owner'], + discovery_method='repository_owner_ingestion', discovery_details=f"Organization discovered as owner of repository '{repository.full_name}'", trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) else: user = update_or_create_user( - session, - client, - repo_data["owner"], - discovery_method="repository_owner_ingestion", + session, + client, + repo_data['owner'], + discovery_method='repository_owner_ingestion', discovery_details=f"User discovered as owner of repository '{repository.full_name}'", trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - + # Record branches (step 2) branches = client.get_branches(owner, repo_name) for branch_data in branches: - exists = session.query(Branch).filter_by(name=branch_data["name"], repository_id=repository.id).first() + exists = ( + session.query(Branch) + .filter_by(name=branch_data['name'], repository_id=repository.id) + .first() + ) if not exists: new_branch = Branch( - name=branch_data["name"], - commit_sha=branch_data["commit"]["sha"], - repository_id=repository.id + name=branch_data['name'], + commit_sha=branch_data['commit']['sha'], + repository_id=repository.id, ) new_branch.ingested_at = get_current_time() session.add(new_branch) record_discovery( - new_branch, - "branch_ingestion", - f"Branch from repo {repository.full_name}", + new_branch, + 'branch_ingestion', + f'Branch from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record tags (step 2) tags = client.get_tags(owner, 
repo_name) for tag_data in tags: - exists = session.query(Tag).filter_by(name=tag_data["name"], repository_id=repository.id).first() + exists = ( + session.query(Tag) + .filter_by(name=tag_data['name'], repository_id=repository.id) + .first() + ) if not exists: new_tag = Tag( - name=tag_data["name"], - commit_sha=tag_data["commit"]["sha"], - repository_id=repository.id + name=tag_data['name'], + commit_sha=tag_data['commit']['sha'], + repository_id=repository.id, ) new_tag.ingested_at = get_current_time() session.add(new_tag) record_discovery( - new_tag, - "tag_ingestion", - f"Tag from repo {repository.full_name}", + new_tag, + 'tag_ingestion', + f'Tag from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record commits (step 2) commits = client.get_commits(owner, repo_name) for commit_data in commits[:100]: - sha = commit_data["sha"] + sha = commit_data['sha'] exists = session.query(Commit).filter_by(sha=sha).first() if not exists: - commit_info = commit_data.get("commit", {}) - author_info = commit_info.get("author", {}) - committer_info = commit_info.get("committer", {}) + commit_info = commit_data.get('commit', {}) + author_info = commit_info.get('author', {}) + committer_info = commit_info.get('committer', {}) commit_obj = Commit( sha=sha, - message=commit_info.get("message"), - author_name=author_info.get("name"), - author_email=author_info.get("email"), - committer_name=committer_info.get("name"), - committer_email=committer_info.get("email"), - date=parse_datetime(author_info.get("date")), + message=commit_info.get('message'), + author_name=author_info.get('name'), + author_email=author_info.get('email'), + committer_name=committer_info.get('name'), + committer_email=committer_info.get('email'), + date=parse_datetime(author_info.get('date')), repository_id=repository.id, - 
raw_data=save_json_field(commit_data) + raw_data=save_json_field(commit_data), ) commit_obj.ingested_at = get_current_time() session.add(commit_obj) record_discovery( - commit_obj, - "commit_ingestion", - f"Commit from repo {repository.full_name}", + commit_obj, + 'commit_ingestion', + f'Commit from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record labels (step 2) labels = client.get_labels(owner, repo_name) for label_data in labels: - if not session.query(Label).filter_by(id=label_data["id"]).first(): + if not session.query(Label).filter_by(id=label_data['id']).first(): label = Label( - id=label_data["id"], - name=label_data["name"], - color=label_data.get("color"), - description=label_data.get("description"), + id=label_data['id'], + name=label_data['name'], + color=label_data.get('color'), + description=label_data.get('description'), repository_id=repository.id, - raw_data=save_json_field(label_data) + raw_data=save_json_field(label_data), ) label.ingested_at = get_current_time() session.add(label) record_discovery( - label, - "label_ingestion", - f"Label from repo {repository.full_name}", + label, + 'label_ingestion', + f'Label from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record milestones (step 2) milestones = client.get_milestones(owner, repo_name) for ms_data in milestones: - if not session.query(Milestone).filter_by(id=ms_data["id"]).first(): + if not session.query(Milestone).filter_by(id=ms_data['id']).first(): milestone = Milestone( - id=ms_data["id"], - title=ms_data["title"], - description=ms_data.get("description"), - state=ms_data.get("state"), - due_on=parse_datetime(ms_data.get("due_on")), + id=ms_data['id'], + 
title=ms_data['title'], + description=ms_data.get('description'), + state=ms_data.get('state'), + due_on=parse_datetime(ms_data.get('due_on')), repository_id=repository.id, - raw_data=save_json_field(ms_data) + raw_data=save_json_field(ms_data), ) milestone.ingested_at = get_current_time() session.add(milestone) record_discovery( - milestone, - "milestone_ingestion", - f"Milestone from repo {repository.full_name}", + milestone, + 'milestone_ingestion', + f'Milestone from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record releases (step 2) releases = client.get_releases(owner, repo_name) for rel_data in releases: - if not session.query(Release).filter_by(id=rel_data["id"]).first(): + if not session.query(Release).filter_by(id=rel_data['id']).first(): release = Release( - id=rel_data["id"], - tag_name=rel_data.get("tag_name"), - name=rel_data.get("name"), - body=rel_data.get("body"), - draft=rel_data.get("draft", False), - prerelease=rel_data.get("prerelease", False), - created_at=parse_datetime(rel_data.get("created_at")), - published_at=parse_datetime(rel_data.get("published_at")), + id=rel_data['id'], + tag_name=rel_data.get('tag_name'), + name=rel_data.get('name'), + body=rel_data.get('body'), + draft=rel_data.get('draft', False), + prerelease=rel_data.get('prerelease', False), + created_at=parse_datetime(rel_data.get('created_at')), + published_at=parse_datetime(rel_data.get('published_at')), repository_id=repository.id, - raw_data=save_json_field(rel_data) + raw_data=save_json_field(rel_data), ) release.ingested_at = get_current_time() session.add(release) record_discovery( - release, - "release_ingestion", - f"Release from repo {repository.full_name}", + release, + 'release_ingestion', + f'Release from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + 
trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record webhooks (step 2) webhooks = client.get_webhooks(owner, repo_name) for hook_data in webhooks: - if not session.query(Webhook).filter_by(id=hook_data["id"]).first(): + if not session.query(Webhook).filter_by(id=hook_data['id']).first(): webhook = Webhook( - id=hook_data["id"], - name=hook_data.get("name"), - config=save_json_field(hook_data.get("config")), - events=",".join(hook_data.get("events", [])), - active=hook_data.get("active", False), + id=hook_data['id'], + name=hook_data.get('name'), + config=save_json_field(hook_data.get('config')), + events=','.join(hook_data.get('events', [])), + active=hook_data.get('active', False), repository_id=repository.id, - raw_data=save_json_field(hook_data) + raw_data=save_json_field(hook_data), ) webhook.ingested_at = get_current_time() session.add(webhook) record_discovery( - webhook, - "webhook_ingestion", - f"Webhook from repo {repository.full_name}", + webhook, + 'webhook_ingestion', + f'Webhook from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record events (step 2) events = client.get_events(owner, repo_name) for event_data in events: event_obj = Event( - type=event_data.get("type"), - created_at=parse_datetime(event_data.get("created_at")), + type=event_data.get('type'), + created_at=parse_datetime(event_data.get('created_at')), repository_id=repository.id, - raw_data=save_json_field(event_data) + raw_data=save_json_field(event_data), ) event_obj.ingested_at = get_current_time() session.add(event_obj) record_discovery( - event_obj, - "event_ingestion", - f"Event from repo {repository.full_name}", + event_obj, + 'event_ingestion', + f'Event from repo {repository.full_name}', chain_id=chain_id, - 
trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record collaborators (step 2) collaborators = client.get_collaborators(owner, repo_name) for collab in collaborators: collab_user = update_or_create_user( - session, - client, + session, + client, collab, - discovery_method="collaborator_ingestion", + discovery_method='collaborator_ingestion', discovery_details=f"User discovered as collaborator on repository '{repository.full_name}'", trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - + # Record workflows (step 2) workflows = client.get_workflows(owner, repo_name) for wf in workflows: - if not session.query(Workflow).filter_by(id=wf["id"]).first(): + if not session.query(Workflow).filter_by(id=wf['id']).first(): workflow = Workflow( - id=wf["id"], - name=wf.get("name"), - state=wf.get("state"), + id=wf['id'], + name=wf.get('name'), + state=wf.get('state'), repository_id=repository.id, - raw_data=save_json_field(wf) + raw_data=save_json_field(wf), ) workflow.ingested_at = get_current_time() session.add(workflow) record_discovery( - workflow, - "workflow_ingestion", - f"Workflow from repo {repository.full_name}", + workflow, + 'workflow_ingestion', + f'Workflow from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Record workflow runs (step 2) workflow_runs = client.get_workflow_runs(owner, repo_name) for run in workflow_runs: - if not session.query(WorkflowRun).filter_by(id=run["id"]).first(): + if not session.query(WorkflowRun).filter_by(id=run['id']).first(): wrun = WorkflowRun( - id=run["id"], - name=run.get("name"), - status=run.get("status"), - conclusion=run.get("conclusion"), - created_at=parse_datetime(run.get("created_at")), - 
updated_at=parse_datetime(run.get("updated_at")), + id=run['id'], + name=run.get('name'), + status=run.get('status'), + conclusion=run.get('conclusion'), + created_at=parse_datetime(run.get('created_at')), + updated_at=parse_datetime(run.get('updated_at')), repository_id=repository.id, - raw_data=save_json_field(run) + raw_data=save_json_field(run), ) wrun.ingested_at = get_current_time() session.add(wrun) record_discovery( - wrun, - "workflow_run_ingestion", - f"Workflow run from repo {repository.full_name}", + wrun, + 'workflow_run_ingestion', + f'Workflow run from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + # Process issues and their comments (step 2) - issues_url = f"{client.BASE_URL}/repos/{owner}/{repo_name}/issues" - issues = client.get_all_pages(issues_url, params={"state": "all"}) + issues_url = f'{client.BASE_URL}/repos/{owner}/{repo_name}/issues' + issues = client.get_all_pages(issues_url, params={'state': 'all'}) for issue_data in issues: - if "pull_request" in issue_data: + if 'pull_request' in issue_data: continue user = update_or_create_user( - session, client, issue_data["user"], - discovery_method="issue_ingestion", + session, + client, + issue_data['user'], + discovery_method='issue_ingestion', discovery_details=f"Issue discovered from issue {issue_data['number']} on repository '{repository.full_name}'", trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - - if not session.query(Issue).filter_by(id=issue_data["id"]).first(): + + if not session.query(Issue).filter_by(id=issue_data['id']).first(): issue = Issue( - id=issue_data["id"], - number=issue_data["number"], - title=issue_data["title"], - body=issue_data.get("body"), - state=issue_data["state"], - created_at=parse_datetime(issue_data["created_at"]), - 
updated_at=parse_datetime(issue_data["updated_at"]), - closed_at=parse_datetime(issue_data.get("closed_at")), + id=issue_data['id'], + number=issue_data['number'], + title=issue_data['title'], + body=issue_data.get('body'), + state=issue_data['state'], + created_at=parse_datetime(issue_data['created_at']), + updated_at=parse_datetime(issue_data['updated_at']), + closed_at=parse_datetime(issue_data.get('closed_at')), user_id=user.id if user else None, repository_id=repository.id, - raw_data=save_json_field(issue_data) + raw_data=save_json_field(issue_data), ) issue.ingested_at = get_current_time() session.add(issue) record_discovery( - issue, - "issue_ingestion", - f"Issue from repo {repository.full_name}", + issue, + 'issue_ingestion', + f'Issue from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + session.commit() - comments_url = f"{client.BASE_URL}/repos/{owner}/{repo_name}/issues/{issue_data['number']}/comments" + comments_url = f'{client.BASE_URL}/repos/{owner}/{repo_name}/issues/{issue_data["number"]}/comments' comments = client.get_all_pages(comments_url) for comment_data in comments: comment_user = update_or_create_user( - session, client, comment_data.get("user"), - discovery_method="issue_comment_ingestion", - discovery_details=f"Issue comment on issue {issue.number} from repo {repository.full_name}", + session, + client, + comment_data.get('user'), + discovery_method='issue_comment_ingestion', + discovery_details=f'Issue comment on issue {issue.number} from repo {repository.full_name}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - + if comment_user is None: continue - if not session.query(IssueComment).filter_by(id=comment_data["id"]).first(): + if ( + not session.query(IssueComment) + .filter_by(id=comment_data['id']) + .first() 
+ ): comment = IssueComment( - id=comment_data["id"], - body=comment_data["body"], - created_at=parse_datetime(comment_data["created_at"]), - updated_at=parse_datetime(comment_data["updated_at"]), + id=comment_data['id'], + body=comment_data['body'], + created_at=parse_datetime(comment_data['created_at']), + updated_at=parse_datetime(comment_data['updated_at']), user_id=comment_user.id, issue_id=issue.id, - raw_data=save_json_field(comment_data) + raw_data=save_json_field(comment_data), ) comment.ingested_at = get_current_time() session.add(comment) record_discovery( - comment, - "issue_comment_ingestion", - f"Issue comment on issue {issue.number}", + comment, + 'issue_comment_ingestion', + f'Issue comment on issue {issue.number}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) session.commit() - + # Process pull requests and their comments/reviews (step 2) - prs_url = f"{client.BASE_URL}/repos/{owner}/{repo_name}/pulls" - pull_requests = client.get_all_pages(prs_url, params={"state": "all"}) + prs_url = f'{client.BASE_URL}/repos/{owner}/{repo_name}/pulls' + pull_requests = client.get_all_pages(prs_url, params={'state': 'all'}) for pr_data in pull_requests: user = update_or_create_user( - session, client, pr_data["user"], - discovery_method="pr_ingestion", - discovery_details=f"PR from repo {repository.full_name}", + session, + client, + pr_data['user'], + discovery_method='pr_ingestion', + discovery_details=f'PR from repo {repository.full_name}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - - if not session.query(PullRequest).filter_by(id=pr_data["id"]).first(): + + if not session.query(PullRequest).filter_by(id=pr_data['id']).first(): pr = PullRequest( - id=pr_data["id"], - number=pr_data["number"], - title=pr_data["title"], - body=pr_data.get("body"), - 
state=pr_data["state"], - created_at=parse_datetime(pr_data["created_at"]), - updated_at=parse_datetime(pr_data["updated_at"]), - merged_at=parse_datetime(pr_data.get("merged_at")), + id=pr_data['id'], + number=pr_data['number'], + title=pr_data['title'], + body=pr_data.get('body'), + state=pr_data['state'], + created_at=parse_datetime(pr_data['created_at']), + updated_at=parse_datetime(pr_data['updated_at']), + merged_at=parse_datetime(pr_data.get('merged_at')), user_id=user.id if user else None, repository_id=repository.id, - raw_data=save_json_field(pr_data) + raw_data=save_json_field(pr_data), ) pr.ingested_at = get_current_time() session.add(pr) record_discovery( - pr, - "pr_ingestion", - f"PR from repo {repository.full_name}", + pr, + 'pr_ingestion', + f'PR from repo {repository.full_name}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - + session.commit() - pr_comments_url = f"{client.BASE_URL}/repos/{owner}/{repo_name}/pulls/{pr_data['number']}/comments" + pr_comments_url = f'{client.BASE_URL}/repos/{owner}/{repo_name}/pulls/{pr_data["number"]}/comments' pr_comments = client.get_all_pages(pr_comments_url) for pr_comment_data in pr_comments: comment_user = update_or_create_user( - session, client, pr_comment_data.get("user"), - discovery_method="pr_comment_ingestion", - discovery_details=f"User discovered from PR comment on PR {pr_data['number']} in repo {repository.full_name}", + session, + client, + pr_comment_data.get('user'), + discovery_method='pr_comment_ingestion', + discovery_details=f'User discovered from PR comment on PR {pr_data["number"]} in repo {repository.full_name}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - + if comment_user is None: continue - if not session.query(PRReviewComment).filter_by(id=pr_comment_data["id"]).first(): + if ( + not 
session.query(PRReviewComment) + .filter_by(id=pr_comment_data['id']) + .first() + ): pr_comment = PRReviewComment( - id=pr_comment_data["id"], - body=pr_comment_data["body"], - created_at=parse_datetime(pr_comment_data["created_at"]), - updated_at=parse_datetime(pr_comment_data["updated_at"]), + id=pr_comment_data['id'], + body=pr_comment_data['body'], + created_at=parse_datetime(pr_comment_data['created_at']), + updated_at=parse_datetime(pr_comment_data['updated_at']), user_id=comment_user.id, pr_id=pr.id, - raw_data=save_json_field(pr_comment_data) + raw_data=save_json_field(pr_comment_data), ) pr_comment.ingested_at = get_current_time() session.add(pr_comment) record_discovery( - pr_comment, - "pr_comment_ingestion", - f"PR comment on PR {pr.number}", + pr_comment, + 'pr_comment_ingestion', + f'PR comment on PR {pr.number}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) - pr_reviews_url = f"{client.BASE_URL}/repos/{owner}/{repo_name}/pulls/{pr_data['number']}/reviews" + pr_reviews_url = f'{client.BASE_URL}/repos/{owner}/{repo_name}/pulls/{pr_data["number"]}/reviews' pr_reviews = client.get(pr_reviews_url) if pr_reviews and isinstance(pr_reviews, list): for review_data in pr_reviews: - if not session.query(PullRequestReview).filter_by(id=review_data["id"]).first(): + if ( + not session.query(PullRequestReview) + .filter_by(id=review_data['id']) + .first() + ): review_user = update_or_create_user( - session, client, review_data.get("user"), - discovery_method="pr_review_ingestion", - discovery_details=f"User discovered from PR review on PR {pr_data['number']} in repo {repository.full_name}", + session, + client, + review_data.get('user'), + discovery_method='pr_review_ingestion', + discovery_details=f'User discovered from PR review on PR {pr_data["number"]} in repo {repository.full_name}', trigger_input=trigger_input, keyword=keyword, 
chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - + if review_user is None: continue new_review = PullRequestReview( - id=review_data["id"], + id=review_data['id'], user_id=review_user.id, pr_id=pr.id, - state=review_data["state"], - submitted_at=parse_datetime(review_data.get("submitted_at")), - body=review_data.get("body"), - raw_data=save_json_field(review_data) + state=review_data['state'], + submitted_at=parse_datetime( + review_data.get('submitted_at') + ), + body=review_data.get('body'), + raw_data=save_json_field(review_data), ) new_review.ingested_at = get_current_time() session.add(new_review) record_discovery( - new_review, - "pr_review_ingestion", - f"PR review on PR {pr.number}", + new_review, + 'pr_review_ingestion', + f'PR review on PR {pr.number}', chain_id=chain_id, - trigger_input=trigger_input, + trigger_input=trigger_input, keyword=keyword, - branch_id=base_branch_id, - step=2 + branch_id=base_branch_id, + step=2, ) session.commit() - + # Process Readme and CITATION.cff readme = client.get_readme(owner, repo_name) readme_dois = [] - if readme and "content" in readme: - decoded_readme = base64.b64decode(readme["content"]).decode("utf-8", errors="ignore") + if readme and 'content' in readme: + decoded_readme = base64.b64decode(readme['content']).decode( + 'utf-8', errors='ignore' + ) readme_dois = extract_dois_from_text(decoded_readme) for doi_str in readme_dois: doi_obj = store_doi( - session, repository.id, doi_str, source="README", - discovery_method="readme_doi_ingestion", + session, + repository.id, + doi_str, + source='README', + discovery_method='readme_doi_ingestion', discovery_details=f"DOI discovered from README in repository '{repository.full_name}'", trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=base_branch_id, - step=2 + step=2, ) - repository.raw_data = "\nReadme: " + save_json_field(readme) + repository.raw_data = '\nReadme: ' + save_json_field(readme) repository.ingested_at = 
get_current_time() - - logger.info("Attempting to fetch CITATION.cff...") + + logger.info('Attempting to fetch CITATION.cff...') citation_doi = parse_citation_cff( - session, client, owner, repo_name, repository, + session, + client, + owner, + repo_name, + repository, chain_id=chain_id, branch_id=base_branch_id, trigger_input=trigger_input, - keyword=keyword + keyword=keyword, ) - + new_dois = set(readme_dois) if citation_doi: new_dois.add(citation_doi) if not new_dois: if repository.dois: repository.dois.clear() - - logger.info("Fetching discussions...") + + logger.info('Fetching discussions...') client.get_discussions(owner, repo_name) - logger.info("GitHub repository ingestion complete.") - - return repository, base_branch_id \ No newline at end of file + logger.info('GitHub repository ingestion complete.') + + return repository, base_branch_id diff --git a/Older Experiments/scrappy-proof-of-concept/services/ingestion_service.py b/Older Experiments/scrappy-proof-of-concept/services/ingestion_service.py index 54fd75b..00ae728 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/ingestion_service.py +++ b/Older Experiments/scrappy-proof-of-concept/services/ingestion_service.py @@ -1,35 +1,39 @@ # services/ingestion_service.py -import time import logging +import time + +from db.database import get_db_session +from models.models import DiscoveryEvent, Repository +from services.discovery import start_new_chain from services.github_ingestion import ingest_github_repository from services.openalex_ingestion import ingest_openalex_data from utils.repo_finder import search_repositories_by_date_ranges -from db.database import get_db_session -from models.models import Repository, DiscoveryEvent -from services.discovery import start_new_chain logger = logging.getLogger(__name__) + def get_ingestion_counts(): - from models.models import Repository, OpenAlexWork, User, Organization, DOI + from models.models import DOI, OpenAlexWork, Organization, Repository, User 
+ with get_db_session() as session: counts = { - "repositories": session.query(Repository).count(), - "works": session.query(OpenAlexWork).count(), - "people": session.query(User).count(), - "organizations": session.query(Organization).count(), - "dois": session.query(DOI).count() + 'repositories': session.query(Repository).count(), + 'works': session.query(OpenAlexWork).count(), + 'people': session.query(User).count(), + 'organizations': session.query(Organization).count(), + 'dois': session.query(DOI).count(), } return counts + def print_ingestion_summary(pre_counts=None, post_counts=None): total_counts = post_counts if post_counts is not None else get_ingestion_counts() - summary = "\nIngestion Summary:\n" - summary += f"Total repositories in database: {total_counts['repositories']}\n" - summary += f"Total works in database: {total_counts['works']}\n" - summary += f"Total people in database: {total_counts['people']}\n" - summary += f"Total organizations in database: {total_counts['organizations']}\n" - summary += f"Total DOIs in database: {total_counts['dois']}\n" + summary = '\nIngestion Summary:\n' + summary += f'Total repositories in database: {total_counts["repositories"]}\n' + summary += f'Total works in database: {total_counts["works"]}\n' + summary += f'Total people in database: {total_counts["people"]}\n' + summary += f'Total organizations in database: {total_counts["organizations"]}\n' + summary += f'Total DOIs in database: {total_counts["dois"]}\n' if pre_counts is not None: run_repos = total_counts['repositories'] - pre_counts['repositories'] @@ -37,36 +41,44 @@ def print_ingestion_summary(pre_counts=None, post_counts=None): run_people = total_counts['people'] - pre_counts['people'] run_orgs = total_counts['organizations'] - pre_counts['organizations'] run_dois = total_counts['dois'] - pre_counts['dois'] - summary += "\nAdded during most recent run:\n" - summary += f"Repositories added: {run_repos}\n" - summary += f"Works added: {run_works}\n" - 
summary += f"People added: {run_people}\n" - summary += f"Organizations added: {run_orgs}\n" - summary += f"DOIs added: {run_dois}\n" + summary += '\nAdded during most recent run:\n' + summary += f'Repositories added: {run_repos}\n' + summary += f'Works added: {run_works}\n' + summary += f'People added: {run_people}\n' + summary += f'Organizations added: {run_orgs}\n' + summary += f'DOIs added: {run_dois}\n' return summary + def check_repository_exists(owner, repo_name): """ Check if a repository with the given owner and name exists in the database. Returns the Repository object if found, None otherwise. """ with get_db_session() as session: - full_name = f"{owner}/{repo_name}" + full_name = f'{owner}/{repo_name}' repo = session.query(Repository).filter_by(full_name=full_name).first() return repo + def get_discovery_events(repo_id): """ Get discovery events for a repository. Returns a list of DiscoveryEvent objects. """ with get_db_session() as session: - events = session.query(DiscoveryEvent).filter( - DiscoveryEvent.object_type == 'Repository', - DiscoveryEvent.object_id == str(repo_id) - ).order_by(DiscoveryEvent.timestamp).all() + events = ( + session.query(DiscoveryEvent) + .filter( + DiscoveryEvent.object_type == 'Repository', + DiscoveryEvent.object_id == str(repo_id), + ) + .order_by(DiscoveryEvent.timestamp) + .all() + ) return events + def get_repository_doi_counts(repo_id): """ Get counts of DOIs for a repository. 
@@ -77,9 +89,15 @@ def get_repository_doi_counts(repo_id): return 0 return len(repo.dois) -def ingest_repository(owner: str, repo_name: str, token: str = None, - discovery_method: str = "direct_ingestion", - discovery_details: str = None, trigger_input: str = None): + +def ingest_repository( + owner: str, + repo_name: str, + token: str = None, + discovery_method: str = 'direct_ingestion', + discovery_details: str = None, + trigger_input: str = None, +): """ Ingest a repository by delegating GitHub ingestion to the dedicated module and then processing OpenAlex data. @@ -87,7 +105,7 @@ def ingest_repository(owner: str, repo_name: str, token: str = None, """ # Generate a chain ID for this ingestion session chain_id = start_new_chain() - + with get_db_session() as session: repository, base_branch_id = ingest_github_repository( session=session, @@ -97,7 +115,7 @@ def ingest_repository(owner: str, repo_name: str, token: str = None, discovery_method=discovery_method, discovery_details=discovery_details, trigger_input=trigger_input, - chain_id=chain_id + chain_id=chain_id, ) ingest_openalex_data( @@ -108,25 +126,28 @@ def ingest_repository(owner: str, repo_name: str, token: str = None, trigger_input=trigger_input, chain_id=chain_id, branch_id=base_branch_id, - keyword=None if discovery_method != "keyword_ingestion" else trigger_input + keyword=None if discovery_method != 'keyword_ingestion' else trigger_input, ) - - logging.info(f"Repository {repository.full_name} ingested successfully.") + + logging.info(f'Repository {repository.full_name} ingested successfully.') return repository -def search_and_ingest_repositories(token: str, keywords: str, trigger_input: str = None): + +def search_and_ingest_repositories( + token: str, keywords: str, trigger_input: str = None +): from clients.github_client import GitHubClient - + client = GitHubClient(token=token, default_timeout=30) repositories_data = search_repositories_by_date_ranges(client, keywords) ingested = [] - + # Create a 
single chain ID for this search session chain_id = start_new_chain() for repo_data in repositories_data: - owner = repo_data["owner"]["login"] - repo_name = repo_data["name"] + owner = repo_data['owner']['login'] + repo_name = repo_data['name'] detailed_discovery = f"Repository discovered via keyword search '{keywords}'" try: @@ -136,28 +157,28 @@ def search_and_ingest_repositories(token: str, keywords: str, trigger_input: str owner=owner, repo_name=repo_name, token=token, - discovery_method="keyword_ingestion", + discovery_method='keyword_ingestion', discovery_details=detailed_discovery, trigger_input=trigger_input, chain_id=chain_id, - keyword=keywords + keyword=keywords, ) ingest_openalex_data( session=session, repository=repository, - discovery_method="keyword_ingestion", + discovery_method='keyword_ingestion', discovery_details=detailed_discovery, trigger_input=trigger_input, chain_id=chain_id, branch_id=base_branch_id, - keyword=keywords + keyword=keywords, ) - + ingested.append(repository) except Exception as e: - logging.error(f"Error ingesting {owner}/{repo_name}: {e}") + logging.error(f'Error ingesting {owner}/{repo_name}: {e}') time.sleep(1) - return ingested \ No newline at end of file + return ingested diff --git a/Older Experiments/scrappy-proof-of-concept/services/institution_analysis.py b/Older Experiments/scrappy-proof-of-concept/services/institution_analysis.py index f7b574e..47a17ca 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/institution_analysis.py +++ b/Older Experiments/scrappy-proof-of-concept/services/institution_analysis.py @@ -8,32 +8,32 @@ import logging import uuid from datetime import datetime -from typing import Dict, List, Any, Optional, Tuple, Union +from typing import Dict, List -from sqlalchemy.orm import joinedload from db.database import get_db_session from models.models import ( - Repository, User, OpenAlexAuthor, AnalysisSession, SurfacingResult, - SurfacedRepository, SurfacedPerson, ACFResult, 
ACFRepositoryResult, - ACFPersonResult + ACFResult, + AnalysisSession, + SurfacingResult, ) logger = logging.getLogger(__name__) + class InstitutionAnalysisManager: """ Manages the workflow for institutional analysis. - + This class handles the state transitions between the three phases: 1. Surfacing phase - discovering repositories/people potentially associated with the institution 2. ACF phase - applying confidence filters to rank the discoveries 3. Analysis phase - analyzing the high-confidence matches """ - - def __init__(self, institution_name: str, analysis_type: str = "repository"): + + def __init__(self, institution_name: str, analysis_type: str = 'repository'): """ Initialize a new institution analysis session. - + Args: institution_name: Name of the institution analysis_type: Either "repository" or "people" @@ -41,158 +41,175 @@ def __init__(self, institution_name: str, analysis_type: str = "repository"): self.institution_name = institution_name self.analysis_type = analysis_type self.session_id = str(uuid.uuid4()) - self.current_phase = "initiated" + self.current_phase = 'initiated' self.institution_info = { - "name": institution_name, - "domains": [], - "github_orgs": [] + 'name': institution_name, + 'domains': [], + 'github_orgs': [], } self.surfacing_id = None self.acf_id = None - + # Create a database record for this session with get_db_session() as session: new_session = AnalysisSession( session_id=self.session_id, institution_name=institution_name, analysis_type=analysis_type, - status="initiated", - parameters=json.dumps(self.institution_info) + status='initiated', + parameters=json.dumps(self.institution_info), ) session.add(new_session) session.commit() self.db_session_id = new_session.id - - def set_institution_info(self, domains: List[str] = None, github_orgs: List[str] = None): + + def set_institution_info( + self, domains: List[str] = None, github_orgs: List[str] = None + ): """ Set additional institution information. 
- + Args: domains: List of email domains associated with the institution github_orgs: List of GitHub organizations associated with the institution """ if domains: - self.institution_info["domains"] = domains + self.institution_info['domains'] = domains if github_orgs: - self.institution_info["github_orgs"] = github_orgs - + self.institution_info['github_orgs'] = github_orgs + # Update the session record with get_db_session() as session: - db_session = session.query(AnalysisSession).filter_by(id=self.db_session_id).first() + db_session = ( + session.query(AnalysisSession).filter_by(id=self.db_session_id).first() + ) if db_session: db_session.parameters = json.dumps(self.institution_info) db_session.last_updated = datetime.now() - + def get_past_sessions(self) -> List[Dict]: """ Get past analysis sessions for this institution. - + Returns: List of session records with summary information """ with get_db_session() as session: - past_sessions = session.query(AnalysisSession).filter( - AnalysisSession.institution_name == self.institution_name, - AnalysisSession.analysis_type == self.analysis_type - ).order_by( - AnalysisSession.last_updated.desc() - ).all() - + past_sessions = ( + session.query(AnalysisSession) + .filter( + AnalysisSession.institution_name == self.institution_name, + AnalysisSession.analysis_type == self.analysis_type, + ) + .order_by(AnalysisSession.last_updated.desc()) + .all() + ) + results = [] for session_record in past_sessions: - surfacing_count = session.query(SurfacingResult).filter_by( - session_id=session_record.id - ).count() - - acf_count = session.query(ACFResult).filter_by( - session_id=session_record.id - ).count() - - results.append({ - "id": session_record.id, - "session_id": session_record.session_id, - "created_at": session_record.created_at, - "last_updated": session_record.last_updated, - "status": session_record.status, - "surfacing_count": surfacing_count, - "acf_count": acf_count - }) - + surfacing_count = ( + 
session.query(SurfacingResult) + .filter_by(session_id=session_record.id) + .count() + ) + + acf_count = ( + session.query(ACFResult) + .filter_by(session_id=session_record.id) + .count() + ) + + results.append( + { + 'id': session_record.id, + 'session_id': session_record.session_id, + 'created_at': session_record.created_at, + 'last_updated': session_record.last_updated, + 'status': session_record.status, + 'surfacing_count': surfacing_count, + 'acf_count': acf_count, + } + ) + return results - + def load_session(self, session_id: str) -> bool: """ Load an existing session. - + Args: session_id: UUID of the session to load - + Returns: True if session was loaded successfully, False otherwise """ with get_db_session() as session: - existing = session.query(AnalysisSession).filter_by( - session_id=session_id - ).first() - + existing = ( + session.query(AnalysisSession).filter_by(session_id=session_id).first() + ) + if not existing: - logger.error(f"Session {session_id} not found") + logger.error(f'Session {session_id} not found') return False - + self.session_id = existing.session_id self.db_session_id = existing.id self.institution_name = existing.institution_name self.analysis_type = existing.analysis_type self.current_phase = existing.status - + try: self.institution_info = json.loads(existing.parameters) except (json.JSONDecodeError, TypeError): self.institution_info = { - "name": existing.institution_name, - "domains": [], - "github_orgs": [] + 'name': existing.institution_name, + 'domains': [], + 'github_orgs': [], } - + # Find the most recent surfacing and ACF IDs - latest_surfacing = session.query(SurfacingResult).filter_by( - session_id=existing.id - ).order_by( - SurfacingResult.run_at.desc() - ).first() - + latest_surfacing = ( + session.query(SurfacingResult) + .filter_by(session_id=existing.id) + .order_by(SurfacingResult.run_at.desc()) + .first() + ) + if latest_surfacing: self.surfacing_id = latest_surfacing.id - - latest_acf = 
session.query(ACFResult).filter_by( - session_id=existing.id - ).order_by( - ACFResult.run_at.desc() - ).first() - + + latest_acf = ( + session.query(ACFResult) + .filter_by(session_id=existing.id) + .order_by(ACFResult.run_at.desc()) + .first() + ) + if latest_acf: self.acf_id = latest_acf.id - + return True - + def set_phase(self, phase: str): """ Update the current phase of analysis. - + Args: phase: One of "initiated", "surfacing", "acf", "analysis", "completed" """ - valid_phases = ["initiated", "surfacing", "acf", "analysis", "completed"] + valid_phases = ['initiated', 'surfacing', 'acf', 'analysis', 'completed'] if phase not in valid_phases: - logger.error(f"Invalid phase: {phase}") + logger.error(f'Invalid phase: {phase}') return - + self.current_phase = phase - + with get_db_session() as session: - db_session = session.query(AnalysisSession).filter_by(id=self.db_session_id).first() + db_session = ( + session.query(AnalysisSession).filter_by(id=self.db_session_id).first() + ) if db_session: db_session.status = phase db_session.last_updated = datetime.now() - - # More methods will be implemented in the next steps \ No newline at end of file + + # More methods will be implemented in the next steps diff --git a/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/person_acf.py b/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/person_acf.py index c9c6326..e984d1d 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/person_acf.py +++ b/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/person_acf.py @@ -3,34 +3,32 @@ Association Confidence Filters (ACF) for people-institution associations. 
""" -import json import logging -from abc import ABC, abstractmethod -from typing import Dict, List, Any, Optional, Tuple, Union +from abc import abstractmethod +from typing import Any, Dict, Tuple -from sqlalchemy import or_, and_ -from sqlalchemy.orm import joinedload from db.database import get_db_session -from models.models import ( - User, OpenAlexAuthor, OpenAlexInstitution, OpenAlexWork, - SurfacedPerson, ACFResult, ACFPersonResult -) +from models.models import OpenAlexAuthor, SurfacedPerson, User from services.acf_base import AssociationConfidenceFilter +from sqlalchemy.orm import joinedload logger = logging.getLogger(__name__) + class PersonAssociationConfidenceFilter(AssociationConfidenceFilter): """Base class for person-institution association confidence filters.""" - + @abstractmethod - def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + def calculate_confidence( + self, person: SurfacedPerson, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: """ Calculate a confidence score (0.0-1.0) that a person is associated with the institution. - + Args: person: The SurfacedPerson object to analyze institution_info: Dictionary containing institution data (name, domains, etc.) 
- + Returns: Tuple of (confidence_score, evidence_dict) - confidence_score: Float from 0.0 to 1.0 @@ -41,167 +39,182 @@ def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[st class EmailDomainPersonFilter(PersonAssociationConfidenceFilter): """Filter that checks if a person's email domain matches the institution.""" - + @property def name(self) -> str: - return "Email Domain Person Filter" - + return 'Email Domain Person Filter' + @property def description(self) -> str: return "Checks if the person's email domain matches the institution" - - def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + + def calculate_confidence( + self, person: SurfacedPerson, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: domains = institution_info.get('domains', []) if not domains: return 0.0, {} - + evidence = {} - + # Get the user if available with get_db_session() as session: user = None if person.user_id: user = session.query(User).filter_by(id=person.user_id).first() - + if not user or not user.email: return 0.0, {} - + # Check if email domain matches any institution domain user_domain = user.email.split('@')[-1].lower() - + for domain in domains: if domain.lower() == user_domain: evidence['email_match'] = { 'email': user.email, - 'matching_domain': domain + 'matching_domain': domain, } return 0.9, evidence # High confidence for exact domain match - + # Check for subdomain match (e.g., cs.stanford.edu matches stanford.edu) - if user_domain.endswith(f".{domain.lower()}"): + if user_domain.endswith(f'.{domain.lower()}'): evidence['subdomain_match'] = { 'email': user.email, 'user_domain': user_domain, - 'institution_domain': domain + 'institution_domain': domain, } return 0.85, evidence # Slightly lower confidence for subdomain - + return 0.0, {} class ProfilePersonFilter(PersonAssociationConfidenceFilter): """Filter that analyzes a person's profile information for institution 
mentions.""" - + @property def name(self) -> str: - return "Profile Person Filter" - + return 'Profile Person Filter' + @property def description(self) -> str: return "Analyzes a person's profile information for institution mentions" - - def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + + def calculate_confidence( + self, person: SurfacedPerson, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: institution_name = institution_info.get('name', '') if not institution_name: return 0.0, {} - + evidence = {} total_score = 0.0 - + # Get the user if available with get_db_session() as session: user = None if person.user_id: user = session.query(User).filter_by(id=person.user_id).first() - + if not user: return 0.0, {} - + # Check company field if user.company and institution_name.lower() in user.company.lower(): company_score = 0.8 evidence['company_match'] = { 'company': user.company, - 'score': company_score + 'score': company_score, } total_score = max(total_score, company_score) - + # Check bio field if user.bio and institution_name.lower() in user.bio.lower(): bio_score = 0.6 evidence['bio_match'] = { - 'bio_excerpt': user.bio[:100] + '...' if len(user.bio) > 100 else user.bio, - 'score': bio_score + 'bio_excerpt': user.bio[:100] + '...' 
+ if len(user.bio) > 100 + else user.bio, + 'score': bio_score, } total_score = max(total_score, bio_score) - + # Check location field if user.location and institution_name.lower() in user.location.lower(): location_score = 0.5 evidence['location_match'] = { 'location': user.location, - 'score': location_score + 'score': location_score, } total_score = max(total_score, location_score) - + if evidence: return total_score, evidence - + return 0.0, {} class OpenAlexPersonFilter(PersonAssociationConfidenceFilter): """Filter that checks OpenAlex data for institution affiliations.""" - + @property def name(self) -> str: - return "OpenAlex Person Filter" - + return 'OpenAlex Person Filter' + @property def description(self) -> str: - return "Checks OpenAlex data for institution affiliations" - - def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + return 'Checks OpenAlex data for institution affiliations' + + def calculate_confidence( + self, person: SurfacedPerson, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: institution_name = institution_info.get('name', '') if not institution_name: return 0.0, {} - + evidence = {} - + with get_db_session() as session: # Get the OpenAlex author if available author = None if person.openalex_author_id: - author = session.query(OpenAlexAuthor).options( - joinedload(OpenAlexAuthor.institutions), - joinedload(OpenAlexAuthor.works) - ).filter_by(id=person.openalex_author_id).first() - + author = ( + session.query(OpenAlexAuthor) + .options( + joinedload(OpenAlexAuthor.institutions), + joinedload(OpenAlexAuthor.works), + ) + .filter_by(id=person.openalex_author_id) + .first() + ) + if not author: # Try to find the GitHub user in OpenAlex by name if person.user_id and person.name: user = session.query(User).filter_by(id=person.user_id).first() if user and user.name: - authors = session.query(OpenAlexAuthor).filter( - OpenAlexAuthor.display_name.ilike(f"%{user.name}%") - 
).all() - + authors = ( + session.query(OpenAlexAuthor) + .filter(OpenAlexAuthor.display_name.ilike(f'%{user.name}%')) + .all() + ) + if authors: # Use the first match for simplicity author = authors[0] - + if not author: return 0.0, {} - + # Check for institution affiliations for institution in author.institutions: if institution_name.lower() in institution.display_name.lower(): evidence['institution_affiliation'] = { 'institution': institution.display_name, - 'openalex_id': institution.openalex_id + 'openalex_id': institution.openalex_id, } return 0.9, evidence # High confidence for institution affiliation - + # Check works for institution mentions matching_works = [] for work in author.works: @@ -209,78 +222,84 @@ def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[st for coauthor in work.authors: for institution in coauthor.institutions: if institution_name.lower() in institution.display_name.lower(): - matching_works.append({ - 'title': work.title, - 'year': work.publication_year, - 'coauthor': coauthor.display_name - }) + matching_works.append( + { + 'title': work.title, + 'year': work.publication_year, + 'coauthor': coauthor.display_name, + } + ) break if matching_works: break - + if matching_works: evidence['coauthor_affiliations'] = { 'matching_works': matching_works[:3] # Limit to first 3 works } return 0.7, evidence # Medium confidence for coauthor affiliations - + return 0.0, {} class CombinedPersonFilter(PersonAssociationConfidenceFilter): """Filter that combines multiple methods for a comprehensive person score.""" - + @property def name(self) -> str: - return "Combined Person Filter" - + return 'Combined Person Filter' + @property def description(self) -> str: - return "Combines multiple filtering methods for a comprehensive person score" - - def calculate_confidence(self, person: SurfacedPerson, institution_info: Dict[str, Any]) -> Tuple[float, Dict]: + return 'Combines multiple filtering methods for a comprehensive person 
score' + + def calculate_confidence( + self, person: SurfacedPerson, institution_info: Dict[str, Any] + ) -> Tuple[float, Dict]: filters = [ EmailDomainPersonFilter(), ProfilePersonFilter(), - OpenAlexPersonFilter() + OpenAlexPersonFilter(), ] - + scores = [] evidence = {} - + for filter_obj in filters: - score, filter_evidence = filter_obj.calculate_confidence(person, institution_info) + score, filter_evidence = filter_obj.calculate_confidence( + person, institution_info + ) if score > 0: filter_name = filter_obj.name scores.append((filter_name, score)) evidence[filter_name] = filter_evidence - + if not scores: return 0.0, {} - + # Calculate weighted combined score weights = { - "Email Domain Person Filter": 0.5, - "Profile Person Filter": 0.3, - "OpenAlex Person Filter": 0.4 + 'Email Domain Person Filter': 0.5, + 'Profile Person Filter': 0.3, + 'OpenAlex Person Filter': 0.4, } - + weighted_sum = 0 weight_total = 0 - + for filter_name, score in scores: weight = weights.get(filter_name, 0.3) weighted_sum += score * weight weight_total += weight - + if weight_total == 0: return 0.0, {} - + # Normalize the final score final_score = min(1.0, weighted_sum / weight_total) - + # Add individual scores to evidence - evidence["component_scores"] = {name: score for name, score in scores} - evidence["final_score"] = final_score - - return final_score, evidence \ No newline at end of file + evidence['component_scores'] = {name: score for name, score in scores} + evidence['final_score'] = final_score + + return final_score, evidence diff --git a/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/surfacing.py b/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/surfacing.py index caf4186..8273839 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/surfacing.py +++ b/Older Experiments/scrappy-proof-of-concept/services/institution_analysis_impl/surfacing.py @@ -7,47 +7,55 @@ import logging 
from abc import ABC, abstractmethod from datetime import datetime -from typing import Dict, List, Any, Optional, Tuple, Union, Set +from typing import Any, Dict, List -from sqlalchemy import or_, and_ -from sqlalchemy.orm import joinedload -from db.database import get_db_session from clients.github_client import GitHubClient +from db.database import get_db_session from models.models import ( - Repository, User, Organization, OpenAlexAuthor, OpenAlexInstitution, - OpenAlexWork, AnalysisSession, SurfacingResult, SurfacedRepository, - SurfacedPerson + OpenAlexAuthor, + OpenAlexInstitution, + Repository, + SurfacedPerson, + SurfacedRepository, + SurfacingResult, + User, ) -from services.acf_framework import find_keyword_matches, get_repositories_from_keywords +from services.acf_framework import get_repositories_from_keywords from utils.repo_finder import search_repositories_by_date_ranges logger = logging.getLogger(__name__) + class BaseSurfacingAlgorithm(ABC): """Base class for all surfacing algorithms.""" - + @property @abstractmethod def name(self) -> str: """Return the name of the algorithm.""" pass - + @property @abstractmethod def description(self) -> str: """Return a description of how the algorithm works.""" pass - + @abstractmethod - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """ Run the surfacing algorithm and store results. 
- + Args: session_id: ID of the analysis session institution_info: Dictionary with institution information parameters: Algorithm-specific parameters - + Returns: ID of the surfacing result record """ @@ -56,111 +64,134 @@ def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dic class KeywordRepositorySurfacing(BaseSurfacingAlgorithm): """Find repositories using keywords related to the institution.""" - + @property def name(self) -> str: - return "Keyword Repository Surfacing" - + return 'Keyword Repository Surfacing' + @property def description(self) -> str: - return "Find repositories using keywords related to the institution" - - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + return 'Find repositories using keywords related to the institution' + + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """Run the keyword-based repository surfacing algorithm.""" - institution_name = institution_info.get("name", "") + institution_name = institution_info.get('name', '') if not institution_name: - raise ValueError("Institution name is required") - + raise ValueError('Institution name is required') + # Get keywords from parameters - keywords = parameters.get("keywords", []) + keywords = parameters.get('keywords', []) if not keywords: # Generate default keywords if none provided keywords = self._generate_default_keywords(institution_name) - + # Record the start of surfacing with get_db_session() as session: surfacing_result = SurfacingResult( session_id=session_id, algorithm=self.name, parameters=json.dumps(parameters), - run_at=datetime.now() + run_at=datetime.now(), ) session.add(surfacing_result) session.commit() surfacing_id = surfacing_result.id - + # First, check if these keywords have been used before existing_repositories = get_repositories_from_keywords(keywords) - + # If a GitHub token is provided, search for additional 
repositories - if "github_token" in parameters: - token = parameters["github_token"] + if 'github_token' in parameters: + token = parameters['github_token'] client = GitHubClient(token=token) - + # For each keyword, search GitHub for keyword in keywords: # Use the repo_finder module to search repositories repo_data_list = search_repositories_by_date_ranges(client, keyword) - + for repo_data in repo_data_list: - owner = repo_data.get("owner", {}).get("login") - name = repo_data.get("name") - + owner = repo_data.get('owner', {}).get('login') + name = repo_data.get('name') + if owner and name: # Check if we already have this repository in our database with get_db_session() as session: - full_name = f"{owner}/{name}" - repo = session.query(Repository).filter_by(full_name=full_name).first() - + full_name = f'{owner}/{name}' + repo = ( + session.query(Repository) + .filter_by(full_name=full_name) + .first() + ) + if repo: # Check if we already added this repo to the current surfacing - existing = session.query(SurfacedRepository).filter_by( - surfacing_id=surfacing_id, repository_id=repo.id - ).first() - + existing = ( + session.query(SurfacedRepository) + .filter_by( + surfacing_id=surfacing_id, repository_id=repo.id + ) + .first() + ) + if not existing: # Add to surfaced repositories surfaced_repo = SurfacedRepository( surfacing_id=surfacing_id, repository_id=repo.id, - discovery_method="keyword_search", - discovery_details=f"Found via keyword search: {keyword}", - surface_score=0.5 # Initial relevance score + discovery_method='keyword_search', + discovery_details=f'Found via keyword search: {keyword}', + surface_score=0.5, # Initial relevance score ) session.add(surfaced_repo) - + # Add all existing repositories from our database that match the keywords with get_db_session() as session: for repo in existing_repositories: # Check if we already added this repo - existing = session.query(SurfacedRepository).filter_by( - surfacing_id=surfacing_id, repository_id=repo.id - 
).first() - + existing = ( + session.query(SurfacedRepository) + .filter_by(surfacing_id=surfacing_id, repository_id=repo.id) + .first() + ) + if not existing: # Add to surfaced repositories surfaced_repo = SurfacedRepository( surfacing_id=surfacing_id, repository_id=repo.id, - discovery_method="keyword_history", - discovery_details=f"Found in database from past keyword searches: {', '.join(keywords)}", - surface_score=0.7 # Higher score for existing repos + discovery_method='keyword_history', + discovery_details=f'Found in database from past keyword searches: {", ".join(keywords)}', + surface_score=0.7, # Higher score for existing repos ) session.add(surfaced_repo) - + # Update the result count - result_count = session.query(SurfacedRepository).filter_by(surfacing_id=surfacing_id).count() - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result_count = ( + session.query(SurfacedRepository) + .filter_by(surfacing_id=surfacing_id) + .count() + ) + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = result_count - surfacing_result.result_summary = json.dumps({"keywords": keywords, "count": result_count}) - + surfacing_result.result_summary = json.dumps( + {'keywords': keywords, 'count': result_count} + ) + return surfacing_id - + def _generate_default_keywords(self, institution_name: str) -> List[str]: """Generate default keywords based on institution name.""" keywords = [institution_name] - + # Add variations name_parts = institution_name.split() if len(name_parts) > 1: @@ -168,140 +199,167 @@ def _generate_default_keywords(self, institution_name: str) -> List[str]: abbr = ''.join(part[0] for part in name_parts if part[0].isupper()) if len(abbr) > 1: keywords.append(abbr) - + # Add just the first part (often the place name) keywords.append(name_parts[0]) - + return keywords class DomainRepositorySurfacing(BaseSurfacingAlgorithm): 
"""Find repositories with contributors from institution domains.""" - + @property def name(self) -> str: - return "Domain Repository Surfacing" - + return 'Domain Repository Surfacing' + @property def description(self) -> str: - return "Find repositories with contributors from institution domains" - - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + return 'Find repositories with contributors from institution domains' + + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """Run the domain-based repository surfacing algorithm.""" - domains = institution_info.get("domains", []) + domains = institution_info.get('domains', []) if not domains: - raise ValueError("Institution domains are required for domain surfacing") - + raise ValueError('Institution domains are required for domain surfacing') + # Record the start of surfacing with get_db_session() as session: surfacing_result = SurfacingResult( session_id=session_id, algorithm=self.name, parameters=json.dumps(parameters), - run_at=datetime.now() + run_at=datetime.now(), ) session.add(surfacing_result) session.commit() surfacing_id = surfacing_result.id - + # Find users with matching email domains with get_db_session() as session: matching_users = [] - + for domain in domains: - users = session.query(User).filter( - User.email.isnot(None), - User.email.like(f"%@{domain}") - ).all() - + users = ( + session.query(User) + .filter(User.email.isnot(None), User.email.like(f'%@{domain}')) + .all() + ) + matching_users.extend(users) - + # Find repositories these users have contributed to repositories = set() - + for user in matching_users: # Check pull requests - prs = session.query(Repository).join( - Repository.pull_requests - ).filter( - Repository.pull_requests.any(user_id=user.id) - ).all() - + prs = ( + session.query(Repository) + .join(Repository.pull_requests) + 
.filter(Repository.pull_requests.any(user_id=user.id)) + .all() + ) + repositories.update(prs) - + # Check issues - issues = session.query(Repository).join( - Repository.issues - ).filter( - Repository.issues.any(user_id=user.id) - ).all() - + issues = ( + session.query(Repository) + .join(Repository.issues) + .filter(Repository.issues.any(user_id=user.id)) + .all() + ) + repositories.update(issues) - + # Add the found repositories to surfaced repositories for repo in repositories: # Check if we already added this repo - existing = session.query(SurfacedRepository).filter_by( - surfacing_id=surfacing_id, repository_id=repo.id - ).first() - + existing = ( + session.query(SurfacedRepository) + .filter_by(surfacing_id=surfacing_id, repository_id=repo.id) + .first() + ) + if not existing: # Add to surfaced repositories surfaced_repo = SurfacedRepository( surfacing_id=surfacing_id, repository_id=repo.id, - discovery_method="domain_contributor", - discovery_details=f"Found via contributors with institution email domains: {', '.join(domains)}", - surface_score=0.8 # High score for domain matches + discovery_method='domain_contributor', + discovery_details=f'Found via contributors with institution email domains: {", ".join(domains)}', + surface_score=0.8, # High score for domain matches ) session.add(surfaced_repo) - + # Update the result count - result_count = session.query(SurfacedRepository).filter_by(surfacing_id=surfacing_id).count() - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result_count = ( + session.query(SurfacedRepository) + .filter_by(surfacing_id=surfacing_id) + .count() + ) + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = result_count - surfacing_result.result_summary = json.dumps({"domains": domains, "count": result_count}) - + surfacing_result.result_summary = json.dumps( + {'domains': domains, 'count': 
result_count} + ) + return surfacing_id + class DomainPeopleSurfacing(BaseSurfacingAlgorithm): """Find people with email domains matching the institution.""" - + @property def name(self) -> str: - return "Domain People Surfacing" - + return 'Domain People Surfacing' + @property def description(self) -> str: - return "Find GitHub users with email domains matching the institution" - - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + return 'Find GitHub users with email domains matching the institution' + + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """Run the domain-based people surfacing algorithm.""" - domains = institution_info.get("domains", []) + domains = institution_info.get('domains', []) if not domains: - raise ValueError("Institution domains are required for domain people surfacing") - + raise ValueError( + 'Institution domains are required for domain people surfacing' + ) + # Record the start of surfacing with get_db_session() as session: surfacing_result = SurfacingResult( session_id=session_id, algorithm=self.name, parameters=json.dumps(parameters), - run_at=datetime.now() + run_at=datetime.now(), ) session.add(surfacing_result) session.commit() surfacing_id = surfacing_result.id - + # Find users with matching email domains with get_db_session() as session: for domain in domains: - users = session.query(User).filter( - User.email.isnot(None), - User.email.like(f"%@{domain}") - ).all() - + users = ( + session.query(User) + .filter(User.email.isnot(None), User.email.like(f'%@{domain}')) + .all() + ) + for user in users: # Add to surfaced people surfaced_person = SurfacedPerson( @@ -309,285 +367,368 @@ def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dic user_id=user.id, name=user.name or user.login, email=user.email, - discovery_method="email_domain", - discovery_details=f"Email domain match: {domain}", - 
surface_score=0.9 # High score for email domain matches + discovery_method='email_domain', + discovery_details=f'Email domain match: {domain}', + surface_score=0.9, # High score for email domain matches ) session.add(surfaced_person) - + # Update the result count - result_count = session.query(SurfacedPerson).filter_by(surfacing_id=surfacing_id).count() - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result_count = ( + session.query(SurfacedPerson) + .filter_by(surfacing_id=surfacing_id) + .count() + ) + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = result_count - surfacing_result.result_summary = json.dumps({"domains": domains, "count": result_count}) - + surfacing_result.result_summary = json.dumps( + {'domains': domains, 'count': result_count} + ) + return surfacing_id class ProfilePeopleSurfacing(BaseSurfacingAlgorithm): """Find people with profiles mentioning the institution.""" - + @property def name(self) -> str: - return "Profile People Surfacing" - + return 'Profile People Surfacing' + @property def description(self) -> str: - return "Find GitHub users with profiles mentioning the institution" - - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + return 'Find GitHub users with profiles mentioning the institution' + + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """Run the profile-based people surfacing algorithm.""" - institution_name = institution_info.get("name", "") + institution_name = institution_info.get('name', '') if not institution_name: - raise ValueError("Institution name is required") - + raise ValueError('Institution name is required') + # Record the start of surfacing with get_db_session() as session: surfacing_result = SurfacingResult( session_id=session_id, algorithm=self.name, 
parameters=json.dumps(parameters), - run_at=datetime.now() + run_at=datetime.now(), ) session.add(surfacing_result) session.commit() surfacing_id = surfacing_result.id - + # Find users with profiles mentioning the institution with get_db_session() as session: # Search in company field - company_users = session.query(User).filter( - User.company.isnot(None), - User.company.ilike(f"%{institution_name}%") - ).all() - + company_users = ( + session.query(User) + .filter( + User.company.isnot(None), + User.company.ilike(f'%{institution_name}%'), + ) + .all() + ) + # Search in bio field - bio_users = session.query(User).filter( - User.bio.isnot(None), - User.bio.ilike(f"%{institution_name}%") - ).all() - + bio_users = ( + session.query(User) + .filter(User.bio.isnot(None), User.bio.ilike(f'%{institution_name}%')) + .all() + ) + # Search in location field (for universities often named after locations) - location_users = session.query(User).filter( - User.location.isnot(None), - User.location.ilike(f"%{institution_name}%") - ).all() - + location_users = ( + session.query(User) + .filter( + User.location.isnot(None), + User.location.ilike(f'%{institution_name}%'), + ) + .all() + ) + # Combine results all_users = set(company_users + bio_users + location_users) - + for user in all_users: # Calculate score and details score = 0.0 details = [] - + if user.company and institution_name.lower() in user.company.lower(): score = max(score, 0.8) - details.append(f"Company match: {user.company}") - + details.append(f'Company match: {user.company}') + if user.bio and institution_name.lower() in user.bio.lower(): score = max(score, 0.6) - details.append(f"Bio match: mentions institution") - + details.append('Bio match: mentions institution') + if user.location and institution_name.lower() in user.location.lower(): score = max(score, 0.4) - details.append(f"Location match: {user.location}") - + details.append(f'Location match: {user.location}') + # Add to surfaced people surfaced_person = 
SurfacedPerson( surfacing_id=surfacing_id, user_id=user.id, name=user.name or user.login, email=user.email, - discovery_method="profile_mention", - discovery_details="; ".join(details), - surface_score=score + discovery_method='profile_mention', + discovery_details='; '.join(details), + surface_score=score, ) session.add(surfaced_person) - + # Update the result count - result_count = session.query(SurfacedPerson).filter_by(surfacing_id=surfacing_id).count() - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result_count = ( + session.query(SurfacedPerson) + .filter_by(surfacing_id=surfacing_id) + .count() + ) + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = result_count - surfacing_result.result_summary = json.dumps({"institution": institution_name, "count": result_count}) - + surfacing_result.result_summary = json.dumps( + {'institution': institution_name, 'count': result_count} + ) + return surfacing_id class OpenAlexPeopleSurfacing(BaseSurfacingAlgorithm): """Find people from OpenAlex data that are affiliated with the institution.""" - + @property def name(self) -> str: - return "OpenAlex People Surfacing" - + return 'OpenAlex People Surfacing' + @property def description(self) -> str: - return "Find authors in OpenAlex that are affiliated with the institution" - - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + return 'Find authors in OpenAlex that are affiliated with the institution' + + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """Run the OpenAlex-based people surfacing algorithm.""" - institution_name = institution_info.get("name", "") + institution_name = institution_info.get('name', '') if not institution_name: - raise ValueError("Institution name is required") - + raise ValueError('Institution 
name is required') + # Record the start of surfacing with get_db_session() as session: surfacing_result = SurfacingResult( session_id=session_id, algorithm=self.name, parameters=json.dumps(parameters), - run_at=datetime.now() + run_at=datetime.now(), ) session.add(surfacing_result) session.commit() surfacing_id = surfacing_result.id - + # Find OpenAlex institutions matching the name with get_db_session() as session: - openalex_institutions = session.query(OpenAlexInstitution).filter( - OpenAlexInstitution.display_name.ilike(f"%{institution_name}%") - ).all() - + openalex_institutions = ( + session.query(OpenAlexInstitution) + .filter(OpenAlexInstitution.display_name.ilike(f'%{institution_name}%')) + .all() + ) + if not openalex_institutions: # No matching institutions found - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = 0 - surfacing_result.result_summary = json.dumps({"error": "No matching OpenAlex institutions found"}) + surfacing_result.result_summary = json.dumps( + {'error': 'No matching OpenAlex institutions found'} + ) return surfacing_id - + # Find authors affiliated with these institutions for institution in openalex_institutions: - authors = session.query(OpenAlexAuthor).filter( - OpenAlexAuthor.institutions.any(id=institution.id) - ).all() - + authors = ( + session.query(OpenAlexAuthor) + .filter(OpenAlexAuthor.institutions.any(id=institution.id)) + .all() + ) + for author in authors: # Add to surfaced people surfaced_person = SurfacedPerson( surfacing_id=surfacing_id, openalex_author_id=author.id, name=author.display_name, - discovery_method="openalex_affiliation", - discovery_details=f"Affiliated with {institution.display_name} in OpenAlex", - surface_score=0.85 # High score for OpenAlex affiliations + discovery_method='openalex_affiliation', + 
discovery_details=f'Affiliated with {institution.display_name} in OpenAlex', + surface_score=0.85, # High score for OpenAlex affiliations ) session.add(surfaced_person) - + # Update the result count - result_count = session.query(SurfacedPerson).filter_by(surfacing_id=surfacing_id).count() - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result_count = ( + session.query(SurfacedPerson) + .filter_by(surfacing_id=surfacing_id) + .count() + ) + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = result_count - surfacing_result.result_summary = json.dumps({ - "institution": institution_name, - "openalex_institutions": [inst.display_name for inst in openalex_institutions], - "count": result_count - }) - + surfacing_result.result_summary = json.dumps( + { + 'institution': institution_name, + 'openalex_institutions': [ + inst.display_name for inst in openalex_institutions + ], + 'count': result_count, + } + ) + return surfacing_id - + + class NameRepositorySurfacing(BaseSurfacingAlgorithm): """Find repositories with names related to the institution.""" - + @property def name(self) -> str: - return "Name Repository Surfacing" - + return 'Name Repository Surfacing' + @property def description(self) -> str: - return "Find repositories with names or descriptions mentioning the institution" - - def run(self, session_id: int, institution_info: Dict[str, Any], parameters: Dict[str, Any]) -> int: + return 'Find repositories with names or descriptions mentioning the institution' + + def run( + self, + session_id: int, + institution_info: Dict[str, Any], + parameters: Dict[str, Any], + ) -> int: """Run the name-based repository surfacing algorithm.""" - institution_name = institution_info.get("name", "") + institution_name = institution_info.get('name', '') if not institution_name: - raise ValueError("Institution name is required") - + raise 
ValueError('Institution name is required') + # Generate variations of the name to search for name_variations = self._generate_name_variations(institution_name) - + # Record the start of surfacing with get_db_session() as session: surfacing_result = SurfacingResult( session_id=session_id, algorithm=self.name, parameters=json.dumps(parameters), - run_at=datetime.now() + run_at=datetime.now(), ) session.add(surfacing_result) session.commit() surfacing_id = surfacing_result.id - + # Search repositories by name and description with get_db_session() as session: for name_var in name_variations: # Search by full_name - name_repos = session.query(Repository).filter( - Repository.full_name.ilike(f"%{name_var}%") - ).all() - + name_repos = ( + session.query(Repository) + .filter(Repository.full_name.ilike(f'%{name_var}%')) + .all() + ) + # Search by description - desc_repos = session.query(Repository).filter( - Repository.description.isnot(None), - Repository.description.ilike(f"%{name_var}%") - ).all() - + desc_repos = ( + session.query(Repository) + .filter( + Repository.description.isnot(None), + Repository.description.ilike(f'%{name_var}%'), + ) + .all() + ) + # Combine results repositories = set(name_repos + desc_repos) - + # Add the found repositories for repo in repositories: # Check if we already added this repo - existing = session.query(SurfacedRepository).filter_by( - surfacing_id=surfacing_id, repository_id=repo.id - ).first() - + existing = ( + session.query(SurfacedRepository) + .filter_by(surfacing_id=surfacing_id, repository_id=repo.id) + .first() + ) + if not existing: # Calculate surface score based on match location score = 0.0 details = [] - - if repo.full_name and name_var.lower() in repo.full_name.lower(): + + if ( + repo.full_name + and name_var.lower() in repo.full_name.lower() + ): score = max(score, 0.9) - details.append(f"Name match: {repo.full_name}") - - if repo.description and name_var.lower() in repo.description.lower(): + details.append(f'Name 
match: {repo.full_name}') + + if ( + repo.description + and name_var.lower() in repo.description.lower() + ): score = max(score, 0.7) - details.append(f"Description match: {name_var} in description") - + details.append( + f'Description match: {name_var} in description' + ) + # Add to surfaced repositories surfaced_repo = SurfacedRepository( surfacing_id=surfacing_id, repository_id=repo.id, - discovery_method="name_match", - discovery_details="; ".join(details), - surface_score=score + discovery_method='name_match', + discovery_details='; '.join(details), + surface_score=score, ) session.add(surfaced_repo) - + # Update the result count - result_count = session.query(SurfacedRepository).filter_by(surfacing_id=surfacing_id).count() - surfacing_result = session.query(SurfacingResult).filter_by(id=surfacing_id).first() + result_count = ( + session.query(SurfacedRepository) + .filter_by(surfacing_id=surfacing_id) + .count() + ) + surfacing_result = ( + session.query(SurfacingResult).filter_by(id=surfacing_id).first() + ) if surfacing_result: surfacing_result.result_count = result_count - surfacing_result.result_summary = json.dumps({"name_variations": name_variations, "count": result_count}) - + surfacing_result.result_summary = json.dumps( + {'name_variations': name_variations, 'count': result_count} + ) + return surfacing_id - + def _generate_name_variations(self, institution_name: str) -> List[str]: """Generate variations of the institution name for searching.""" variations = [institution_name] - + # Add parts of the name parts = institution_name.split() if len(parts) > 1: for part in parts: if len(part) > 3: # Only add parts that are reasonably long variations.append(part) - + # Remove duplicates - return list(set(variations)) \ No newline at end of file + return list(set(variations)) diff --git a/Older Experiments/scrappy-proof-of-concept/services/openalex_ingestion.py b/Older Experiments/scrappy-proof-of-concept/services/openalex_ingestion.py index b0a585d..192c217 
100644 --- a/Older Experiments/scrappy-proof-of-concept/services/openalex_ingestion.py +++ b/Older Experiments/scrappy-proof-of-concept/services/openalex_ingestion.py @@ -1,20 +1,34 @@ # services/openalex_ingestion.py import json -import time import logging -from datetime import datetime, timezone +import time from clients.openalex_client import OpenAlexClient -from utils.common import clean_doi, get_current_time, parse_datetime from models.models import ( - OpenAlexWork, OpenAlexAuthor, OpenAlexVenue, - OpenAlexTopic, OpenAlexInstitution + OpenAlexAuthor, + OpenAlexInstitution, + OpenAlexTopic, + OpenAlexVenue, + OpenAlexWork, ) from services.discovery import record_discovery +from utils.common import clean_doi, get_current_time logger = logging.getLogger(__name__) -def process_authors(session, work, work_data, discovery_method, discovery_details, trigger_input, keyword, chain_id, branch_id, step): + +def process_authors( + session, + work, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, +): """ Process authors from work data and link them to the work. 
""" @@ -23,80 +37,96 @@ def process_authors(session, work, work_data, discovery_method, discovery_detail author_data = authorship.get('author', {}) if not author_data or not author_data.get('id'): continue - + author_id = author_data.get('id') # If the ID is a URL, extract just the ID part if author_id.startswith('https://'): author_id = author_id.split('/')[-1] - + author = session.query(OpenAlexAuthor).filter_by(openalex_id=author_id).first() - + if not author: author = OpenAlexAuthor( openalex_id=author_id, display_name=author_data.get('display_name'), orcid=author_data.get('orcid'), works_count=author_data.get('works_count'), - raw_data=json.dumps(author_data) + raw_data=json.dumps(author_data), ) author.ingested_at = get_current_time() session.add(author) session.flush() # Get the ID without committing - + record_discovery( author, discovery_method, - f"{discovery_details}; Author discovered from work {work.openalex_id}", + f'{discovery_details}; Author discovered from work {work.openalex_id}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+1 + step=step + 1, ) - + # Process institutions for this author institutions = authorship.get('institutions', []) for inst_data in institutions: if not inst_data or not inst_data.get('id'): continue - + inst_id = inst_data.get('id') # If the ID is a URL, extract just the ID part if inst_id.startswith('https://'): inst_id = inst_id.split('/')[-1] - - institution = session.query(OpenAlexInstitution).filter_by(openalex_id=inst_id).first() - + + institution = ( + session.query(OpenAlexInstitution) + .filter_by(openalex_id=inst_id) + .first() + ) + if not institution: institution = OpenAlexInstitution( openalex_id=inst_id, display_name=inst_data.get('display_name'), country_code=inst_data.get('country_code'), url=inst_data.get('url'), - raw_data=json.dumps(inst_data) + raw_data=json.dumps(inst_data), ) institution.ingested_at = get_current_time() session.add(institution) 
session.flush() # Get the ID without committing - + record_discovery( institution, discovery_method, - f"{discovery_details}; Institution discovered from author {author.openalex_id}", + f'{discovery_details}; Institution discovered from author {author.openalex_id}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+2 + step=step + 2, ) - + if institution not in author.institutions: author.institutions.append(institution) - + if author not in work.authors: work.authors.append(author) -def process_topics(session, work, work_data, discovery_method, discovery_details, trigger_input, keyword, chain_id, branch_id, step): + +def process_topics( + session, + work, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, +): """ Process topics from work data and link them to the work. """ @@ -104,19 +134,19 @@ def process_topics(session, work, work_data, discovery_method, discovery_details for topic_data in topics_data: if not topic_data or not topic_data.get('id'): continue - + topic_id = topic_data.get('id') # If the ID is a URL, extract just the ID part if topic_id.startswith('https://'): topic_id = topic_id.split('/')[-1] - + topic = session.query(OpenAlexTopic).filter_by(openalex_id=topic_id).first() - + if not topic: domain_data = topic_data.get('domain', {}) field_data = topic_data.get('field', {}) subfield_data = topic_data.get('subfield', {}) - + topic = OpenAlexTopic( openalex_id=topic_id, display_name=topic_data.get('display_name'), @@ -128,27 +158,39 @@ def process_topics(session, work, work_data, discovery_method, discovery_details subfield_id=subfield_data.get('id'), subfield_display_name=subfield_data.get('display_name'), works_count=topic_data.get('works_count'), - raw_data=json.dumps(topic_data) + raw_data=json.dumps(topic_data), ) topic.ingested_at = get_current_time() session.add(topic) session.flush() # Get the ID without committing - + 
record_discovery( topic, discovery_method, - f"{discovery_details}; Topic discovered from work {work.openalex_id}", + f'{discovery_details}; Topic discovered from work {work.openalex_id}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+1 + step=step + 1, ) - + if topic not in work.topics: work.topics.append(topic) -def process_venue(session, work, work_data, discovery_method, discovery_details, trigger_input, keyword, chain_id, branch_id, step): + +def process_venue( + session, + work, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, +): """ Process venue from work data and link it to the work. """ @@ -156,54 +198,63 @@ def process_venue(session, work, work_data, discovery_method, discovery_details, venue_data = primary_location.get('source', {}) if not venue_data or not venue_data.get('id'): return - + venue_id = venue_data.get('id') # If the ID is a URL, extract just the ID part if venue_id.startswith('https://'): venue_id = venue_id.split('/')[-1] - + venue = session.query(OpenAlexVenue).filter_by(openalex_id=venue_id).first() - + if not venue: venue = OpenAlexVenue( openalex_id=venue_id, display_name=venue_data.get('display_name'), publisher=venue_data.get('publisher'), url=venue_data.get('url'), - raw_data=json.dumps(venue_data) + raw_data=json.dumps(venue_data), ) venue.ingested_at = get_current_time() session.add(venue) session.flush() # Get the ID without committing - + record_discovery( venue, discovery_method, - f"{discovery_details}; Venue discovered from work {work.openalex_id}", + f'{discovery_details}; Venue discovered from work {work.openalex_id}', trigger_input=trigger_input, keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+1 + step=step + 1, ) - + work.venue_id = venue.id -def update_or_create_openalex_work(session, work_data, fully_fetched=True, - discovery_method="direct_ingestion", - discovery_details="Work 
discovered during repository ingestion", - trigger_input=None, keyword=None, chain_id=None, branch_id=None, step=1): + +def update_or_create_openalex_work( + session, + work_data, + fully_fetched=True, + discovery_method='direct_ingestion', + discovery_details='Work discovered during repository ingestion', + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Create or update an OpenAlexWork record based on work_data. """ openalex_id = work_data.get('id') doi = work_data.get('doi') if doi: - doi = doi.replace("https://doi.org/", "").strip() + doi = doi.replace('https://doi.org/', '').strip() existing = session.query(OpenAlexWork).filter_by(openalex_id=openalex_id).first() if existing: existing.ingested_at = get_current_time() - + # If we're fully fetching an existing work that wasn't fully fetched before, # update its data and process relations if fully_fetched and not existing.fully_fetched: @@ -215,22 +266,57 @@ def update_or_create_openalex_work(session, work_data, fully_fetched=True, existing.url = work_data.get('url') existing.fully_fetched = True existing.raw_data = json.dumps(work_data) - + # Process relations - process_authors(session, existing, work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, step) - process_topics(session, existing, work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, step) - process_venue(session, existing, work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, step) - + process_authors( + session, + existing, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, + ) + process_topics( + session, + existing, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, + ) + process_venue( + session, + existing, + work_data, + discovery_method, + discovery_details, 
+ trigger_input, + keyword, + chain_id, + branch_id, + step, + ) + record_discovery( - existing, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - chain_id=chain_id, branch_id=branch_id, step=step + existing, + discovery_method, + discovery_details, + trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, ) return existing - + work = OpenAlexWork( openalex_id=openalex_id, doi=doi, @@ -240,30 +326,75 @@ def update_or_create_openalex_work(session, work_data, fully_fetched=True, type=work_data.get('type'), url=work_data.get('url'), fully_fetched=fully_fetched, - raw_data=json.dumps(work_data) + raw_data=json.dumps(work_data), ) work.ingested_at = get_current_time() session.add(work) session.commit() # Commit to ensure work has an ID - + # Process relations for fully fetched works if fully_fetched: - process_authors(session, work, work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, step) - process_topics(session, work, work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, step) - process_venue(session, work, work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, step) - + process_authors( + session, + work, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, + ) + process_topics( + session, + work, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, + ) + process_venue( + session, + work, + work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + step, + ) + record_discovery( - work, discovery_method, discovery_details, - trigger_input=trigger_input, keyword=keyword, - chain_id=chain_id, branch_id=branch_id, step=step + work, + discovery_method, + discovery_details, + 
trigger_input=trigger_input, + keyword=keyword, + chain_id=chain_id, + branch_id=branch_id, + step=step, ) return work -def ingest_openalex_data(session, repository, discovery_method, discovery_details, - trigger_input=None, keyword=None, chain_id=None, branch_id=None, step=1): + +def ingest_openalex_data( + session, + repository, + discovery_method, + discovery_details, + trigger_input=None, + keyword=None, + chain_id=None, + branch_id=None, + step=1, +): """ Ingest OpenAlex works using all DOIs associated with a repository. For each DOI: @@ -277,12 +408,12 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail for doi_obj in repository.dois: doi_str = doi_obj.doi cleaned = clean_doi(doi_str) - logger.info(f"Processing DOI: {doi_str} (cleaned: {cleaned})") + logger.info(f'Processing DOI: {doi_str} (cleaned: {cleaned})') start_time = time.time() work_data = client_oa.get_work_by_doi(doi_str) elapsed = time.time() - start_time - logger.info(f"Query for OpenAlex work took {elapsed:.2f} seconds.") - + logger.info(f'Query for OpenAlex work took {elapsed:.2f} seconds.') + if work_data: work = update_or_create_openalex_work( session, @@ -294,29 +425,29 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+1 # Increment step for work creation + step=step + 1, # Increment step for work creation ) - + try: work_data_dict = json.loads(work.raw_data) except Exception: work_data_dict = {} - + # Process referenced works - references = work_data_dict.get("referenced_works", []) + references = work_data_dict.get('referenced_works', []) for ref_id in references: - cited_work = session.query(OpenAlexWork).filter_by(openalex_id=ref_id).first() + cited_work = ( + session.query(OpenAlexWork).filter_by(openalex_id=ref_id).first() + ) if not cited_work: # Create a stub record for the cited work cited_work = OpenAlexWork( - openalex_id=ref_id, - 
fully_fetched=False, - raw_data="{}" + openalex_id=ref_id, fully_fetched=False, raw_data='{}' ) cited_work.ingested_at = get_current_time() session.add(cited_work) session.commit() - + # Fetch full data for works that haven't been fully fetched yet if not cited_work.fully_fetched: # Fetch full data for the referenced work @@ -324,28 +455,64 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail if full_work_data: # Update the stub record with full data cited_work.doi = full_work_data.get('doi') - if cited_work.doi and cited_work.doi.startswith('https://doi.org/'): - cited_work.doi = cited_work.doi.replace('https://doi.org/', '') + if cited_work.doi and cited_work.doi.startswith( + 'https://doi.org/' + ): + cited_work.doi = cited_work.doi.replace( + 'https://doi.org/', '' + ) cited_work.title = full_work_data.get('title') - cited_work.publication_year = full_work_data.get('publication_year') + cited_work.publication_year = full_work_data.get( + 'publication_year' + ) cited_work.abstract = full_work_data.get('abstract') or None cited_work.type = full_work_data.get('type') cited_work.url = full_work_data.get('url') cited_work.fully_fetched = True cited_work.raw_data = json.dumps(full_work_data) - + # Process relations for the newly fetched work - using current step + 2 current_step = step + 2 # Increment for references - process_authors(session, cited_work, full_work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, current_step) - process_topics(session, cited_work, full_work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, current_step) - process_venue(session, cited_work, full_work_data, discovery_method, discovery_details, - trigger_input, keyword, chain_id, branch_id, current_step) - + process_authors( + session, + cited_work, + full_work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + current_step, + 
) + process_topics( + session, + cited_work, + full_work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + current_step, + ) + process_venue( + session, + cited_work, + full_work_data, + discovery_method, + discovery_details, + trigger_input, + keyword, + chain_id, + branch_id, + current_step, + ) + # Add a delay to avoid hitting rate limits time.sleep(0.5) - + if cited_work not in work.cited_works: record_discovery( cited_work, @@ -355,19 +522,21 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+2 # Increment step for citations + step=step + 2, # Increment step for citations ) work.cited_works.append(cited_work) - + # Process citing works (NEW) - logger.info(f"Fetching works citing {work.openalex_id}...") + logger.info(f'Fetching works citing {work.openalex_id}...') citing_works_data = client_oa.get_citing_works(work.openalex_id) - logger.info(f"Found {len(citing_works_data)} works citing {work.openalex_id}") - + logger.info( + f'Found {len(citing_works_data)} works citing {work.openalex_id}' + ) + for citing_work_data in citing_works_data: if not citing_work_data.get('id'): continue - + citing_work = update_or_create_openalex_work( session, citing_work_data, @@ -378,9 +547,9 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+2 # Same level as references + step=step + 2, # Same level as references ) - + # Establish the citation relationship - this citing work cites our work if work not in citing_work.cited_works: citing_work.cited_works.append(work) @@ -392,18 +561,24 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+2 + step=step + 2, ) - + # Add a delay to avoid hitting rate limits time.sleep(0.2) - + 
session.commit() - + for author in work.authors: - additional_works = client_oa.get_additional_works_for_author(author.openalex_id, per_page=5) + additional_works = client_oa.get_additional_works_for_author( + author.openalex_id, per_page=5 + ) for add_work_data in additional_works: - if not session.query(OpenAlexWork).filter_by(openalex_id=add_work_data.get('id')).first(): + if ( + not session.query(OpenAlexWork) + .filter_by(openalex_id=add_work_data.get('id')) + .first() + ): update_or_create_openalex_work( session, add_work_data, @@ -414,7 +589,7 @@ def ingest_openalex_data(session, repository, discovery_method, discovery_detail keyword=keyword, chain_id=chain_id, branch_id=branch_id, - step=step+3 # Increment step for author's works + step=step + 3, # Increment step for author's works ) else: - logger.error(f"Failed to fetch work for DOI {doi_str} from OpenAlex.") \ No newline at end of file + logger.error(f'Failed to fetch work for DOI {doi_str} from OpenAlex.') diff --git a/Older Experiments/scrappy-proof-of-concept/services/query_service.py b/Older Experiments/scrappy-proof-of-concept/services/query_service.py index 67d4180..7cfb7f1 100644 --- a/Older Experiments/scrappy-proof-of-concept/services/query_service.py +++ b/Older Experiments/scrappy-proof-of-concept/services/query_service.py @@ -1,35 +1,44 @@ # services/query_service.py from db.database import get_db_session -from models.models import User, PullRequest, Repository, OpenAlexWork, OpenAlexInstitution, OpenAlexAuthor -from sqlalchemy import desc, func, select +from models.models import ( + OpenAlexAuthor, + OpenAlexInstitution, + OpenAlexWork, + PullRequest, + Repository, + User, +) +from sqlalchemy import desc, func + def get_top_contributors(repo_id: int, limit: int = 10): with get_db_session() as session: results = ( session.query( - User.login.label("user_login"), - func.count(PullRequest.id).label("merged_count") + User.login.label('user_login'), + 
func.count(PullRequest.id).label('merged_count'), ) .join(PullRequest, PullRequest.user_id == User.id) .filter(PullRequest.merged_at.isnot(None)) .filter(PullRequest.repository_id == repo_id) .group_by(User.login) - .order_by(desc("merged_count")) + .order_by(desc('merged_count')) .limit(limit) .all() ) return results + def get_institutions_with_doi(repo_id: int): with get_db_session() as session: repository = session.query(Repository).filter_by(id=repo_id).first() if not repository: - raise ValueError(f"Repository with ID {repo_id} not found.") + raise ValueError(f'Repository with ID {repo_id} not found.') doi_list = [doi_obj.doi for doi_obj in repository.dois] institutions = ( session.query( OpenAlexInstitution.display_name, - func.count(func.distinct(OpenAlexAuthor.id)).label("author_count") + func.count(func.distinct(OpenAlexAuthor.id)).label('author_count'), ) .join(OpenAlexAuthor, OpenAlexInstitution.authors) .join(OpenAlexWork, OpenAlexAuthor.works) diff --git a/Older Experiments/scrappy-proof-of-concept/utils/common.py b/Older Experiments/scrappy-proof-of-concept/utils/common.py index c46ab2c..f93131c 100644 --- a/Older Experiments/scrappy-proof-of-concept/utils/common.py +++ b/Older Experiments/scrappy-proof-of-concept/utils/common.py @@ -1,37 +1,42 @@ # utils/common.py -import re import json -from dateutil import parser +import re from datetime import datetime, timezone +from dateutil import parser + + def parse_github_url(url: str) -> tuple: """ Extracts (owner, repo) from a GitHub URL. Example: "https://github.com/user/repo.git" -> ("user", "repo") """ - pattern = r"github\.com/([^/]+)/([^/]+)" + pattern = r'github\.com/([^/]+)/([^/]+)' match = re.search(pattern, url) if match: owner, repo = match.groups() # Remove .git suffix if present - repo = repo.replace(".git", "") + repo = repo.replace('.git', '') return owner, repo return None, None + def clean_doi(doi: str) -> str: """ Clean DOI string by stripping whitespace and unwanted trailing characters. 
""" - return doi.strip().rstrip(").,;") + return doi.strip().rstrip(').,;') + def extract_dois_from_text(text: str): """ Extract all DOI strings from a given text. """ - pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+" + pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+' return re.findall(pattern, text, flags=re.IGNORECASE) + def parse_datetime(dt_str: str): """ Parse an ISO formatted datetime string. @@ -43,12 +48,14 @@ def parse_datetime(dt_str: str): return None return None + def save_json_field(data): """ Convert data to a JSON string if data is present. """ return json.dumps(data) if data else None + def get_current_time(): """ Get current UTC time. diff --git a/Older Experiments/scrappy-proof-of-concept/utils/filters.py b/Older Experiments/scrappy-proof-of-concept/utils/filters.py index ec1de0e..8d54eca 100644 --- a/Older Experiments/scrappy-proof-of-concept/utils/filters.py +++ b/Older Experiments/scrappy-proof-of-concept/utils/filters.py @@ -1,6 +1,7 @@ # filters.py from models.models import Repository + def filter_has_doi(query): """ Return repositories that have at least one associated DOI. @@ -8,6 +9,7 @@ def filter_has_doi(query): """ return query.join(Repository.dois).distinct() + def filter_has_stars(query): """ Return repositories that have at least one star. @@ -15,6 +17,7 @@ def filter_has_stars(query): """ return query.filter(Repository.stargazers_count > 0) + def filter_has_contributors(query): """ Return repositories that have at least one contributor. @@ -23,6 +26,7 @@ def filter_has_contributors(query): """ return query.join(Repository.pull_requests).distinct() + def filter_has_forks(query): """ Return repositories that have at least one fork. 
diff --git a/Older Experiments/scrappy-proof-of-concept/utils/logging_config.py b/Older Experiments/scrappy-proof-of-concept/utils/logging_config.py index 5e6a4b2..ad4006d 100644 --- a/Older Experiments/scrappy-proof-of-concept/utils/logging_config.py +++ b/Older Experiments/scrappy-proof-of-concept/utils/logging_config.py @@ -1,7 +1,9 @@ # utils/logging_config.py import logging + from config import LOG_LEVEL + def setup_logging(): """ Configures logging for the application. @@ -10,7 +12,5 @@ def setup_logging(): logging.basicConfig( level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - handlers=[ - logging.StreamHandler() - ] + handlers=[logging.StreamHandler()], ) diff --git a/Older Experiments/scrappy-proof-of-concept/utils/repo_finder.py b/Older Experiments/scrappy-proof-of-concept/utils/repo_finder.py index 9e27657..8a075bf 100644 --- a/Older Experiments/scrappy-proof-of-concept/utils/repo_finder.py +++ b/Older Experiments/scrappy-proof-of-concept/utils/repo_finder.py @@ -2,26 +2,28 @@ import logging import time from datetime import datetime, timedelta + from dateutil.relativedelta import relativedelta logger = logging.getLogger(__name__) + def search_repositories(client, keywords): """ Search GitHub repositories using the GitHub API. Returns a list of repository JSON objects. 
""" - search_url = f"{client.BASE_URL}/search/repositories" + search_url = f'{client.BASE_URL}/search/repositories' all_repositories = [] per_page = 100 page = 1 while True: - params = {"q": keywords, "per_page": per_page, "page": page} - logger.info(f"Searching repositories: page {page}") + params = {'q': keywords, 'per_page': per_page, 'page': page} + logger.info(f'Searching repositories: page {page}') results = client.get(search_url, params=params) - if not results or "items" not in results: + if not results or 'items' not in results: break - items = results["items"] + items = results['items'] all_repositories.extend(items) if len(items) < per_page: break @@ -29,37 +31,46 @@ def search_repositories(client, keywords): time.sleep(1) return all_repositories -def search_repositories_in_range(client, keywords, start_date, end_date, threshold=1000): + +def search_repositories_in_range( + client, keywords, start_date, end_date, threshold=1000 +): """ Search GitHub repositories with the given keywords created between start_date and end_date. If the total_count is >= threshold, subdivide the range recursively. Logs the count for each date range and returns the repository JSON objects. 
""" - query = f"\"{keywords}\" created:{start_date.strftime('%Y-%m-%d')}..{end_date.strftime('%Y-%m-%d')}" - search_url = f"{client.BASE_URL}/search/repositories" - params = {"q": query, "per_page": 1, "page": 1} + query = f'"{keywords}" created:{start_date.strftime("%Y-%m-%d")}..{end_date.strftime("%Y-%m-%d")}' + search_url = f'{client.BASE_URL}/search/repositories' + params = {'q': query, 'per_page': 1, 'page': 1} response = client.get(search_url, params=params) if not response: return [] - total_count = response.get("total_count", 0) - logger.info(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')} -> {total_count} repos found") + total_count = response.get('total_count', 0) + logger.info( + f'Date Range: {start_date.strftime("%Y-%m-%d")} to {end_date.strftime("%Y-%m-%d")} -> {total_count} repos found' + ) if total_count >= threshold: mid_timedelta = (end_date - start_date) / 2 mid_date = start_date + mid_timedelta - left_repos = search_repositories_in_range(client, keywords, start_date, mid_date, threshold) + left_repos = search_repositories_in_range( + client, keywords, start_date, mid_date, threshold + ) right_start = mid_date + timedelta(days=1) - right_repos = search_repositories_in_range(client, keywords, right_start, end_date, threshold) + right_repos = search_repositories_in_range( + client, keywords, right_start, end_date, threshold + ) return left_repos + right_repos else: all_repositories = [] per_page = 100 page = 1 while True: - params = {"q": query, "per_page": per_page, "page": page} + params = {'q': query, 'per_page': per_page, 'page': page} results = client.get(search_url, params=params) - if not results or "items" not in results: + if not results or 'items' not in results: break - items = results["items"] + items = results['items'] all_repositories.extend(items) if len(items) < per_page: break @@ -67,6 +78,7 @@ def search_repositories_in_range(client, keywords, start_date, end_date, thresho time.sleep(1) return 
all_repositories + def search_repositories_by_date_ranges(client, keywords): """ Generate date-range chunks starting from the current time back to January 1st of the current year at 00:01 @@ -78,17 +90,25 @@ def search_repositories_by_date_ranges(client, keywords): current_year_boundary = datetime(now.year, 1, 1, 0, 1) first_chunk_start = current_year_boundary first_chunk_end = now - repos = search_repositories_in_range(client, keywords, first_chunk_start, first_chunk_end) + repos = search_repositories_in_range( + client, keywords, first_chunk_start, first_chunk_end + ) chunks_results.extend(repos) - logger.info(f"First chunk (current year): {first_chunk_start.strftime('%Y-%m-%d')} to {first_chunk_end.strftime('%Y-%m-%d')} -> {len(repos)} repos found") - + logger.info( + f'First chunk (current year): {first_chunk_start.strftime("%Y-%m-%d")} to {first_chunk_end.strftime("%Y-%m-%d")} -> {len(repos)} repos found' + ) + next_end = current_year_boundary - timedelta(seconds=1) while True: next_start = next_end - relativedelta(years=1) + timedelta(seconds=1) if next_start.year < 2008: break - repos_chunk = search_repositories_in_range(client, keywords, next_start, next_end) - logger.info(f"12-month chunk: {next_start.strftime('%Y-%m-%d')} to {next_end.strftime('%Y-%m-%d')} -> {len(repos_chunk)} repos found") + repos_chunk = search_repositories_in_range( + client, keywords, next_start, next_end + ) + logger.info( + f'12-month chunk: {next_start.strftime("%Y-%m-%d")} to {next_end.strftime("%Y-%m-%d")} -> {len(repos_chunk)} repos found' + ) chunks_results.extend(repos_chunk) next_end = next_start - timedelta(seconds=1) - return chunks_results \ No newline at end of file + return chunks_results diff --git a/Older Experiments/scripts/ecosyst.ms-api.py b/Older Experiments/scripts/ecosyst.ms-api.py index 58636e5..aae7cbc 100644 --- a/Older Experiments/scripts/ecosyst.ms-api.py +++ b/Older Experiments/scripts/ecosyst.ms-api.py @@ -260,7 +260,7 @@ def 
process_paper_mentions(paper_mentions_url): proj_dict = project_response.json() if 'ecosystem' in proj_dict: paper_mentions.append( - f"{proj_dict['ecosystem']}:{proj_dict['name']}" + f'{proj_dict["ecosystem"]}:{proj_dict["name"]}' ) except requests.exceptions.RequestException as e: print(f'Request failed for project {project_url}: {e}') @@ -305,7 +305,7 @@ def process_project(project_u): { 'ID': project_dict['czi_id'], 'Label': 'Project', - 'Name': f"{project_dict['ecosystem']}:{project_dict['name']}", + 'Name': f'{project_dict["ecosystem"]}:{project_dict["name"]}', 'Homepage': home, 'repository_url': repo, } @@ -317,21 +317,21 @@ def process_project(project_u): [shared_resources.project_df, this_project] ).drop_duplicates(subset=['ID']) - project_mentions_url = f"{project_dict['mentions_url']}?page=1&per_page=1000" + project_mentions_url = f'{project_dict["mentions_url"]}?page=1&per_page=1000' mentions_response = requests.get( project_mentions_url, headers=headers, timeout=10 ) mentions_dict = mentions_response.json() print(f'Querying: {project_u}') print( - f"There are {mentions_response.headers['total-pages']} pages of mentions to fetch." + f'There are {mentions_response.headers["total-pages"]} pages of mentions to fetch.' 
) - print(f"For a total of: {mentions_response.headers['total-count']} papers") + print(f'For a total of: {mentions_response.headers["total-count"]} papers') paper_urls_list = [] total_pages = int(mentions_response.headers['total-pages']) for page_num in range(1, total_pages + 1): project_mentions_url = ( - f"{project_dict['mentions_url']}?page={page_num}&per_page=1000" + f'{project_dict["mentions_url"]}?page={page_num}&per_page=1000' ) mentions_dict = requests.get( project_mentions_url, headers=headers, timeout=10 diff --git a/Older Experiments/scripts/repo_cite/repo_cite.py b/Older Experiments/scripts/repo_cite/repo_cite.py index 61b0152..f6cd4c3 100644 --- a/Older Experiments/scripts/repo_cite/repo_cite.py +++ b/Older Experiments/scripts/repo_cite/repo_cite.py @@ -13,17 +13,18 @@ - RECORD_LIMIT and MAX_DEPTH are now read as integers (with 0 meaning “all”) to keep their type consistent. """ -import requests import json +import logging +import os import re import time -import logging -from urllib.parse import quote from collections import deque from datetime import datetime, timedelta +from typing import Any, Dict, Optional +from urllib.parse import quote + +import requests from dotenv import load_dotenv -import os -from typing import Optional, Dict, Any # Configure logging logging.basicConfig( @@ -31,8 +32,8 @@ format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('repo_cite.log'), # More descriptive log file name - logging.StreamHandler() # Also log to console - ] + logging.StreamHandler(), # Also log to console + ], ) # Load environment variables from .env file @@ -49,10 +50,14 @@ VISITED_PAPERS: set = set() # Set your email for OpenAlex API rate limit increase, sourced from the .env file if available -OPENALEX_EMAIL: str = os.getenv('OPENALEX_EMAIL', 'your.email@example.com') # Replace in your .env file +OPENALEX_EMAIL: str = os.getenv( + 'OPENALEX_EMAIL', 'your.email@example.com' +) # Replace in your .env file # GitHub personal 
access token (read from .env file) -GITHUB_TOKEN: Optional[str] = os.getenv('GITHUB_TOKEN') # Ensure your .env file has GITHUB_TOKEN= +GITHUB_TOKEN: Optional[str] = os.getenv( + 'GITHUB_TOKEN' +) # Ensure your .env file has GITHUB_TOKEN= # Set the number of records to retrieve per API call (0 means all) try: @@ -72,6 +77,7 @@ # Delay between retries (in seconds) RETRY_DELAY: int = 5 + def get_doi_from_github_repo(repo_owner: str, repo_name: str) -> Optional[str]: """ Fetch the DOI from a GitHub repository. @@ -85,21 +91,21 @@ def get_doi_from_github_repo(repo_owner: str, repo_name: str) -> Optional[str]: This function searches for a 'CITATION.cff' file in the repository first. If not found, it then searches the 'README.md' for DOI patterns. - + TODO: - Extend this function to search for a '.zenodo.json' file, which may also contain metadata. - + Note: Scanning 'README.md' may sometimes capture DOIs unrelated to the software’s own citation. """ logging.info(f"Fetching DOI from GitHub repository '{repo_owner}/{repo_name}'") - url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents" + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents' headers = {} if GITHUB_TOKEN: headers['Authorization'] = f'token {GITHUB_TOKEN}' response = requests.get(url, headers=headers) if response.status_code != 200: - logging.error(f"Error fetching repository contents: {response.status_code}") + logging.error(f'Error fetching repository contents: {response.status_code}') return None contents = response.json() # Search for CITATION.cff @@ -120,9 +126,13 @@ def get_doi_from_github_repo(repo_owner: str, repo_name: str) -> Optional[str]: return doi else: # Not treating inability to fetch CITATION.cff as an error - logging.info("Unable to fetch 'CITATION.cff' content; continuing search in README.md") + logging.info( + "Unable to fetch 'CITATION.cff' content; continuing search in README.md" + ) else: - logging.info("'CITATION.cff' does not have a download URL; 
continuing search") + logging.info( + "'CITATION.cff' does not have a download URL; continuing search" + ) # If CITATION.cff not found or DOI not found, try README.md for item in contents: if item['name'].lower() == 'readme.md': @@ -132,7 +142,11 @@ def get_doi_from_github_repo(repo_owner: str, repo_name: str) -> Optional[str]: readme_response = requests.get(readme_url, headers=headers) if readme_response.status_code == 200: readme_content = readme_response.text - doi_matches = re.findall(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', readme_content, re.IGNORECASE) + doi_matches = re.findall( + r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', + readme_content, + re.IGNORECASE, + ) if doi_matches: doi = doi_matches[0] logging.info(f"DOI found in 'README.md': {doi}") @@ -140,9 +154,10 @@ def get_doi_from_github_repo(repo_owner: str, repo_name: str) -> Optional[str]: else: logging.error("Error fetching 'README.md'") return None - logging.warning("DOI not found in the repository") + logging.warning('DOI not found in the repository') # Implicitly returns None + def get_paper_details(doi: str) -> Optional[dict]: """ Fetch paper details from OpenAlex using the DOI. @@ -153,16 +168,19 @@ def get_paper_details(doi: str) -> Optional[dict]: Returns: Optional[dict]: Paper data as a dictionary if retrieval is successful; otherwise, None. 
""" - logging.info(f"Fetching paper details for DOI: {doi}") - url = f"https://api.openalex.org/works/doi:{quote(doi)}?mailto={OPENALEX_EMAIL}" + logging.info(f'Fetching paper details for DOI: {doi}') + url = f'https://api.openalex.org/works/doi:{quote(doi)}?mailto={OPENALEX_EMAIL}' response = make_api_request(url) if response is None: return None paper_data = response.json() - logging.debug(f"Paper data retrieved: {paper_data}") + logging.debug(f'Paper data retrieved: {paper_data}') return paper_data -def make_api_request(url: str, headers: Optional[dict] = None, params: Optional[dict] = None) -> Optional[requests.Response]: + +def make_api_request( + url: str, headers: Optional[dict] = None, params: Optional[dict] = None +) -> Optional[requests.Response]: """ Make an API request with retry logic and exponential backoff. @@ -185,19 +203,26 @@ def make_api_request(url: str, headers: Optional[dict] = None, params: Optional[ elif response.status_code in [429, 500, 502, 503, 504]: retries += 1 sleep_time = RETRY_DELAY * (2 ** (retries - 1)) - logging.warning(f"API request failed with status {response.status_code}. Retrying in {sleep_time} seconds...") + logging.warning( + f'API request failed with status {response.status_code}. Retrying in {sleep_time} seconds...' + ) time.sleep(sleep_time) else: - logging.error(f"API request failed with status {response.status_code}. URL: {url}") + logging.error( + f'API request failed with status {response.status_code}. URL: {url}' + ) return None except requests.exceptions.RequestException as e: retries += 1 sleep_time = RETRY_DELAY * (2 ** (retries - 1)) - logging.warning(f"Request exception: {e}. Retrying in {sleep_time} seconds...") + logging.warning( + f'Request exception: {e}. Retrying in {sleep_time} seconds...' 
+ ) time.sleep(sleep_time) - logging.error(f"Failed to retrieve data after {MAX_RETRIES} attempts.") + logging.error(f'Failed to retrieve data after {MAX_RETRIES} attempts.') return None + def process_paper_data(paper_data: dict) -> None: """ Process and store paper data from OpenAlex. @@ -210,9 +235,9 @@ def process_paper_data(paper_data: dict) -> None: """ openalex_id = paper_data.get('id') if openalex_id in PAPERS_DICT: - logging.debug(f"Paper {openalex_id} already processed") + logging.debug(f'Paper {openalex_id} already processed') return - logging.info(f"Processing paper {openalex_id}") + logging.info(f'Processing paper {openalex_id}') title = paper_data.get('title') doi = paper_data.get('doi') publication_date = paper_data.get('publication_date') @@ -236,11 +261,11 @@ def process_paper_data(paper_data: dict) -> None: for concept in concepts: topic_id = concept.get('id') if topic_id and topic_id not in TOPICS_DICT: - logging.info(f"Adding topic {topic_id}") + logging.info(f'Adding topic {topic_id}') topic_node = { 'id': topic_id, 'name': concept.get('display_name'), - 'type': 'topic' + 'type': 'topic', } TOPICS_DICT[topic_id] = topic_node if topic_id: @@ -253,25 +278,25 @@ def process_paper_data(paper_data: dict) -> None: author_data = author_entry.get('author', {}) author_id = author_data.get('id') if author_id and author_id not in AUTHORS_DICT: - logging.info(f"Adding author {author_id}") + logging.info(f'Adding author {author_id}') author_node = { 'id': author_id, 'name': author_data.get('display_name'), 'orcid': author_data.get('orcid'), 'affiliations': [], 'type': 'person', - 'papers_authored': [] + 'papers_authored': [], } # Process affiliations affiliations_data = author_entry.get('institutions', []) for inst_data in affiliations_data: inst_id = inst_data.get('id') if inst_id and inst_id not in INSTITUTIONS_DICT: - logging.info(f"Adding institution {inst_id}") + logging.info(f'Adding institution {inst_id}') institution_node = { 'id': inst_id, 'name': 
inst_data.get('display_name'), - 'type': 'institution' + 'type': 'institution', } INSTITUTIONS_DICT[inst_id] = institution_node if inst_id: @@ -293,7 +318,7 @@ def process_paper_data(paper_data: dict) -> None: 'authors': authors, 'topics': topics, 'cited_by': [], - 'references': [] + 'references': [], } # Process references @@ -303,7 +328,8 @@ def process_paper_data(paper_data: dict) -> None: # TODO: Consider retaining additional metadata from referenced_works if needed. PAPERS_DICT[openalex_id] = paper_node - logging.debug(f"Paper node created: {paper_node}") + logging.debug(f'Paper node created: {paper_node}') + def get_papers_by_author(author_id: str) -> None: """ @@ -312,40 +338,45 @@ def get_papers_by_author(author_id: str) -> None: Parameters: author_id (str): The OpenAlex identifier for the author. """ - logging.info(f"Fetching papers authored by {author_id}") + logging.info(f'Fetching papers authored by {author_id}') page = 1 per_page = 200 # Maximum allowed per-page value records_retrieved = 0 while True: params = { - "filter": f"authorships.author.id:{author_id}", - "page": page, - "per-page": per_page, - "mailto": OPENALEX_EMAIL + 'filter': f'authorships.author.id:{author_id}', + 'page': page, + 'per-page': per_page, + 'mailto': OPENALEX_EMAIL, } - url = "https://api.openalex.org/works" + url = 'https://api.openalex.org/works' response = make_api_request(url, params=params) if response is None: break data = response.json() works = data.get('results', []) if not works: - logging.info(f"No more papers found for author {author_id}") + logging.info(f'No more papers found for author {author_id}') break for work in works: process_paper_data(work) records_retrieved += 1 if RECORD_LIMIT != 0 and records_retrieved >= RECORD_LIMIT: - logging.info(f"Reached record limit ({RECORD_LIMIT}) for author {author_id}") + logging.info( + f'Reached record limit ({RECORD_LIMIT}) for author {author_id}' + ) return - if data.get('meta', {}).get('next_page') and (RECORD_LIMIT == 
0 or records_retrieved < RECORD_LIMIT): + if data.get('meta', {}).get('next_page') and ( + RECORD_LIMIT == 0 or records_retrieved < RECORD_LIMIT + ): page += 1 - logging.debug(f"Moving to page {page} for author {author_id}") + logging.debug(f'Moving to page {page} for author {author_id}') time.sleep(1) # Respect rate limits else: break + def iterative_citation_gathering(start_paper_id: str) -> None: """ Perform iterative citation gathering up to MAX_DEPTH starting from a given paper. @@ -353,7 +384,7 @@ def iterative_citation_gathering(start_paper_id: str) -> None: Parameters: start_paper_id (str): The OpenAlex identifier for the starting paper. """ - logging.info(f"Starting iterative citation gathering from paper {start_paper_id}") + logging.info(f'Starting iterative citation gathering from paper {start_paper_id}') queue = deque() queue.append((start_paper_id, 1)) while queue: @@ -364,11 +395,13 @@ def iterative_citation_gathering(start_paper_id: str) -> None: if current_paper_id in VISITED_PAPERS: continue VISITED_PAPERS.add(current_paper_id) - logging.info(f"Processing paper {current_paper_id} at depth {current_depth}") + logging.info( + f'Processing paper {current_paper_id} at depth {current_depth}' + ) # Fetch and process the paper details if not already done if current_paper_id not in PAPERS_DICT: - url = f"https://api.openalex.org/works/{current_paper_id}" - params = {"mailto": OPENALEX_EMAIL} + url = f'https://api.openalex.org/works/{current_paper_id}' + params = {'mailto': OPENALEX_EMAIL} response = make_api_request(url, params=params) if response is None: continue @@ -386,19 +419,21 @@ def iterative_citation_gathering(start_paper_id: str) -> None: records_retrieved = 0 while True: params = { - "filter": f"cites:{current_paper_id}", - "page": page, - "per-page": per_page, - "mailto": OPENALEX_EMAIL + 'filter': f'cites:{current_paper_id}', + 'page': page, + 'per-page': per_page, + 'mailto': OPENALEX_EMAIL, } - url = "https://api.openalex.org/works" + url = 
'https://api.openalex.org/works' response = make_api_request(url, params=params) if response is None: break data = response.json() works = data.get('results', []) if not works: - logging.info(f"No more citing papers found for paper {current_paper_id} at depth {current_depth}") + logging.info( + f'No more citing papers found for paper {current_paper_id} at depth {current_depth}' + ) break for work in works: citing_paper_id = work.get('id') @@ -407,23 +442,35 @@ def iterative_citation_gathering(start_paper_id: str) -> None: process_paper_data(work) # Update cited_by attribute if current_paper_id in PAPERS_DICT: - if citing_paper_id not in PAPERS_DICT[current_paper_id]['cited_by']: - PAPERS_DICT[current_paper_id]['cited_by'].append(citing_paper_id) + if ( + citing_paper_id + not in PAPERS_DICT[current_paper_id]['cited_by'] + ): + PAPERS_DICT[current_paper_id]['cited_by'].append( + citing_paper_id + ) records_retrieved += 1 queue.append((citing_paper_id, current_depth + 1)) if RECORD_LIMIT != 0 and records_retrieved >= RECORD_LIMIT: - logging.info(f"Reached record limit ({RECORD_LIMIT}) for citing papers of {current_paper_id}") + logging.info( + f'Reached record limit ({RECORD_LIMIT}) for citing papers of {current_paper_id}' + ) break - if data.get('meta', {}).get('next_page') and (RECORD_LIMIT == 0 or records_retrieved < RECORD_LIMIT): + if data.get('meta', {}).get('next_page') and ( + RECORD_LIMIT == 0 or records_retrieved < RECORD_LIMIT + ): page += 1 - logging.debug(f"Moving to page {page} for citing papers of {current_paper_id}") + logging.debug( + f'Moving to page {page} for citing papers of {current_paper_id}' + ) time.sleep(1) else: break except KeyboardInterrupt: - logging.warning("Process interrupted by user. Saving collected data.") + logging.warning('Process interrupted by user. 
Saving collected data.') break + def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: """ Collect data from the GitHub repository, including repository details, contributors, @@ -440,12 +487,12 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: headers = {} if GITHUB_TOKEN: headers['Authorization'] = f'token {GITHUB_TOKEN}' - base_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}" + base_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}' repo_data: dict = {} # Get repository details response = make_api_request(base_url, headers=headers) if response is None: - logging.error("Failed to fetch repository data.") + logging.error('Failed to fetch repository data.') return None repo_info = response.json() repo_data['name'] = repo_info.get('name') @@ -465,13 +512,13 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: 'CONTRIBUTING.md': False, 'GOVERNANCE.md': False, 'FUNDING.yml': False, - 'funding.json': False + 'funding.json': False, } # Check for README and other files - contents_url = f"{base_url}/contents" + contents_url = f'{base_url}/contents' response = make_api_request(contents_url, headers=headers) if response is None: - logging.error("Failed to fetch repository contents.") + logging.error('Failed to fetch repository contents.') return None contents = response.json() for item in contents: @@ -487,11 +534,11 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: repo_data['documentation_files'][key] = True break # Get contributors - contributors_url = f"{base_url}/contributors" + contributors_url = f'{base_url}/contributors' contributors_set = set() page = 1 while True: - params = {"per_page": 100, "page": page} + params = {'per_page': 100, 'page': page} response = make_api_request(contributors_url, headers=headers, params=params) if response is None: break @@ -504,17 +551,17 @@ def collect_github_data(repo_owner: str, repo_name: str) -> 
Optional[dict]: contributors_set.add(login) if 'next' in response.links: page += 1 - logging.debug(f"Fetching page {page} of contributors") + logging.debug(f'Fetching page {page} of contributors') else: break repo_data['num_contributors'] = len(contributors_set) - logging.info(f"Total contributors: {repo_data['num_contributors']}") + logging.info(f'Total contributors: {repo_data["num_contributors"]}') # Get issues - issues_url = f"{base_url}/issues" + issues_url = f'{base_url}/issues' issues = [] page = 1 while True: - params = {"state": "all", "per_page": 100, "page": page} + params = {'state': 'all', 'per_page': 100, 'page': page} response = make_api_request(issues_url, headers=headers, params=params) if response is None: break @@ -523,8 +570,16 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: break issues.extend(page_issues) page += 1 - open_issues = [issue for issue in issues if issue.get('state') == 'open' and 'pull_request' not in issue] - closed_issues = [issue for issue in issues if issue.get('state') == 'closed' and 'pull_request' not in issue] + open_issues = [ + issue + for issue in issues + if issue.get('state') == 'open' and 'pull_request' not in issue + ] + closed_issues = [ + issue + for issue in issues + if issue.get('state') == 'closed' and 'pull_request' not in issue + ] repo_data['total_issues'] = len(issues) repo_data['open_issues'] = len(open_issues) repo_data['closed_issues'] = len(closed_issues) @@ -535,8 +590,8 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: num_issues_with_first_response = 0 for issue in closed_issues: try: - created_at = datetime.strptime(issue['created_at'], "%Y-%m-%dT%H:%M:%SZ") - closed_at = datetime.strptime(issue['closed_at'], "%Y-%m-%dT%H:%M:%SZ") + created_at = datetime.strptime(issue['created_at'], '%Y-%m-%dT%H:%M:%SZ') + closed_at = datetime.strptime(issue['closed_at'], '%Y-%m-%dT%H:%M:%SZ') close_time = (closed_at - created_at).total_seconds() / 3600 # 
in hours total_close_time += close_time num_closed_issues_with_close_time += 1 @@ -548,20 +603,32 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: comments = comments_response.json() if comments: first_comment = comments[0] - first_response_at = datetime.strptime(first_comment['created_at'], "%Y-%m-%dT%H:%M:%SZ") - first_response_time = (first_response_at - created_at).total_seconds() / 3600 # in hours + first_response_at = datetime.strptime( + first_comment['created_at'], '%Y-%m-%dT%H:%M:%SZ' + ) + first_response_time = ( + first_response_at - created_at + ).total_seconds() / 3600 # in hours total_first_response_time += first_response_time num_issues_with_first_response += 1 except Exception as e: - logging.error(f"Error processing issue dates: {e}") - repo_data['avg_time_to_close_issues'] = (total_close_time / num_closed_issues_with_close_time) if num_closed_issues_with_close_time > 0 else None - repo_data['avg_time_to_first_response_issue'] = (total_first_response_time / num_issues_with_first_response) if num_issues_with_first_response > 0 else None + logging.error(f'Error processing issue dates: {e}') + repo_data['avg_time_to_close_issues'] = ( + (total_close_time / num_closed_issues_with_close_time) + if num_closed_issues_with_close_time > 0 + else None + ) + repo_data['avg_time_to_first_response_issue'] = ( + (total_first_response_time / num_issues_with_first_response) + if num_issues_with_first_response > 0 + else None + ) # Get pull requests - pulls_url = f"{base_url}/pulls" + pulls_url = f'{base_url}/pulls' pulls = [] page = 1 while True: - params = {"state": "all", "per_page": 100, "page": page} + params = {'state': 'all', 'per_page': 100, 'page': page} response = make_api_request(pulls_url, headers=headers, params=params) if response is None: break @@ -584,9 +651,15 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: if pr_details.get('merged_at'): merged_pulls.append(pr) try: - created_at = 
datetime.strptime(pr_details['created_at'], "%Y-%m-%dT%H:%M:%SZ") - merged_at = datetime.strptime(pr_details['merged_at'], "%Y-%m-%dT%H:%M:%SZ") - merge_time = (merged_at - created_at).total_seconds() / 3600 # in hours + created_at = datetime.strptime( + pr_details['created_at'], '%Y-%m-%dT%H:%M:%SZ' + ) + merged_at = datetime.strptime( + pr_details['merged_at'], '%Y-%m-%dT%H:%M:%SZ' + ) + merge_time = ( + merged_at - created_at + ).total_seconds() / 3600 # in hours total_merge_time += merge_time num_merged_pulls_with_time += 1 # First review time @@ -596,29 +669,48 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: reviews = reviews_response.json() if reviews: first_review = reviews[0] - review_submitted_at = datetime.strptime(first_review['submitted_at'], "%Y-%m-%dT%H:%M:%SZ") - first_review_time = (review_submitted_at - created_at).total_seconds() / 3600 # in hours + review_submitted_at = datetime.strptime( + first_review['submitted_at'], '%Y-%m-%dT%H:%M:%SZ' + ) + first_review_time = ( + review_submitted_at - created_at + ).total_seconds() / 3600 # in hours total_first_review_time += first_review_time num_pulls_with_first_review += 1 except Exception as e: - logging.error(f"Error processing pull request dates: {e}") + logging.error(f'Error processing pull request dates: {e}') repo_data['total_pull_requests'] = len(pulls) repo_data['open_pull_requests'] = len(open_pulls) repo_data['closed_pull_requests'] = len(closed_pulls) repo_data['merged_pull_requests'] = len(merged_pulls) - repo_data['avg_time_to_merge_pr'] = (total_merge_time / num_merged_pulls_with_time) if num_merged_pulls_with_time > 0 else None - repo_data['avg_time_to_first_review_pr'] = (total_first_review_time / num_pulls_with_first_review) if num_pulls_with_first_review > 0 else None - repo_data['pr_merge_percentage'] = ((len(merged_pulls) / repo_data['total_pull_requests']) * 100) if repo_data['total_pull_requests'] > 0 else None + repo_data['avg_time_to_merge_pr'] = ( 
+ (total_merge_time / num_merged_pulls_with_time) + if num_merged_pulls_with_time > 0 + else None + ) + repo_data['avg_time_to_first_review_pr'] = ( + (total_first_review_time / num_pulls_with_first_review) + if num_pulls_with_first_review > 0 + else None + ) + repo_data['pr_merge_percentage'] = ( + ((len(merged_pulls) / repo_data['total_pull_requests']) * 100) + if repo_data['total_pull_requests'] > 0 + else None + ) # Calculate pull request update frequency pr_dates = [] for pr in pulls: try: - pr_dates.append(datetime.strptime(pr['created_at'], "%Y-%m-%dT%H:%M:%SZ")) + pr_dates.append(datetime.strptime(pr['created_at'], '%Y-%m-%dT%H:%M:%SZ')) except Exception as e: - logging.error(f"Error parsing pull request date: {e}") + logging.error(f'Error parsing pull request date: {e}') if len(pr_dates) > 1: pr_dates.sort() - time_differences = [(pr_dates[i+1] - pr_dates[i]).total_seconds() / 3600 for i in range(len(pr_dates)-1)] + time_differences = [ + (pr_dates[i + 1] - pr_dates[i]).total_seconds() / 3600 + for i in range(len(pr_dates) - 1) + ] repo_data['pr_update_frequency'] = sum(time_differences) / len(time_differences) else: repo_data['pr_update_frequency'] = None @@ -627,7 +719,7 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: num_pulls_with_first_response = 0 for pr in pulls: try: - created_at = datetime.strptime(pr['created_at'], "%Y-%m-%dT%H:%M:%SZ") + created_at = datetime.strptime(pr['created_at'], '%Y-%m-%dT%H:%M:%SZ') comments_url = pr.get('comments_url') if comments_url: comments_response = make_api_request(comments_url, headers=headers) @@ -635,18 +727,26 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: comments = comments_response.json() if comments: first_comment = comments[0] - first_response_at = datetime.strptime(first_comment['created_at'], "%Y-%m-%dT%H:%M:%SZ") - first_response_time = (first_response_at - created_at).total_seconds() / 3600 # in hours + first_response_at = 
datetime.strptime( + first_comment['created_at'], '%Y-%m-%dT%H:%M:%SZ' + ) + first_response_time = ( + first_response_at - created_at + ).total_seconds() / 3600 # in hours total_first_response_time_pr += first_response_time num_pulls_with_first_response += 1 except Exception as e: - logging.error(f"Error processing pull request response time: {e}") - repo_data['avg_time_to_first_response_pr'] = (total_first_response_time_pr / num_pulls_with_first_response) if num_pulls_with_first_response > 0 else None + logging.error(f'Error processing pull request response time: {e}') + repo_data['avg_time_to_first_response_pr'] = ( + (total_first_response_time_pr / num_pulls_with_first_response) + if num_pulls_with_first_response > 0 + else None + ) # Get languages - languages_url = f"{base_url}/languages" + languages_url = f'{base_url}/languages' response = make_api_request(languages_url, headers=headers) if response is None: - logging.error("Failed to fetch languages.") + logging.error('Failed to fetch languages.') repo_data['languages'] = {} repo_data['language_percentages'] = {} else: @@ -654,14 +754,16 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: total_bytes = sum(languages.values()) repo_data['languages'] = languages if total_bytes > 0: - repo_data['language_percentages'] = {lang: (bytes_ / total_bytes) * 100 for lang, bytes_ in languages.items()} + repo_data['language_percentages'] = { + lang: (bytes_ / total_bytes) * 100 for lang, bytes_ in languages.items() + } else: repo_data['language_percentages'] = {} # Get total downloads from releases - releases_url = f"{base_url}/releases" + releases_url = f'{base_url}/releases' response = make_api_request(releases_url, headers=headers) if response is None: - logging.error("Failed to fetch releases.") + logging.error('Failed to fetch releases.') repo_data['total_downloads'] = 0 else: releases = response.json() @@ -674,11 +776,11 @@ def collect_github_data(repo_owner: str, repo_name: str) -> 
Optional[dict]: # Recent activity (past 60 days) since_date = (datetime.utcnow() - timedelta(days=60)).isoformat() + 'Z' # Recent commits - commits_url = f"{base_url}/commits" + commits_url = f'{base_url}/commits' commits = [] page = 1 while True: - params = {"since": since_date, "per_page": 100, "page": page} + params = {'since': since_date, 'per_page': 100, 'page': page} response = make_api_request(commits_url, headers=headers, params=params) if response is None or response.status_code != 200: break @@ -696,11 +798,11 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: contributors_set_recent.add(author.get('login')) repo_data['recent_active_contributors'] = len(contributors_set_recent) # Recent issues opened and closed - recent_issues_url = f"{base_url}/issues" + recent_issues_url = f'{base_url}/issues' recent_issues = [] page = 1 while True: - params = {"since": since_date, "state": "all", "per_page": 100, "page": page} + params = {'since': since_date, 'state': 'all', 'per_page': 100, 'page': page} response = make_api_request(recent_issues_url, headers=headers, params=params) if response is None or response.status_code != 200: break @@ -709,16 +811,24 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: break recent_issues.extend(page_issues) page += 1 - recent_issues_opened = [issue for issue in recent_issues if 'pull_request' not in issue and issue.get('created_at', '') >= since_date] - recent_issues_closed = [issue for issue in recent_issues_opened if issue.get('closed_at', '') >= since_date] + recent_issues_opened = [ + issue + for issue in recent_issues + if 'pull_request' not in issue and issue.get('created_at', '') >= since_date + ] + recent_issues_closed = [ + issue + for issue in recent_issues_opened + if issue.get('closed_at', '') >= since_date + ] repo_data['recent_issues_opened'] = len(recent_issues_opened) repo_data['recent_issues_closed'] = len(recent_issues_closed) # Recent pull requests opened 
and merged - recent_pulls_url = f"{base_url}/pulls" + recent_pulls_url = f'{base_url}/pulls' recent_pulls = [] page = 1 while True: - params = {"state": "all", "per_page": 100, "page": page} + params = {'state': 'all', 'per_page': 100, 'page': page} response = make_api_request(recent_pulls_url, headers=headers, params=params) if response is None or response.status_code != 200: break @@ -727,7 +837,9 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: break recent_pulls.extend(page_pulls) page += 1 - recent_pulls_opened = [pr for pr in recent_pulls if pr.get('created_at', '') >= since_date] + recent_pulls_opened = [ + pr for pr in recent_pulls if pr.get('created_at', '') >= since_date + ] recent_pulls_merged = [] for pr in recent_pulls_opened: pr_details_response = make_api_request(pr.get('url'), headers=headers) @@ -738,66 +850,75 @@ def collect_github_data(repo_owner: str, repo_name: str) -> Optional[dict]: repo_data['recent_pulls_opened'] = len(recent_pulls_opened) repo_data['recent_pulls_merged'] = len(recent_pulls_merged) # Add the repository URL - repo_data['url'] = f"https://github.com/{repo_owner}/{repo_name}" + repo_data['url'] = f'https://github.com/{repo_owner}/{repo_name}' # Add to projects list PROJECTS_LIST.append(repo_data) logging.info(f"GitHub data collected for '{repo_owner}/{repo_name}'") return repo_data + def run_repo_cite() -> None: """ Main function to run the repository citation data collection process. """ global OPENALEX_EMAIL, RECORD_LIMIT, MAX_DEPTH, GITHUB_TOKEN - logging.info("Script started") + logging.info('Script started') try: - repo_url = input("Enter GitHub repository URL: ").strip() + repo_url = input('Enter GitHub repository URL: ').strip() # Parse the GitHub URL to get the owner and repo name match = re.match(r'https?://github\.com/([^/]+)/([^/]+)', repo_url) if not match: - logging.error("Invalid GitHub URL format. Exiting.") + logging.error('Invalid GitHub URL format. 
Exiting.') return repo_owner, repo_name = match.groups() # Optional: Prompt for email and record limit - email_input = input("Enter your email for OpenAlex API (optional): ").strip() + email_input = input('Enter your email for OpenAlex API (optional): ').strip() if email_input: OPENALEX_EMAIL = email_input - record_limit_input = input("Enter number of records to retrieve per API call (integer, 0 for all) [default is 0]: ").strip() + record_limit_input = input( + 'Enter number of records to retrieve per API call (integer, 0 for all) [default is 0]: ' + ).strip() if record_limit_input: if record_limit_input.isdigit(): RECORD_LIMIT = int(record_limit_input) else: - logging.warning("Invalid record limit input. Using default (0 for all).") + logging.warning( + 'Invalid record limit input. Using default (0 for all).' + ) RECORD_LIMIT = 0 - max_depth_input = input("Enter maximum depth for citation traversal (integer) [default is 2]: ").strip() + max_depth_input = input( + 'Enter maximum depth for citation traversal (integer) [default is 2]: ' + ).strip() if max_depth_input: if max_depth_input.isdigit(): MAX_DEPTH = int(max_depth_input) else: - logging.warning("Invalid max depth input. Using default (2).") + logging.warning('Invalid max depth input. Using default (2).') MAX_DEPTH = 2 # Ensure GitHub token is available if not GITHUB_TOKEN: - logging.error("GitHub personal access token not found in .env file. Exiting.") + logging.error( + 'GitHub personal access token not found in .env file. Exiting.' + ) return # Collect GitHub data github_data = collect_github_data(repo_owner, repo_name) if github_data is None: - logging.error("Failed to collect GitHub data. Exiting.") + logging.error('Failed to collect GitHub data. Exiting.') return doi = get_doi_from_github_repo(repo_owner, repo_name) if not doi: - logging.error("DOI not found. Exiting.") + logging.error('DOI not found. 
Exiting.') return - logging.info(f"DOI found: {doi}") + logging.info(f'DOI found: {doi}') paper_data = get_paper_details(doi) if not paper_data: - logging.error("Paper details not found. Exiting.") + logging.error('Paper details not found. Exiting.') return process_paper_data(paper_data) @@ -820,7 +941,7 @@ def run_repo_cite() -> None: 'papers': list(PAPERS_DICT.values()), 'institutions': list(INSTITUTIONS_DICT.values()), 'topics': list(TOPICS_DICT.values()), - 'projects': PROJECTS_LIST + 'projects': PROJECTS_LIST, } # Save to JSON file @@ -829,29 +950,36 @@ def run_repo_cite() -> None: logging.info("Data collection complete. Output saved to 'output_data.json'.") # Log the total number of nodes - logging.info(f"Total number of papers: {len(output_data['papers'])}") - logging.info(f"Total number of people: {len(output_data['people'])}") - logging.info(f"Total number of institutions: {len(output_data['institutions'])}") - logging.info(f"Total number of topics: {len(output_data['topics'])}") - logging.info(f"Total number of projects: {len(output_data['projects'])}") + logging.info(f'Total number of papers: {len(output_data["papers"])}') + logging.info(f'Total number of people: {len(output_data["people"])}') + logging.info( + f'Total number of institutions: {len(output_data["institutions"])}' + ) + logging.info(f'Total number of topics: {len(output_data["topics"])}') + logging.info(f'Total number of projects: {len(output_data["projects"])}') except KeyboardInterrupt: - logging.warning("Process interrupted by user. Saving collected data.") + logging.warning('Process interrupted by user. 
Saving collected data.') output_data = { 'people': list(AUTHORS_DICT.values()), 'papers': list(PAPERS_DICT.values()), 'institutions': list(INSTITUTIONS_DICT.values()), 'topics': list(TOPICS_DICT.values()), - 'projects': PROJECTS_LIST + 'projects': PROJECTS_LIST, } with open('output_data_partial.json', 'w') as f: json.dump(output_data, f, indent=2) logging.info("Partial data saved to 'output_data_partial.json'.") - logging.info(f"Total number of papers collected: {len(output_data['papers'])}") - logging.info(f"Total number of people collected: {len(output_data['people'])}") - logging.info(f"Total number of institutions collected: {len(output_data['institutions'])}") - logging.info(f"Total number of topics collected: {len(output_data['topics'])}") - logging.info(f"Total number of projects collected: {len(output_data['projects'])}") - -if __name__ == "__main__": + logging.info(f'Total number of papers collected: {len(output_data["papers"])}') + logging.info(f'Total number of people collected: {len(output_data["people"])}') + logging.info( + f'Total number of institutions collected: {len(output_data["institutions"])}' + ) + logging.info(f'Total number of topics collected: {len(output_data["topics"])}') + logging.info( + f'Total number of projects collected: {len(output_data["projects"])}' + ) + + +if __name__ == '__main__': run_repo_cite() diff --git a/Older Experiments/scripts/repo_cite/test_repo_cite.py b/Older Experiments/scripts/repo_cite/test_repo_cite.py index e9ce641..43aac1b 100644 --- a/Older Experiments/scripts/repo_cite/test_repo_cite.py +++ b/Older Experiments/scripts/repo_cite/test_repo_cite.py @@ -1,37 +1,34 @@ import pytest from repo_cite import ( - process_paper_data, - PAPERS_DICT, AUTHORS_DICT, INSTITUTIONS_DICT, + PAPERS_DICT, TOPICS_DICT, + process_paper_data, ) # Sample paper data mimicking an OpenAlex response. 
sample_paper_data = { - "id": "https://openalex.org/W123456789", - "title": "Test Paper Title", - "doi": "10.1234/testdoi", - "publication_date": "2020-01-01", - "abstract_inverted_index": { - "Test": [1], - "paper": [2], - "abstract": [3] - }, - "concepts": [ - {"id": "C1", "display_name": "Concept 1"} - ], - "authorships": [ + 'id': 'https://openalex.org/W123456789', + 'title': 'Test Paper Title', + 'doi': '10.1234/testdoi', + 'publication_date': '2020-01-01', + 'abstract_inverted_index': {'Test': [1], 'paper': [2], 'abstract': [3]}, + 'concepts': [{'id': 'C1', 'display_name': 'Concept 1'}], + 'authorships': [ { - "author": {"id": "A1", "display_name": "Author One", "orcid": "0000-0001-2345-6789"}, - "institutions": [ - {"id": "I1", "display_name": "Institution One"} - ] + 'author': { + 'id': 'A1', + 'display_name': 'Author One', + 'orcid': '0000-0001-2345-6789', + }, + 'institutions': [{'id': 'I1', 'display_name': 'Institution One'}], } ], - "referenced_works": ["https://openalex.org/W987654321"] + 'referenced_works': ['https://openalex.org/W987654321'], } + @pytest.fixture(autouse=True) def clear_globals(): """Ensure global dictionaries are cleared before each test.""" @@ -41,29 +38,30 @@ def clear_globals(): TOPICS_DICT.clear() yield + def test_process_paper_data(): process_paper_data(sample_paper_data) - + # Verify that the paper is added to the global PAPERS_DICT. - assert sample_paper_data["id"] in PAPERS_DICT - paper_node = PAPERS_DICT[sample_paper_data["id"]] - assert paper_node["title"] == "Test Paper Title" - + assert sample_paper_data['id'] in PAPERS_DICT + paper_node = PAPERS_DICT[sample_paper_data['id']] + assert paper_node['title'] == 'Test Paper Title' + # The abstract_inverted_index should be converted to a space‐delimited abstract. 
- expected_abstract = "Test paper abstract" - assert paper_node["abstract"] == expected_abstract - + expected_abstract = 'Test paper abstract' + assert paper_node['abstract'] == expected_abstract + # Check that topics are processed. - assert "C1" in paper_node["topics"] - assert "C1" in TOPICS_DICT - topic_node = TOPICS_DICT["C1"] - assert topic_node["name"] == "Concept 1" - + assert 'C1' in paper_node['topics'] + assert 'C1' in TOPICS_DICT + topic_node = TOPICS_DICT['C1'] + assert topic_node['name'] == 'Concept 1' + # Check that authors and institutions have been added. - assert "A1" in AUTHORS_DICT - author_node = AUTHORS_DICT["A1"] - assert "I1" in author_node["affiliations"] - assert "I1" in INSTITUTIONS_DICT - + assert 'A1' in AUTHORS_DICT + author_node = AUTHORS_DICT['A1'] + assert 'I1' in author_node['affiliations'] + assert 'I1' in INSTITUTIONS_DICT + # Check that referenced works are captured. - assert "https://openalex.org/W987654321" in paper_node["references"] + assert 'https://openalex.org/W987654321' in paper_node['references'] diff --git a/Older Experiments/scripts/repo_finder/repofinder.py b/Older Experiments/scripts/repo_finder/repofinder.py index aa7a29d..3634bc0 100644 --- a/Older Experiments/scripts/repo_finder/repofinder.py +++ b/Older Experiments/scripts/repo_finder/repofinder.py @@ -1,28 +1,31 @@ -import requests -import json -import csv -import time -import re +import argparse import base64 +import csv +import json import logging import os -import argparse -from dotenv import load_dotenv +import re +import time from datetime import datetime, timedelta, timezone + +import requests +from dotenv import load_dotenv from tqdm import tqdm # Initialize the logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) + # Handler for logging with tqdm class TqdmLoggingHandler(logging.Handler): """ Custom logging handler compatible with tqdm progress bars. 
""" + def __init__(self, level=logging.NOTSET): super().__init__(level) - + def emit(self, record): try: msg = self.format(record) @@ -31,16 +34,18 @@ def emit(self, record): except Exception: self.handleError(record) + # Configure the logger to use the custom handler handler = TqdmLoggingHandler() handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) logger.addHandler(handler) # Constants -GITHUB_API_URL = "https://api.github.com" +GITHUB_API_URL = 'https://api.github.com' MAX_RETRIES = 3 RETRY_DELAY = 2 # seconds + def github_api_request(url, headers, params=None): """ Sends a GET request to the GitHub API with rate limit handling. @@ -54,41 +59,48 @@ def github_api_request(url, headers, params=None): tuple: A tuple containing the JSON response and response headers. """ for attempt in range(1, MAX_RETRIES + 1): - logger.debug(f"Attempt {attempt} for URL: {url}") + logger.debug(f'Attempt {attempt} for URL: {url}') try: response = requests.get(url, headers=headers, params=params, timeout=10) response.raise_for_status() except requests.exceptions.Timeout: - logger.error(f"Timeout occurred for URL: {url}") + logger.error(f'Timeout occurred for URL: {url}') if attempt == MAX_RETRIES: raise time.sleep(RETRY_DELAY) continue except requests.exceptions.RequestException as e: - logger.error(f"Request exception: {e}") + logger.error(f'Request exception: {e}') if attempt == MAX_RETRIES: raise time.sleep(RETRY_DELAY) continue - logger.debug(f"Response status code: {response.status_code}") + logger.debug(f'Response status code: {response.status_code}') if response.status_code == 200: - logger.debug("Successful response.") + logger.debug('Successful response.') return response.json(), response.headers - elif response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers: + elif ( + response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers + ): if response.headers['X-RateLimit-Remaining'] == '0': reset_time = 
int(response.headers['X-RateLimit-Reset']) sleep_time = max(reset_time - int(time.time()), 0) + 1 - logger.warning(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") + logger.warning( + f'Rate limit exceeded. Sleeping for {sleep_time} seconds.' + ) time.sleep(sleep_time) continue else: - logger.error(f"Error: {response.status_code} - {response.reason}") + logger.error(f'Error: {response.status_code} - {response.reason}') if attempt == MAX_RETRIES: response.raise_for_status() time.sleep(RETRY_DELAY) continue - raise Exception(f"Failed to get a successful response after {MAX_RETRIES} attempts.") + raise Exception( + f'Failed to get a successful response after {MAX_RETRIES} attempts.' + ) + def get_next_link(headers): """ @@ -115,6 +127,7 @@ def get_next_link(headers): return next_url return None + def search_repositories_with_queries(query_terms, headers): """ Searches GitHub repositories based on query terms and records matching queries. @@ -129,17 +142,19 @@ def search_repositories_with_queries(query_terms, headers): repositories = {} for query_term in query_terms: params = {'q': query_term, 'per_page': 100} - url = f"{GITHUB_API_URL}/search/repositories" + url = f'{GITHUB_API_URL}/search/repositories' while url: - logger.debug(f"Searching repositories with URL: {url} and params: {params}") + logger.debug(f'Searching repositories with URL: {url} and params: {params}') try: data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error searching repositories: {e}") + logger.error(f'Error searching repositories: {e}') break if data: items = data.get('items', []) - logger.info(f"Found {len(items)} repositories in this page for query '{query_term}'.") + logger.info( + f"Found {len(items)} repositories in this page for query '{query_term}'." 
+ ) for repo in items: repo_id = repo.get('id') if repo_id in repositories: @@ -147,7 +162,7 @@ def search_repositories_with_queries(query_terms, headers): else: repositories[repo_id] = { 'repo_data': repo, - 'queries': set([query_term]) + 'queries': set([query_term]), } next_url = get_next_link(headers_response) url = next_url @@ -156,6 +171,7 @@ def search_repositories_with_queries(query_terms, headers): break return repositories + def load_keywords(filename): """ Loads keywords from a CSV file and preprocesses them. @@ -173,13 +189,14 @@ def load_keywords(filename): for row in reader: for keyword in row: keywords.add(keyword.strip().lower()) - logger.info(f"Loaded {len(keywords)} keywords.") + logger.info(f'Loaded {len(keywords)} keywords.') if not keywords: - logger.warning("No keywords found in the file.") + logger.warning('No keywords found in the file.') except FileNotFoundError: - logger.error(f"Keyword file {filename} not found.") + logger.error(f'Keyword file {filename} not found.') return keywords + def load_hierarchical_keywords(filename): """ Loads the hierarchical keyword dataset from a JSON file. @@ -193,15 +210,16 @@ def load_hierarchical_keywords(filename): try: with open(filename, 'r', encoding='utf-8') as jsonfile: data = json.load(jsonfile) - logger.info(f"Loaded hierarchical dataset with {len(data)} entries.") + logger.info(f'Loaded hierarchical dataset with {len(data)} entries.') return data except FileNotFoundError: - logger.error(f"Dataset file {filename} not found.") + logger.error(f'Dataset file {filename} not found.') return [] except json.JSONDecodeError as e: - logger.error(f"Error decoding JSON: {e}") + logger.error(f'Error decoding JSON: {e}') return [] + def contains_keywords(text, keywords): """ Checks if the text contains any of the keywords. 
@@ -220,6 +238,7 @@ def contains_keywords(text, keywords): return True return False + def count_keyword_matches(text, keywords): """ Counts the number of keyword matches in the text and collects matched keywords. @@ -239,6 +258,7 @@ def count_keyword_matches(text, keywords): count = len(matched_keywords) return count, matched_keywords + def match_repository_keywords(repo_text, hierarchical_keywords): """ Matches repository text against the hierarchical keywords and calculates scores. @@ -250,24 +270,21 @@ def match_repository_keywords(repo_text, hierarchical_keywords): Returns: tuple: A dictionary of scores and a list of matched keywords. """ - scores = { - 'domains': {}, - 'fields': {}, - 'subfields': {}, - 'topics': {} - } + scores = {'domains': {}, 'fields': {}, 'subfields': {}, 'topics': {}} matched_keywords = set() - + # Tokenize the repository text for efficient matching repo_words = set(re.findall(r'\b\w+\b', repo_text.lower())) - + for entry in hierarchical_keywords: domain = entry['Domain'] field = entry['Field'] subfield = entry['Subfield'] topic = entry['Topic'] - keywords = set(map(str.lower, entry['Keywords'])) # Ensure keywords are lowercase - + keywords = set( + map(str.lower, entry['Keywords']) + ) # Ensure keywords are lowercase + # Check for keyword matches common_keywords = repo_words.intersection(keywords) if common_keywords: @@ -277,9 +294,10 @@ def match_repository_keywords(repo_text, hierarchical_keywords): scores['fields'][field] = scores['fields'].get(field, 0) + 1 scores['subfields'][subfield] = scores['subfields'].get(subfield, 0) + 1 scores['topics'][topic] = scores['topics'].get(topic, 0) + 1 - + return scores, list(matched_keywords) + def get_contributors(owner, repo_name, headers): """ Retrieves the list of contributors for a given repository. @@ -292,15 +310,17 @@ def get_contributors(owner, repo_name, headers): Returns: list: A list of contributors. 
""" - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/contributors" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/contributors' params = {'per_page': 100} contributors = [] while url: - logger.debug(f"Getting contributors for repository: {owner}/{repo_name}") + logger.debug(f'Getting contributors for repository: {owner}/{repo_name}') try: - contributors_data, headers_response = github_api_request(url, headers, params) + contributors_data, headers_response = github_api_request( + url, headers, params + ) except Exception as e: - logger.error(f"Error fetching contributors: {e}") + logger.error(f'Error fetching contributors: {e}') break if contributors_data: contributors.extend(contributors_data) @@ -309,9 +329,10 @@ def get_contributors(owner, repo_name, headers): params = None else: break - logger.debug(f"Total contributors fetched: {len(contributors)}") + logger.debug(f'Total contributors fetched: {len(contributors)}') return contributors if contributors else [] + def get_user_repositories(username, headers): """ Retrieves the list of repositories for a given user. @@ -324,15 +345,15 @@ def get_user_repositories(username, headers): list: A list of repositories. 
""" repos = [] - url = f"{GITHUB_API_URL}/users/{username}/repos" + url = f'{GITHUB_API_URL}/users/{username}/repos' params = {'per_page': 100, 'type': 'owner'} page = 1 # Track the current page while url: - logger.debug(f"Fetching repositories for user: {username}, page {page}") + logger.debug(f'Fetching repositories for user: {username}, page {page}') try: repo_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching user repositories: {e}") + logger.error(f'Error fetching user repositories: {e}') break if repo_data: repos.extend(repo_data) @@ -342,9 +363,10 @@ def get_user_repositories(username, headers): page += 1 else: break - logger.debug(f"Total repositories fetched for user {username}: {len(repos)}") + logger.debug(f'Total repositories fetched for user {username}: {len(repos)}') return repos + def analyze_user_repositories(repos, keywords, university_name): """ Analyzes a user's repositories for affiliation indicators. @@ -368,16 +390,19 @@ def analyze_user_repositories(repos, keywords, university_name): # Check for affiliation indicators text_to_check = ' '.join([repo_name, description, ' '.join(topics)]) if contains_keywords(text_to_check, {university_name.lower()}): - affiliation_indicators.append({ - 'name': repo_name, - 'description': description, - 'created_at': created_at, - 'updated_at': updated_at, - 'topics': topics, - 'url': repo_url - }) + affiliation_indicators.append( + { + 'name': repo_name, + 'description': description, + 'created_at': created_at, + 'updated_at': updated_at, + 'topics': topics, + 'url': repo_url, + } + ) return {'affiliation_indicators': affiliation_indicators} + def get_pull_request_reviews(owner, repo_name, pr_number, headers): """ Retrieves reviews for a specific pull request. @@ -392,14 +417,14 @@ def get_pull_request_reviews(owner, repo_name, pr_number, headers): list: A list of reviews. 
""" reviews = [] - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/pulls/{pr_number}/reviews" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/pulls/{pr_number}/reviews' params = {'per_page': 100} while url: - logger.debug(f"Fetching reviews for PR #{pr_number} in {owner}/{repo_name}") + logger.debug(f'Fetching reviews for PR #{pr_number} in {owner}/{repo_name}') try: reviews_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching PR reviews: {e}") + logger.error(f'Error fetching PR reviews: {e}') break if reviews_data: reviews.extend(reviews_data) @@ -410,6 +435,7 @@ def get_pull_request_reviews(owner, repo_name, pr_number, headers): break return reviews + def analyze_pull_requests(pull_requests, owner, repo_name, headers): """ Analyzes pull requests for various metrics. @@ -428,9 +454,9 @@ def analyze_pull_requests(pull_requests, owner, repo_name, headers): 'open_prs': 0, 'closed_prs': 0, 'average_time_to_merge': None, # In days - 'pr_update_frequency': None, # Average number of days between PRs + 'pr_update_frequency': None, # Average number of days between PRs 'average_time_to_first_review': None, # In days - 'review_to_merge_percentage': None # Percentage + 'review_to_merge_percentage': None, # Percentage } if not pull_requests: @@ -444,7 +470,13 @@ def analyze_pull_requests(pull_requests, owner, repo_name, headers): time_to_first_review_list = [] # Initialize a progress bar for analyzing pull requests - with tqdm(total=len(pull_requests), desc='Analyzing PRs', unit='PR', position=2, leave=False) as pbar: + with tqdm( + total=len(pull_requests), + desc='Analyzing PRs', + unit='PR', + position=2, + leave=False, + ) as pbar: for pr in pull_requests: pr_number = pr.get('number') state = pr.get('state') @@ -457,9 +489,13 @@ def analyze_pull_requests(pull_requests, owner, repo_name, headers): pr_analysis['closed_prs'] += 1 if pr.get('merged_at'): - created_date = datetime.strptime(created_at, 
"%Y-%m-%dT%H:%M:%SZ") - merged_date = datetime.strptime(pr['merged_at'], "%Y-%m-%dT%H:%M:%SZ") - duration = (merged_date - created_date).total_seconds() / (3600 * 24) + created_date = datetime.strptime(created_at, '%Y-%m-%dT%H:%M:%SZ') + merged_date = datetime.strptime( + pr['merged_at'], '%Y-%m-%dT%H:%M:%SZ' + ) + duration = (merged_date - created_date).total_seconds() / ( + 3600 * 24 + ) merged_durations.append(duration) # Fetch reviews for the PR @@ -470,9 +506,13 @@ def analyze_pull_requests(pull_requests, owner, repo_name, headers): reviews.sort(key=lambda x: x.get('submitted_at')) first_review_date = reviews[0].get('submitted_at') if first_review_date: - created_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ") - first_review_datetime = datetime.strptime(first_review_date, "%Y-%m-%dT%H:%M:%SZ") - time_to_first_review = (first_review_datetime - created_date).total_seconds() / (3600 * 24) + created_date = datetime.strptime(created_at, '%Y-%m-%dT%H:%M:%SZ') + first_review_datetime = datetime.strptime( + first_review_date, '%Y-%m-%dT%H:%M:%SZ' + ) + time_to_first_review = ( + first_review_datetime - created_date + ).total_seconds() / (3600 * 24) time_to_first_review_list.append(time_to_first_review) if pr.get('merged_at'): @@ -482,30 +522,41 @@ def analyze_pull_requests(pull_requests, owner, repo_name, headers): # Calculate average time to merge pull requests if merged_durations: - pr_analysis['average_time_to_merge'] = sum(merged_durations) / len(merged_durations) + pr_analysis['average_time_to_merge'] = sum(merged_durations) / len( + merged_durations + ) # Calculate PR update frequency pr_dates.sort() if len(pr_dates) > 1: date_differences = [] for i in range(1, len(pr_dates)): - date1 = datetime.strptime(pr_dates[i - 1], "%Y-%m-%dT%H:%M:%SZ") - date2 = datetime.strptime(pr_dates[i], "%Y-%m-%dT%H:%M:%SZ") + date1 = datetime.strptime(pr_dates[i - 1], '%Y-%m-%dT%H:%M:%SZ') + date2 = datetime.strptime(pr_dates[i], '%Y-%m-%dT%H:%M:%SZ') difference = (date2 
- date1).total_seconds() / (3600 * 24) date_differences.append(difference) - pr_analysis['pr_update_frequency'] = sum(date_differences) / len(date_differences) + pr_analysis['pr_update_frequency'] = sum(date_differences) / len( + date_differences + ) # Calculate average time to first review if time_to_first_review_list: - pr_analysis['average_time_to_first_review'] = sum(time_to_first_review_list) / len(time_to_first_review_list) + pr_analysis['average_time_to_first_review'] = sum( + time_to_first_review_list + ) / len(time_to_first_review_list) # Calculate review-to-merge percentage if total_reviewed_prs > 0: - pr_analysis['review_to_merge_percentage'] = (reviewed_and_merged_prs / total_reviewed_prs) * 100 + pr_analysis['review_to_merge_percentage'] = ( + reviewed_and_merged_prs / total_reviewed_prs + ) * 100 return pr_analysis -def analyze_contributors(contributors, university_email_domain, university_name, keywords, headers): + +def analyze_contributors( + contributors, university_email_domain, university_name, keywords, headers +): """ Analyzes contributor profiles for affiliation and status. 
@@ -522,19 +573,27 @@ def analyze_contributors(contributors, university_email_domain, university_name, contributor_details = [] total_contributors = len(contributors) - with tqdm(total=total_contributors, desc='Analyzing Contributors', unit='contributor', position=3, leave=False) as pbar: + with tqdm( + total=total_contributors, + desc='Analyzing Contributors', + unit='contributor', + position=3, + leave=False, + ) as pbar: for index, contributor in enumerate(contributors, start=1): username = contributor.get('login') user_url = contributor.get('url') - logger.debug(f"Analyzing contributor [{index}/{total_contributors}]: {username}") + logger.debug( + f'Analyzing contributor [{index}/{total_contributors}]: {username}' + ) try: user_data, _ = github_api_request(user_url, headers) except Exception as e: - logger.warning(f"Could not retrieve data for user: {username} - {e}") + logger.warning(f'Could not retrieve data for user: {username} - {e}') pbar.update(1) continue if user_data: - logger.debug(f"Retrieved data for user: {username}") + logger.debug(f'Retrieved data for user: {username}') # Extract profile information email = user_data.get('email', '') bio = user_data.get('bio', '') @@ -548,19 +607,26 @@ def analyze_contributors(contributors, university_email_domain, university_name, created_at = user_data.get('created_at', '') updated_at = user_data.get('updated_at', '') # Determine status - if contains_keywords(bio or '', {'student', 'faculty', 'professor', 'researcher'}): + if contains_keywords( + bio or '', {'student', 'faculty', 'professor', 'researcher'} + ): status = 'Faculty/Student/Researcher' else: status = 'Unknown' # Determine affiliation - if (university_email_domain.lower() in (email or '').lower() or - contains_keywords(company or '', {university_name.lower()})): + if university_email_domain.lower() in ( + email or '' + ).lower() or contains_keywords( + company or '', {university_name.lower()} + ): affiliation = university_name else: affiliation = 
company or 'Unknown' # Analyze user's repositories repos = get_user_repositories(username, headers) - repo_analysis = analyze_user_repositories(repos, keywords, university_name) + repo_analysis = analyze_user_repositories( + repos, keywords, university_name + ) # Compile contributor details contributor_info = { 'username': username, @@ -577,15 +643,16 @@ def analyze_contributors(contributors, university_email_domain, university_name, 'followers': followers, 'created_at': created_at, 'updated_at': updated_at, - 'repositories': repo_analysis['affiliation_indicators'] + 'repositories': repo_analysis['affiliation_indicators'], } contributor_details.append(contributor_info) - logger.info(f"Contributor analyzed: {username}") + logger.info(f'Contributor analyzed: {username}') else: - logger.warning(f"Could not retrieve data for user: {username}") + logger.warning(f'Could not retrieve data for user: {username}') pbar.update(1) return contributor_details + def determine_project_type(repo_name, description, topics, readme_content, files): """ Determines the project type based on content analysis. @@ -601,8 +668,31 @@ def determine_project_type(repo_name, description, topics, readme_content, files tuple: Project type, scores, and matched keywords. 
""" classproject_keywords = {'assignment', 'homework', 'hw', 'coursework'} - research_keywords = {'research', 'thesis', 'dissertation', 'paper', 'publication', 'study', 'experiment', 'analysis', 'used in'} - syllabus_keywords = {'syllabus', 'curriculum', 'outline', 'schedule', 'taught', 'students', 'course', 'class', 'lecture', 'tutorial', 'exam', 'quiz'} + research_keywords = { + 'research', + 'thesis', + 'dissertation', + 'paper', + 'publication', + 'study', + 'experiment', + 'analysis', + 'used in', + } + syllabus_keywords = { + 'syllabus', + 'curriculum', + 'outline', + 'schedule', + 'taught', + 'students', + 'course', + 'class', + 'lecture', + 'tutorial', + 'exam', + 'quiz', + } text_to_check = ' '.join([repo_name, description, ' '.join(topics), readme_content]) file_names = ' '.join(files) @@ -611,21 +701,27 @@ def determine_project_type(repo_name, description, topics, readme_content, files total_text = text_to_check + ' ' + file_names # Count keyword matches and collect matched keywords for each category - classproject_score, classproject_matches = count_keyword_matches(total_text, classproject_keywords) - research_score, research_matches = count_keyword_matches(total_text, research_keywords) - syllabus_score, syllabus_matches = count_keyword_matches(total_text, syllabus_keywords) + classproject_score, classproject_matches = count_keyword_matches( + total_text, classproject_keywords + ) + research_score, research_matches = count_keyword_matches( + total_text, research_keywords + ) + syllabus_score, syllabus_matches = count_keyword_matches( + total_text, syllabus_keywords + ) # Determine the category with the highest score scores = { 'Class Project': classproject_score, 'Research Project': research_score, - 'Syllabus': syllabus_score + 'Syllabus': syllabus_score, } matched_keywords = { 'Class Project': classproject_matches, 'Research Project': research_matches, - 'Syllabus': syllabus_matches + 'Syllabus': syllabus_matches, } max_score = max(scores.values()) 
@@ -633,7 +729,9 @@ def determine_project_type(repo_name, description, topics, readme_content, files project_type = 'Other' else: # Handle ties - max_categories = [category for category, score in scores.items() if score == max_score] + max_categories = [ + category for category, score in scores.items() if score == max_score + ] if len(max_categories) == 1: project_type = max_categories[0] else: @@ -641,6 +739,7 @@ def determine_project_type(repo_name, description, topics, readme_content, files return project_type, scores, matched_keywords + def get_repository_issues(owner, repo_name, headers, since=None): """ Retrieves issues for a repository. @@ -655,20 +754,22 @@ def get_repository_issues(owner, repo_name, headers, since=None): list: A list of issues. """ issues = [] - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/issues" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/issues' params = {'state': 'all', 'per_page': 100} if since: params['since'] = since while url: - logger.debug(f"Fetching issues for repository: {owner}/{repo_name}") + logger.debug(f'Fetching issues for repository: {owner}/{repo_name}') try: issues_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching issues: {e}") + logger.error(f'Error fetching issues: {e}') break if issues_data is not None: # Filter out pull requests - issues_only = [issue for issue in issues_data if 'pull_request' not in issue] + issues_only = [ + issue for issue in issues_data if 'pull_request' not in issue + ] issues.extend(issues_only) next_url = get_next_link(headers_response) url = next_url @@ -677,6 +778,7 @@ def get_repository_issues(owner, repo_name, headers, since=None): break return issues + def get_issue_comments(owner, repo_name, issue_number, headers): """ Retrieves comments for a specific issue. 
@@ -694,13 +796,17 @@ def get_issue_comments(owner, repo_name, issue_number, headers): page = 1 per_page = 100 while True: - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/issues/{issue_number}/comments" + url = ( + f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/issues/{issue_number}/comments' + ) params = {'page': page, 'per_page': per_page} - logger.debug(f"Fetching comments for issue #{issue_number} in {owner}/{repo_name}, page {page}") + logger.debug( + f'Fetching comments for issue #{issue_number} in {owner}/{repo_name}, page {page}' + ) try: comments_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching issue comments: {e}") + logger.error(f'Error fetching issue comments: {e}') break if comments_data is not None: comments.extend(comments_data) @@ -711,6 +817,7 @@ def get_issue_comments(owner, repo_name, issue_number, headers): break return comments + def get_pull_request_comments(owner, repo_name, pr_number, headers): """ Retrieves comments for a specific pull request. 
@@ -728,13 +835,15 @@ def get_pull_request_comments(owner, repo_name, pr_number, headers): page = 1 per_page = 100 while True: - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/pulls/{pr_number}/comments" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/pulls/{pr_number}/comments' params = {'page': page, 'per_page': per_page} - logger.debug(f"Fetching comments for PR #{pr_number} in {owner}/{repo_name}, page {page}") + logger.debug( + f'Fetching comments for PR #{pr_number} in {owner}/{repo_name}, page {page}' + ) try: comments_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching PR comments: {e}") + logger.error(f'Error fetching PR comments: {e}') break if comments_data: comments.extend(comments_data) @@ -745,7 +854,10 @@ def get_pull_request_comments(owner, repo_name, pr_number, headers): break return comments -def analyze_issues(issues, owner, repo_name, headers, university_email_domain, university_name): + +def analyze_issues( + issues, owner, repo_name, headers, university_email_domain, university_name +): """ Analyzes issues for collaboration and external participation. 
@@ -767,7 +879,7 @@ def analyze_issues(issues, owner, repo_name, headers, university_email_domain, u 'average_time_to_close': None, # In days 'issue_update_frequency': None, # Average number of days between issues 'external_participants': set(), - 'collaboration_opportunities': [] # List of issue numbers or titles + 'collaboration_opportunities': [], # List of issue numbers or titles } if not issues: @@ -778,7 +890,13 @@ def analyze_issues(issues, owner, repo_name, headers, university_email_domain, u issue_dates = [] # Initialize a progress bar for analyzing issues - with tqdm(total=len(issues), desc='Analyzing Issues', unit='issue', position=1, leave=False) as pbar: + with tqdm( + total=len(issues), + desc='Analyzing Issues', + unit='issue', + position=1, + leave=False, + ) as pbar: for issue in issues: issue_number = issue.get('number') state = issue.get('state') @@ -792,9 +910,11 @@ def analyze_issues(issues, owner, repo_name, headers, university_email_domain, u closed_at = issue.get('closed_at') if closed_at: # Calculate duration in days - created_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ") - closed_date = datetime.strptime(closed_at, "%Y-%m-%dT%H:%M:%SZ") - duration = (closed_date - created_date).total_seconds() / (3600 * 24) + created_date = datetime.strptime(created_at, '%Y-%m-%dT%H:%M:%SZ') + closed_date = datetime.strptime(closed_at, '%Y-%m-%dT%H:%M:%SZ') + duration = (closed_date - created_date).total_seconds() / ( + 3600 * 24 + ) closed_issue_durations.append(duration) # Fetch comments for the issue @@ -811,47 +931,61 @@ def analyze_issues(issues, owner, repo_name, headers, university_email_domain, u try: user_data, _ = github_api_request(user_url, headers) except Exception as e: - logger.warning(f"Could not retrieve data for commenter: {commenter_login} - {e}") + logger.warning( + f'Could not retrieve data for commenter: {commenter_login} - {e}' + ) continue if user_data: email = user_data.get('email', '') company = 
user_data.get('company', '') # Check if external - if (university_email_domain.lower() not in (email or '').lower() and - not contains_keywords(company or '', {university_name.lower()})): + if university_email_domain.lower() not in ( + email or '' + ).lower() and not contains_keywords( + company or '', {university_name.lower()} + ): issue_analysis['external_participants'].add(commenter_login) # Analyze issue content for collaboration opportunities if comments and len(comments) > 5: - issue_analysis['collaboration_opportunities'].append({ - 'issue_number': issue_number, - 'title': issue.get('title'), - 'comments_count': len(comments) - }) + issue_analysis['collaboration_opportunities'].append( + { + 'issue_number': issue_number, + 'title': issue.get('title'), + 'comments_count': len(comments), + } + ) pbar.update(1) # Update the issues progress bar # Calculate average time to close issues if closed_issue_durations: - issue_analysis['average_time_to_close'] = sum(closed_issue_durations) / len(closed_issue_durations) + issue_analysis['average_time_to_close'] = sum(closed_issue_durations) / len( + closed_issue_durations + ) # Calculate issue update frequency issue_dates.sort() if len(issue_dates) > 1: date_differences = [] for i in range(1, len(issue_dates)): - date1 = datetime.strptime(issue_dates[i - 1], "%Y-%m-%dT%H:%M:%SZ") - date2 = datetime.strptime(issue_dates[i], "%Y-%m-%dT%H:%M:%SZ") + date1 = datetime.strptime(issue_dates[i - 1], '%Y-%m-%dT%H:%M:%SZ') + date2 = datetime.strptime(issue_dates[i], '%Y-%m-%dT%H:%M:%SZ') difference = (date2 - date1).total_seconds() / (3600 * 24) date_differences.append(difference) - issue_analysis['issue_update_frequency'] = sum(date_differences) / len(date_differences) + issue_analysis['issue_update_frequency'] = sum(date_differences) / len( + date_differences + ) # Convert set to list for serialization - issue_analysis['external_participants'] = list(issue_analysis['external_participants']) + 
issue_analysis['external_participants'] = list( + issue_analysis['external_participants'] + ) return issue_analysis + def get_release_downloads(owner, repo_name, headers): """ Retrieves total download counts for all releases of a repository. @@ -864,14 +998,14 @@ def get_release_downloads(owner, repo_name, headers): Returns: int: Total number of downloads. """ - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/releases" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/releases' releases = [] while url: - logger.debug(f"Fetching releases for repository: {owner}/{repo_name}") + logger.debug(f'Fetching releases for repository: {owner}/{repo_name}') try: releases_data, headers_response = github_api_request(url, headers) except Exception as e: - logger.error(f"Error fetching release downloads: {e}") + logger.error(f'Error fetching release downloads: {e}') break if releases_data: releases.extend(releases_data) @@ -888,6 +1022,7 @@ def get_release_downloads(owner, repo_name, headers): total_downloads += download_count return total_downloads + def get_repository_releases(owner, repo_name, headers, since=None): """ Retrieves the list of releases for a repository. @@ -901,19 +1036,20 @@ def get_repository_releases(owner, repo_name, headers, since=None): Returns: list: A list of releases. 
""" - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/releases" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/releases' releases = [] while url: - logger.debug(f"Fetching releases for repository: {owner}/{repo_name}") + logger.debug(f'Fetching releases for repository: {owner}/{repo_name}') try: releases_data, headers_response = github_api_request(url, headers) except Exception as e: - logger.error(f"Error fetching releases: {e}") + logger.error(f'Error fetching releases: {e}') break if releases_data: if since: releases_data = [ - release for release in releases_data + release + for release in releases_data if release.get('published_at') and release['published_at'] >= since ] releases.extend(releases_data) @@ -923,6 +1059,7 @@ def get_repository_releases(owner, repo_name, headers, since=None): break return releases + def get_commits(owner, repo_name, headers, since=None): """ Retrieves commits for a repository. @@ -937,16 +1074,16 @@ def get_commits(owner, repo_name, headers, since=None): list: A list of commits. """ commits = [] - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/commits" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/commits' params = {'per_page': 100} if since: params['since'] = since while url: - logger.debug(f"Fetching commits for repository: {owner}/{repo_name}") + logger.debug(f'Fetching commits for repository: {owner}/{repo_name}') try: commits_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching commits: {e}") + logger.error(f'Error fetching commits: {e}') break if commits_data: commits.extend(commits_data) @@ -957,6 +1094,7 @@ def get_commits(owner, repo_name, headers, since=None): break return commits + def get_repository_pull_requests(owner, repo_name, headers, since=None): """ Retrieves pull requests for a repository. @@ -971,16 +1109,16 @@ def get_repository_pull_requests(owner, repo_name, headers, since=None): list: A list of pull requests. 
""" pull_requests = [] - url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/pulls" + url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/pulls' params = {'state': 'all', 'per_page': 100} if since: params['since'] = since while url: - logger.debug(f"Fetching pull requests for repository: {owner}/{repo_name}") + logger.debug(f'Fetching pull requests for repository: {owner}/{repo_name}') try: pr_data, headers_response = github_api_request(url, headers, params) except Exception as e: - logger.error(f"Error fetching pull requests: {e}") + logger.error(f'Error fetching pull requests: {e}') break if pr_data: pull_requests.extend(pr_data) @@ -991,6 +1129,7 @@ def get_repository_pull_requests(owner, repo_name, headers, since=None): break return pull_requests + def get_active_contributors(commits): """ Retrieves active contributors from the list of commits. @@ -1008,6 +1147,7 @@ def get_active_contributors(commits): contributors.add(author['login']) return contributors + def calculate_average_time_to_close_issues(issues): """ Calculates the average time to close issues. @@ -1021,15 +1161,18 @@ def calculate_average_time_to_close_issues(issues): closed_durations = [] for issue in issues: if issue['state'] == 'closed': - created_at = datetime.strptime(issue['created_at'], "%Y-%m-%dT%H:%M:%SZ") - closed_at = datetime.strptime(issue['closed_at'], "%Y-%m-%dT%H:%M:%SZ") - duration = (closed_at - created_at).total_seconds() / 3600 # Duration in hours + created_at = datetime.strptime(issue['created_at'], '%Y-%m-%dT%H:%M:%SZ') + closed_at = datetime.strptime(issue['closed_at'], '%Y-%m-%dT%H:%M:%SZ') + duration = ( + closed_at - created_at + ).total_seconds() / 3600 # Duration in hours closed_durations.append(duration) if closed_durations: return sum(closed_durations) / len(closed_durations) else: return None + def calculate_average_time_to_merge_prs(pull_requests): """ Calculates the average time to merge pull requests. 
@@ -1043,15 +1186,18 @@ def calculate_average_time_to_merge_prs(pull_requests): merged_durations = [] for pr in pull_requests: if pr.get('merged_at'): - created_at = datetime.strptime(pr['created_at'], "%Y-%m-%dT%H:%M:%SZ") - merged_at = datetime.strptime(pr['merged_at'], "%Y-%m-%dT%H:%M:%SZ") - duration = (merged_at - created_at).total_seconds() / 3600 # Duration in hours + created_at = datetime.strptime(pr['created_at'], '%Y-%m-%dT%H:%M:%SZ') + merged_at = datetime.strptime(pr['merged_at'], '%Y-%m-%dT%H:%M:%SZ') + duration = ( + merged_at - created_at + ).total_seconds() / 3600 # Duration in hours merged_durations.append(duration) if merged_durations: return sum(merged_durations) / len(merged_durations) else: return None + def get_discussion_activity_count(owner, repo_name, headers, since_date): """ Counts comments on issues and pull requests within the time window. @@ -1070,18 +1216,25 @@ def get_discussion_activity_count(owner, repo_name, headers, since_date): issues = get_repository_issues(owner, repo_name, headers, since=since_date) for issue in issues: comments = get_issue_comments(owner, repo_name, issue['number'], headers) - issues_comments_count += len([comment for comment in comments if comment.get('created_at') >= since_date]) + issues_comments_count += len( + [comment for comment in comments if comment.get('created_at') >= since_date] + ) # Count comments on pull requests prs_comments_count = 0 - pull_requests = get_repository_pull_requests(owner, repo_name, headers, since=since_date) + pull_requests = get_repository_pull_requests( + owner, repo_name, headers, since=since_date + ) for pr in pull_requests: comments = get_pull_request_comments(owner, repo_name, pr['number'], headers) - prs_comments_count += len([comment for comment in comments if comment.get('created_at') >= since_date]) + prs_comments_count += len( + [comment for comment in comments if comment.get('created_at') >= since_date] + ) total_comments = issues_comments_count + 
prs_comments_count return total_comments + def calculate_activity_score(metrics, weights): """ Calculates the activity score based on metrics and weights. @@ -1102,12 +1255,12 @@ def calculate_activity_score(metrics, weights): 'avg_issue_close_time': 24, # In hours, lower is better 'recent_prs_opened_count': 100, 'recent_prs_merged_count': 100, - 'avg_pr_merge_time': 24, # In hours, lower is better + 'avg_pr_merge_time': 24, # In hours, lower is better 'stars_growth': 1000, 'forks_growth': 500, 'recent_releases_count': 20, 'total_downloads_recent': 10000, - 'discussion_activity_count': 500 + 'discussion_activity_count': 500, } # Normalize metrics @@ -1133,6 +1286,7 @@ def calculate_activity_score(metrics, weights): activity_score = min(max(activity_score, 1), 100) return activity_score + def contains_university_identifier(text, university_identifiers): """ Checks if the text contains any of the university identifiers. @@ -1150,7 +1304,10 @@ def contains_university_identifier(text, university_identifiers): return True return False -def count_university_identifier_occurrences(text, university_identifiers, points_per_occurrence): + +def count_university_identifier_occurrences( + text, university_identifiers, points_per_occurrence +): """ Counts the occurrences of university identifiers in the text and calculates points. @@ -1172,6 +1329,7 @@ def count_university_identifier_occurrences(text, university_identifiers, points points += count * points_value return points, matches + def analyze_contributors_for_affiliation(contributors, university_details): """ Analyzes contributors for affiliation with the university. 
@@ -1189,7 +1347,7 @@ def analyze_contributors_for_affiliation(contributors, university_details): matches = { 'email': {'contributors': [], 'points': 0}, 'profile': {'contributors': [], 'points': 0}, - 'other_repos': {'contributors': [], 'points': 0} + 'other_repos': {'contributors': [], 'points': 0}, } for contributor in contributors: email = contributor.get('email') or '' @@ -1208,11 +1366,13 @@ def analyze_contributors_for_affiliation(contributors, university_details): if associated_repos: points = 5 * len(associated_repos) other_repos_points += points - matches['other_repos']['contributors'].append({ - 'username': username, - 'repo_count': len(associated_repos), - 'points': points - }) + matches['other_repos']['contributors'].append( + { + 'username': username, + 'repo_count': len(associated_repos), + 'points': points, + } + ) # Record points matches['email']['points'] = email_points matches['profile']['points'] = profile_points @@ -1220,6 +1380,7 @@ def analyze_contributors_for_affiliation(contributors, university_details): total_points = email_points + profile_points + other_repos_points return total_points, matches + def analyze_owner_for_affiliation(owner_data, university_details): """ Analyzes the repository owner (organization) for affiliation with the university. 
@@ -1242,13 +1403,15 @@ def analyze_owner_for_affiliation(owner_data, university_details): org_email = owner_data.get('email', '') org_location = owner_data.get('location', '') # Ensure all items are strings - text_to_check = ' '.join([ - org_name or '', - org_description or '', - org_blog or '', - org_email or '', - org_location or '' - ]).lower() + text_to_check = ' '.join( + [ + org_name or '', + org_description or '', + org_blog or '', + org_email or '', + org_location or '', + ] + ).lower() points_per_occurrence = {k: 30 for k in university_details['identifiers']} points, owner_matches = count_university_identifier_occurrences( text_to_check, university_details['identifiers'], points_per_occurrence @@ -1256,7 +1419,18 @@ def analyze_owner_for_affiliation(owner_data, university_details): matches = owner_matches return points, matches, owner_type == 'Organization' -def analyze_repository(repo_info, university_details, keywords, university_email_domain, idx, headers, time_window, weights, hierarchical_keywords): + +def analyze_repository( + repo_info, + university_details, + keywords, + university_email_domain, + idx, + headers, + time_window, + weights, + hierarchical_keywords, +): """ Analyzes a repository for various metrics and information. 
@@ -1280,27 +1454,27 @@ def analyze_repository(repo_info, university_details, keywords, university_email repo_name = repo.get('name') description = repo.get('description') or '' topics = repo.get('topics', []) - readme_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/readme" - logger.info(f"Analyzing repository [{idx}]: {repo_full_name}") + readme_url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/readme' + logger.info(f'Analyzing repository [{idx}]: {repo_full_name}') # Fetch README content try: readme_data, _ = github_api_request(readme_url, headers) except Exception as e: - logger.warning(f"Could not retrieve README for {repo_full_name}: {e}") + logger.warning(f'Could not retrieve README for {repo_full_name}: {e}') readme_data = None readme_content = '' if readme_data and readme_data.get('content'): - readme_content = base64.b64decode( - readme_data.get('content') - ).decode('utf-8', errors='ignore') + readme_content = base64.b64decode(readme_data.get('content')).decode( + 'utf-8', errors='ignore' + ) # Get list of files in the repository - contents_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/contents" + contents_url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/contents' try: contents, _ = github_api_request(contents_url, headers) except Exception as e: - logger.warning(f"Could not retrieve contents for {repo_full_name}: {e}") + logger.warning(f'Could not retrieve contents for {repo_full_name}: {e}') contents = None files = [] if contents and isinstance(contents, list): @@ -1316,15 +1490,19 @@ def analyze_repository(repo_info, university_details, keywords, university_email is_scientific = contains_keywords(description + ' ' + readme_content, keywords) # Collect repository text for keyword matching - repo_text = ' '.join([ - repo_name or '', - description or '', - ' '.join(topics) or '', - readme_content or '' - ]) + repo_text = ' '.join( + [ + repo_name or '', + description or '', + ' '.join(topics) or '', + readme_content or '', + ] + ) # Match 
repository text against hierarchical keywords - hierarchical_scores, matched_keywords = match_repository_keywords(repo_text, hierarchical_keywords) + hierarchical_scores, matched_keywords = match_repository_keywords( + repo_text, hierarchical_keywords + ) # Determine the highest scores def get_highest_score(scores_dict): @@ -1340,10 +1518,7 @@ def get_highest_score(scores_dict): # Initialize total points and matches for confidence score total_points = 0 - matches = { - 'total_points': 0, - 'details': {} - } + matches = {'total_points': 0, 'details': {}} # Define university identifiers university_identifiers = { @@ -1365,10 +1540,7 @@ def get_highest_score(scores_dict): repo_text_lower, university_identifiers, points_per_occurrence ) total_points += points - matches['details']['repo_identifiers'] = { - 'matches': repo_matches, - 'points': points - } + matches['details']['repo_identifiers'] = {'matches': repo_matches, 'points': points} # Check repository topics topics_text = ' '.join(topics).lower() @@ -1376,10 +1548,7 @@ def get_highest_score(scores_dict): topics_text, university_identifiers, points_per_occurrence ) total_points += points - matches['details']['repo_topics'] = { - 'matches': topics_matches, - 'points': points - } + matches['details']['repo_topics'] = {'matches': topics_matches, 'points': points} # Check README content readme_text = readme_content.lower() @@ -1387,17 +1556,14 @@ def get_highest_score(scores_dict): readme_text, university_identifiers, points_per_occurrence ) total_points += points - matches['details']['readme'] = { - 'matches': readme_matches, - 'points': points - } + matches['details']['readme'] = {'matches': readme_matches, 'points': points} # Fetch repository owner data owner_url = repo['owner']['url'] try: owner_data, _ = github_api_request(owner_url, headers) except Exception as e: - logger.warning(f"Could not retrieve owner data for {repo_full_name}: {e}") + logger.warning(f'Could not retrieve owner data for {repo_full_name}: 
{e}') owner_data = {} owner_type = owner_data.get('type', 'User') @@ -1406,24 +1572,26 @@ def get_highest_score(scores_dict): owner_affiliation_matches = {} is_owner_org = owner_type == 'Organization' if is_owner_org: - points, owner_matches, _ = analyze_owner_for_affiliation(owner_data, university_details) + points, owner_matches, _ = analyze_owner_for_affiliation( + owner_data, university_details + ) owner_points += points total_points += owner_points matches['details']['owner_organization'] = { 'matches': owner_matches, - 'points': points + 'points': points, } # If owner is affiliated organization, assign high confidence if owner_points > 0: total_points += 500 # Assign high confidence matches['details']['repo_under_university_org'] = { 'matched': True, - 'points': 500 + 'points': 500, } else: matches['details']['repo_under_university_org'] = { 'matched': False, - 'points': 0 + 'points': 0, } else: # Owner is a user; check owner's profile @@ -1434,22 +1602,30 @@ def get_highest_score(scores_dict): total_points += points matches['details']['owner_profile'] = { 'matches': owner_matches, - 'points': points + 'points': points, } matches['details']['repo_under_university_org'] = { 'matched': False, - 'points': 0 + 'points': 0, } # Analyze contributors for affiliation contributors = get_contributors(owner, repo_name, headers) contributors_count = len(contributors) - logger.debug(f"Number of contributors found: {contributors_count}") - logger.debug(f"Analyzing contributors for repository: {repo_full_name}") - contributor_details = analyze_contributors(contributors, university_email_domain, university_details['name'], keywords, headers) + logger.debug(f'Number of contributors found: {contributors_count}') + logger.debug(f'Analyzing contributors for repository: {repo_full_name}') + contributor_details = analyze_contributors( + contributors, + university_email_domain, + university_details['name'], + keywords, + headers, + ) # Analyze contributors for confidence score - 
contributor_points, contributor_matches = analyze_contributors_for_affiliation(contributor_details, university_details) + contributor_points, contributor_matches = analyze_contributors_for_affiliation( + contributor_details, university_details + ) total_points += contributor_points matches['details']['contributors'] = contributor_matches @@ -1464,13 +1640,22 @@ def get_highest_score(scores_dict): license_name = license_info.get('name', 'No license') # Calculate 'since_date' based on the provided 'time_window' - since_date = (datetime.now(timezone.utc) - timedelta(days=time_window * 30)).strftime('%Y-%m-%dT%H:%M:%SZ') + since_date = ( + datetime.now(timezone.utc) - timedelta(days=time_window * 30) + ).strftime('%Y-%m-%dT%H:%M:%SZ') # Fetch all issues for total counts all_issues = get_repository_issues(owner, repo_name, headers) # Issue analysis using all issues - issues_analysis = analyze_issues(all_issues, owner, repo_name, headers, university_email_domain, university_details['name']) + issues_analysis = analyze_issues( + all_issues, + owner, + repo_name, + headers, + university_email_domain, + university_details['name'], + ) # Exclude 'collaboration_opportunities' from output issues_analysis_output = issues_analysis.copy() @@ -1484,8 +1669,16 @@ def get_highest_score(scores_dict): # Fetch recent issues for activity metrics recent_issues = get_repository_issues(owner, repo_name, headers, since=since_date) - recent_issues_opened_count = len([issue for issue in recent_issues if issue.get('created_at') >= since_date]) - recent_issues_closed_count = len([issue for issue in recent_issues if issue.get('closed_at') and issue['closed_at'] >= since_date]) + recent_issues_opened_count = len( + [issue for issue in recent_issues if issue.get('created_at') >= since_date] + ) + recent_issues_closed_count = len( + [ + issue + for issue in recent_issues + if issue.get('closed_at') and issue['closed_at'] >= since_date + ] + ) avg_issue_close_time = 
calculate_average_time_to_close_issues(recent_issues) # Fetch recent commits @@ -1493,9 +1686,19 @@ def get_highest_score(scores_dict): recent_commits_count = len(recent_commits) # Fetch recent pull requests - recent_pull_requests = get_repository_pull_requests(owner, repo_name, headers, since=since_date) - recent_prs_opened_count = len([pr for pr in recent_pull_requests if pr.get('created_at') >= since_date]) - recent_prs_merged_count = len([pr for pr in recent_pull_requests if pr.get('merged_at') and pr['merged_at'] >= since_date]) + recent_pull_requests = get_repository_pull_requests( + owner, repo_name, headers, since=since_date + ) + recent_prs_opened_count = len( + [pr for pr in recent_pull_requests if pr.get('created_at') >= since_date] + ) + recent_prs_merged_count = len( + [ + pr + for pr in recent_pull_requests + if pr.get('merged_at') and pr['merged_at'] >= since_date + ] + ) avg_pr_merge_time = calculate_average_time_to_merge_prs(recent_pull_requests) # Fetch active contributors @@ -1503,12 +1706,16 @@ def get_highest_score(scores_dict): active_contributors_count = len(active_contributors) # Fetch recent releases - recent_releases = get_repository_releases(owner, repo_name, headers, since=since_date) + recent_releases = get_repository_releases( + owner, repo_name, headers, since=since_date + ) recent_releases_count = len(recent_releases) total_downloads_recent = get_release_downloads(owner, repo_name, headers) # Collect discussion activity - discussion_activity_count = get_discussion_activity_count(owner, repo_name, headers, since_date) + discussion_activity_count = get_discussion_activity_count( + owner, repo_name, headers, since_date + ) # For stars and forks growth, GitHub API doesn't provide historical data stars_count = repo.get('stargazers_count', 0) @@ -1531,25 +1738,37 @@ def get_highest_score(scores_dict): 'forks_growth': forks_growth, 'recent_releases_count': recent_releases_count, 'total_downloads_recent': total_downloads_recent, - 
'discussion_activity_count': discussion_activity_count + 'discussion_activity_count': discussion_activity_count, } # Calculate activity score activity_score = calculate_activity_score(activity_metrics, weights) # Last commit date - last_commit_date = recent_commits[0]['commit']['committer']['date'] if recent_commits else 'No recent commits' + last_commit_date = ( + recent_commits[0]['commit']['committer']['date'] + if recent_commits + else 'No recent commits' + ) # Check for documentation files has_readme = bool(readme_data) - code_of_conduct_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/community/code_of_conduct" + code_of_conduct_url = ( + f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/community/code_of_conduct' + ) try: code_of_conduct, _ = github_api_request(code_of_conduct_url, headers) except Exception as e: - logger.warning(f"Could not retrieve code of conduct for {repo_full_name}: {e}") + logger.warning(f'Could not retrieve code of conduct for {repo_full_name}: {e}') code_of_conduct = None has_code_of_conduct = code_of_conduct is not None and 'url' in code_of_conduct - files_to_check = ['citation.cff', 'CONTRIBUTING.md', 'GOVERNANCE.md', 'FUNDING.yml', 'funding.json'] + files_to_check = [ + 'citation.cff', + 'CONTRIBUTING.md', + 'GOVERNANCE.md', + 'FUNDING.yml', + 'funding.json', + ] documentation = {file: False for file in files_to_check} if contents and isinstance(contents, list): for content in contents: @@ -1558,10 +1777,14 @@ def get_highest_score(scores_dict): # Lead institution affiliations = [c['affiliation'] for c in contributor_details if c['affiliation']] - lead_institution = max(set(affiliations), key=affiliations.count) if affiliations else 'Unknown' + lead_institution = ( + max(set(affiliations), key=affiliations.count) if affiliations else 'Unknown' + ) # External impact - external_contributors = [c for c in contributor_details if c['affiliation'] != university_details['name']] + external_contributors = [ + c for c in contributor_details 
if c['affiliation'] != university_details['name'] + ] external_impact = len(external_contributors) # Calculate association score @@ -1572,11 +1795,11 @@ def get_highest_score(scores_dict): open_issues_count = repo.get('open_issues_count', 0) # Fetch Languages - languages_url = f"{GITHUB_API_URL}/repos/{owner}/{repo_name}/languages" + languages_url = f'{GITHUB_API_URL}/repos/{owner}/{repo_name}/languages' try: languages_data, _ = github_api_request(languages_url, headers) except Exception as e: - logger.warning(f"Could not retrieve languages for {repo_full_name}: {e}") + logger.warning(f'Could not retrieve languages for {repo_full_name}: {e}') languages_data = None if languages_data: total_bytes = sum(languages_data.values()) @@ -1585,9 +1808,7 @@ def get_highest_score(scores_dict): for language, bytes_count in languages_data.items() } sorted_languages = sorted( - languages_percentages.items(), - key=lambda item: item[1], - reverse=True + languages_percentages.items(), key=lambda item: item[1], reverse=True ) main_language = sorted_languages[0][0] if sorted_languages else 'Unknown' else: @@ -1634,11 +1855,14 @@ def get_highest_score(scores_dict): 'subfield': subfield, 'topic': topic, 'matched_keywords': matched_keywords, - 'hierarchical_scores': hierarchical_scores + 'hierarchical_scores': hierarchical_scores, } - logger.info(f"Repository analyzed: {repo_full_name} with confidence score {confidence_score:.2f} and activity score {activity_score:.2f}") + logger.info( + f'Repository analyzed: {repo_full_name} with confidence score {confidence_score:.2f} and activity score {activity_score:.2f}' + ) return repo_data + def write_to_csv(all_repo_data, output_filename_csv): """ Writes repository data to a CSV file with separate columns for documentation files. 
@@ -1712,7 +1936,7 @@ def write_to_csv(all_repo_data, output_filename_csv): 'forks_growth', 'recent_releases_count', 'total_downloads_recent', - 'discussion_activity_count' + 'discussion_activity_count', ] # Open the CSV file for writing with open(output_filename_csv, 'w', newline='', encoding='utf-8') as csvfile: @@ -1721,11 +1945,18 @@ def write_to_csv(all_repo_data, output_filename_csv): for repo_data in all_repo_data: # Prepare project type scores as a string project_type_scores_str = '; '.join( - [f"{key}: {value}" for key, value in repo_data['project_type_scores'].items()] + [ + f'{key}: {value}' + for key, value in repo_data['project_type_scores'].items() + ] ) # Prepare project type matches as a string project_type_matches_str = '; '.join( - [f"{key}: {', '.join(value)}" for key, value in repo_data['project_type_matches'].items() if value] + [ + f'{key}: {", ".join(value)}' + for key, value in repo_data['project_type_matches'].items() + if value + ] ) # Extract issues analysis issues_analysis = repo_data['issues_analysis'] @@ -1733,7 +1964,12 @@ def write_to_csv(all_repo_data, output_filename_csv): pr_analysis = repo_data['pr_analysis'] # Prepare languages_percentages as a string languages_percentages_str = '; '.join( - [f"{language}: {percentage:.2f}%" for language, percentage in repo_data['languages_percentages'].items()] + [ + f'{language}: {percentage:.2f}%' + for language, percentage in repo_data[ + 'languages_percentages' + ].items() + ] ) # Prepare confidence matches as a string confidence_matches_str = json.dumps(repo_data['confidence_matches']) @@ -1779,15 +2015,23 @@ def write_to_csv(all_repo_data, output_filename_csv): 'closed_issues': issues_analysis['closed_issues'], 'average_time_to_close': issues_analysis['average_time_to_close'], 'issue_update_frequency': issues_analysis['issue_update_frequency'], - 'external_participants_count': len(issues_analysis['external_participants']), - 'external_participants': '; 
'.join(issues_analysis['external_participants']), + 'external_participants_count': len( + issues_analysis['external_participants'] + ), + 'external_participants': '; '.join( + issues_analysis['external_participants'] + ), 'total_prs': pr_analysis['total_prs'], 'open_prs': pr_analysis['open_prs'], 'closed_prs': pr_analysis['closed_prs'], 'average_time_to_merge': pr_analysis['average_time_to_merge'], 'pr_update_frequency': pr_analysis['pr_update_frequency'], - 'average_time_to_first_review': pr_analysis.get('average_time_to_first_review'), - 'review_to_merge_percentage': pr_analysis.get('review_to_merge_percentage'), + 'average_time_to_first_review': pr_analysis.get( + 'average_time_to_first_review' + ), + 'review_to_merge_percentage': pr_analysis.get( + 'review_to_merge_percentage' + ), 'main_language': repo_data['main_language'], 'languages_percentages': languages_percentages_str, 'stars_count': repo_data['stars_count'], @@ -1797,21 +2041,36 @@ def write_to_csv(all_repo_data, output_filename_csv): 'total_downloads': repo_data['total_downloads'], 'activity_score': repo_data.get('activity_score'), 'recent_commits_count': activity_metrics.get('recent_commits_count'), - 'active_contributors_count': activity_metrics.get('active_contributors_count'), - 'recent_issues_opened_count': activity_metrics.get('recent_issues_opened_count'), - 'recent_issues_closed_count': activity_metrics.get('recent_issues_closed_count'), + 'active_contributors_count': activity_metrics.get( + 'active_contributors_count' + ), + 'recent_issues_opened_count': activity_metrics.get( + 'recent_issues_opened_count' + ), + 'recent_issues_closed_count': activity_metrics.get( + 'recent_issues_closed_count' + ), 'avg_issue_close_time': activity_metrics.get('avg_issue_close_time'), - 'recent_prs_opened_count': activity_metrics.get('recent_prs_opened_count'), - 'recent_prs_merged_count': activity_metrics.get('recent_prs_merged_count'), + 'recent_prs_opened_count': activity_metrics.get( + 
'recent_prs_opened_count' + ), + 'recent_prs_merged_count': activity_metrics.get( + 'recent_prs_merged_count' + ), 'avg_pr_merge_time': activity_metrics.get('avg_pr_merge_time'), 'stars_growth': activity_metrics.get('stars_growth'), 'forks_growth': activity_metrics.get('forks_growth'), 'recent_releases_count': activity_metrics.get('recent_releases_count'), - 'total_downloads_recent': activity_metrics.get('total_downloads_recent'), - 'discussion_activity_count': activity_metrics.get('discussion_activity_count') + 'total_downloads_recent': activity_metrics.get( + 'total_downloads_recent' + ), + 'discussion_activity_count': activity_metrics.get( + 'discussion_activity_count' + ), } writer.writerow(row) - logger.info(f"CSV data written to {output_filename_csv}") + logger.info(f'CSV data written to {output_filename_csv}') + def convert_sets_to_lists(obj): """ @@ -1832,6 +2091,7 @@ def convert_sets_to_lists(obj): else: return obj + def get_user_input(prompt): """ Prompts the user for input and ensures it's not empty. @@ -1847,7 +2107,8 @@ def get_user_input(prompt): if user_input: return user_input else: - print("Input cannot be empty. Please try again.") + print('Input cannot be empty. 
Please try again.') + def main(): """ @@ -1856,15 +2117,25 @@ def main(): start_time = time.time() # Parse command-line arguments (excluding --activity-metric) - parser = argparse.ArgumentParser(description='University Repository Analysis Script') - parser.add_argument('--limit', '-l', type=int, help='Limit processing to the first N repositories') + parser = argparse.ArgumentParser( + description='University Repository Analysis Script' + ) + parser.add_argument( + '--limit', '-l', type=int, help='Limit processing to the first N repositories' + ) args, unknown = parser.parse_known_args() # User input - university_name = get_user_input("Enter the university name (e.g., 'University of California, Santa Cruz'): ") + university_name = get_user_input( + "Enter the university name (e.g., 'University of California, Santa Cruz'): " + ) university_acronym = get_user_input("Enter the university acronym (e.g., 'UCSC'): ") - university_email_domain = get_user_input("Enter the university email domain (e.g., 'ucsc.edu'): ") - university_website_url = get_user_input("Enter the university website URL (e.g., 'ucsc.edu'): ") + university_email_domain = get_user_input( + "Enter the university email domain (e.g., 'ucsc.edu'): " + ) + university_website_url = get_user_input( + "Enter the university website URL (e.g., 'ucsc.edu'): " + ) additional_queries = [] while True: query = input("Enter an additional query (or 'n' to stop): ").strip() @@ -1878,17 +2149,21 @@ def main(): '2': {'name': 'Set your own', 'key': 'custom'}, } - print("\nChoose the activity metric:") + print('\nChoose the activity metric:') for number, option in activity_metric_options.items(): - print(f"{number}. {option['name']}") + print(f'{number}. 
{option["name"]}') while True: - activity_metric_choice = get_user_input("Enter the number of your choice: ").strip() + activity_metric_choice = get_user_input( + 'Enter the number of your choice: ' + ).strip() if activity_metric_choice in activity_metric_options: - args.activity_metric = activity_metric_options[activity_metric_choice]['key'] + args.activity_metric = activity_metric_options[activity_metric_choice][ + 'key' + ] break else: - print("Please enter a valid number from the options above.") + print('Please enter a valid number from the options above.') # Assign to args.activity_metric args.activity_metric = activity_metric_choice @@ -1897,11 +2172,13 @@ def main(): load_dotenv() github_token = os.getenv('GITHUB_TOKEN') if not github_token: - logger.error("GITHUB_TOKEN not found in .env file. Please create a .env file with your GitHub token.") + logger.error( + 'GITHUB_TOKEN not found in .env file. Please create a .env file with your GitHub token.' + ) exit(1) headers = { 'Authorization': f'token {github_token}', - 'Accept': 'application/vnd.github.v3+json' + 'Accept': 'application/vnd.github.v3+json', } # Load keywords @@ -1914,81 +2191,117 @@ def main(): query_terms = [ f'"{university_name}" in:name,description,readme', f'"{university_acronym}" in:name,description,readme', - f'"{university_email_domain}" in:email' + f'"{university_email_domain}" in:email', ] + additional_queries # Define the available metrics and their default weights available_metrics = { 'recent_commits_count': {'name': 'Recent Commits Count', 'default_weight': 20}, - 'active_contributors_count': {'name': 'Active Contributors Count', 'default_weight': 15}, - 'recent_issues_opened_count': {'name': 'Recent Issues Opened Count', 'default_weight': 10}, - 'recent_issues_closed_count': {'name': 'Recent Issues Closed Count', 'default_weight': 10}, - 'avg_issue_close_time': {'name': 'Average Time to Close Issues', 'default_weight': 5}, - 'recent_prs_opened_count': {'name': 'Recent PRs Opened 
Count', 'default_weight': 10}, - 'recent_prs_merged_count': {'name': 'Recent PRs Merged Count', 'default_weight': 10}, + 'active_contributors_count': { + 'name': 'Active Contributors Count', + 'default_weight': 15, + }, + 'recent_issues_opened_count': { + 'name': 'Recent Issues Opened Count', + 'default_weight': 10, + }, + 'recent_issues_closed_count': { + 'name': 'Recent Issues Closed Count', + 'default_weight': 10, + }, + 'avg_issue_close_time': { + 'name': 'Average Time to Close Issues', + 'default_weight': 5, + }, + 'recent_prs_opened_count': { + 'name': 'Recent PRs Opened Count', + 'default_weight': 10, + }, + 'recent_prs_merged_count': { + 'name': 'Recent PRs Merged Count', + 'default_weight': 10, + }, 'avg_pr_merge_time': {'name': 'Average Time to Merge PRs', 'default_weight': 5}, 'stars_growth': {'name': 'Growth in Stars', 'default_weight': 5}, 'forks_growth': {'name': 'Growth in Forks', 'default_weight': 5}, 'recent_releases_count': {'name': 'Recent Releases Count', 'default_weight': 5}, - 'total_downloads_recent': {'name': 'Total Downloads in Time Window', 'default_weight': 5}, - 'discussion_activity_count': {'name': 'Discussion Activity Count (Comments on Issues and PRs)', 'default_weight': 0} + 'total_downloads_recent': { + 'name': 'Total Downloads in Time Window', + 'default_weight': 5, + }, + 'discussion_activity_count': { + 'name': 'Discussion Activity Count (Comments on Issues and PRs)', + 'default_weight': 0, + }, } if args.activity_metric == '2': # Get custom time window while True: try: - time_window = int(get_user_input("Enter the number of months to look back: ")) + time_window = int( + get_user_input('Enter the number of months to look back: ') + ) if time_window > 0: break else: - print("Time window must be a positive integer.") + print('Time window must be a positive integer.') except ValueError: - print("Please enter a valid integer.") + print('Please enter a valid integer.') # Initialize weights weights = {} total_percentage = 0 # Display 
the list of metrics before assigning weights - print("\nYou will be assigning weights to the following metrics:") + print('\nYou will be assigning weights to the following metrics:') for metric_key, metric_info in available_metrics.items(): - print(f"- {metric_info['name']}") + print(f'- {metric_info["name"]}') - print("\nPlease assign percentages to the following metrics. The total must sum up to 100%.") + print( + '\nPlease assign percentages to the following metrics. The total must sum up to 100%.' + ) for metric_key, metric_info in available_metrics.items(): remaining_percentage = 100 - total_percentage while True: try: - prompt_message = f"Enter percentage for {metric_info['name']} (remaining {remaining_percentage}%): " + prompt_message = f'Enter percentage for {metric_info["name"]} (remaining {remaining_percentage}%): ' percentage = float(get_user_input(prompt_message)) if 0 <= percentage <= remaining_percentage: weights[metric_key] = percentage / 100 # Convert to decimal total_percentage += percentage break else: - print(f"Please enter a value between 0 and {remaining_percentage}.") + print( + f'Please enter a value between 0 and {remaining_percentage}.' + ) except ValueError: - print("Please enter a valid number.") + print('Please enter a valid number.') if total_percentage != 100: - print("Percentages do not sum up to 100%. Please run the script again and ensure the total sums to 100%.") + print( + 'Percentages do not sum up to 100%. Please run the script again and ensure the total sums to 100%.' 
+ ) exit(1) else: # Use default OSSci Activity Metric time_window = 6 # Default time window in months # Extract default weights and convert to decimals - weights = {key: info['default_weight'] / 100 for key, info in available_metrics.items()} + weights = { + key: info['default_weight'] / 100 for key, info in available_metrics.items() + } # Search repositories repositories = search_repositories_with_queries(query_terms, headers) - logger.info(f"Total repositories found: {len(repositories)}") + logger.info(f'Total repositories found: {len(repositories)}') # Limit processing if --limit flag is set if args.limit: limit_count = args.limit - logger.info(f"Limiting processing to the first {limit_count} repositories due to --limit flag.") + logger.info( + f'Limiting processing to the first {limit_count} repositories due to --limit flag.' + ) # Convert repositories dictionary to a list of items and take the first N repositories_items = list(repositories.items())[:limit_count] else: @@ -2005,16 +2318,20 @@ def main(): university_acronym.lower(): {'points': 20}, university_email_domain.lower(): {'points': 30}, university_website_url.lower(): {'points': 20}, - } + }, } # Analyze repositories with a progress bar all_repo_data = [] total_repos = len(repositories_items) - with tqdm(total=total_repos, desc='Analyzing Repositories', unit='repo', position=0) as pbar: + with tqdm( + total=total_repos, desc='Analyzing Repositories', unit='repo', position=0 + ) as pbar: for idx, (repo_id, repo_info) in enumerate(repositories_items, start=1): - logger.info(f"Processing repository {idx}/{total_repos}: {repo_info['repo_data'].get('full_name', '')}") + logger.info( + f'Processing repository {idx}/{total_repos}: {repo_info["repo_data"].get("full_name", "")}' + ) repo_data = analyze_repository( repo_info, university_details, @@ -2024,7 +2341,7 @@ def main(): headers, time_window, weights, - hierarchical_keywords + hierarchical_keywords, ) all_repo_data.append(repo_data) pbar.update(1) @@ 
-2033,13 +2350,13 @@ def main(): all_repo_data_serializable = convert_sets_to_lists(all_repo_data) # Output results - output_filename_json = f"repository_data_{university_details['acronym']}.json" + output_filename_json = f'repository_data_{university_details["acronym"]}.json' with open(output_filename_json, 'w', encoding='utf-8') as f: json.dump(all_repo_data_serializable, f, ensure_ascii=False, indent=4) - logger.info(f"JSON data written to {output_filename_json}") + logger.info(f'JSON data written to {output_filename_json}') # Write to CSV - output_filename_csv = f"repository_data_{university_details['acronym']}.csv" + output_filename_csv = f'repository_data_{university_details["acronym"]}.csv' write_to_csv(all_repo_data_serializable, output_filename_csv) # Print the output if limited @@ -2048,7 +2365,8 @@ def main(): end_time = time.time() total_runtime = end_time - start_time - logger.info(f"Total runtime: {total_runtime:.2f} seconds") + logger.info(f'Total runtime: {total_runtime:.2f} seconds') + -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/backend/api/__init__.py b/backend/api/__init__.py index fe2f6f0..2ab1e30 100644 --- a/backend/api/__init__.py +++ b/backend/api/__init__.py @@ -1 +1 @@ -# Makes 'api' a Python package \ No newline at end of file +# Makes 'api' a Python package diff --git a/backend/api/deps.py b/backend/api/deps.py index e6a27e4..cf4a4a7 100644 --- a/backend/api/deps.py +++ b/backend/api/deps.py @@ -14,11 +14,12 @@ # Import the actual database session generator and SessionLocal factory # from the data layer. -from backend.data.database import SessionLocal, get_db +from backend.data.database import get_db # Logger for this module logger = logging.getLogger(__name__) + # --- Database Session Dependency --- def get_db_session() -> Generator[Session, None, None]: """ @@ -40,6 +41,7 @@ def get_db_session() -> Generator[Session, None, None]: # to the imported `get_db` generator function. 
yield from get_db() + # --- Example Usage in an Endpoint --- # # from fastapi import Depends, APIRouter @@ -62,4 +64,4 @@ def get_db_session() -> Generator[Session, None, None]: # # handled by the dependency mechanism thanks to the context manager # # or generator structure in `database.get_db`. # logger.info(f"Received database session: {db}") -# return {"message": "Items would be read here using the db session"} \ No newline at end of file +# return {"message": "Items would be read here using the db session"} diff --git a/backend/api/v1/__init__.py b/backend/api/v1/__init__.py index c97f7e7..ea67cbf 100644 --- a/backend/api/v1/__init__.py +++ b/backend/api/v1/__init__.py @@ -1 +1 @@ -# Makes 'v1' a Python package \ No newline at end of file +# Makes 'v1' a Python package diff --git a/backend/api/v1/api.py b/backend/api/v1/api.py index e955081..dcffb6b 100644 --- a/backend/api/v1/api.py +++ b/backend/api/v1/api.py @@ -19,7 +19,9 @@ from .endpoints import shared_recipes from .endpoints import affiliation_algorithms from .endpoints import history -from .endpoints import discovery_algorithms # Handles discovery algorithm related operations +from .endpoints import ( + discovery_algorithms, +) # Handles discovery algorithm related operations # Main router instance for API version 1. 
@@ -39,10 +41,22 @@ # Routes for search functionalities across the application data api_router.include_router(search.router, prefix="/search", tags=["Search"]) # Routes for managing shared analysis recipes or configurations -api_router.include_router(shared_recipes.router, prefix="/shared-recipes", tags=["Shared Analysis Recipes"]) +api_router.include_router( + shared_recipes.router, prefix="/shared-recipes", tags=["Shared Analysis Recipes"] +) # Routes for managing and executing repository-institution affiliation algorithms -api_router.include_router(affiliation_algorithms.router, prefix="/affiliation-algorithms", tags=["Affiliation Algorithms"]) +api_router.include_router( + affiliation_algorithms.router, + prefix="/affiliation-algorithms", + tags=["Affiliation Algorithms"], +) # Routes for accessing history of ingestion tasks -api_router.include_router(history.router, prefix="/ingestion-history", tags=["Ingestion History"]) +api_router.include_router( + history.router, prefix="/ingestion-history", tags=["Ingestion History"] +) # Routes for managing and executing discovery algorithms -api_router.include_router(discovery_algorithms.router, prefix="/discovery-algorithms", tags=["Discovery Algorithms"]) \ No newline at end of file +api_router.include_router( + discovery_algorithms.router, + prefix="/discovery-algorithms", + tags=["Discovery Algorithms"], +) diff --git a/backend/api/v1/endpoints/__init__.py b/backend/api/v1/endpoints/__init__.py index 543468d..62db2f3 100644 --- a/backend/api/v1/endpoints/__init__.py +++ b/backend/api/v1/endpoints/__init__.py @@ -1 +1 @@ -# Makes 'endpoints' a Python package containing specific endpoint routers \ No newline at end of file +# Makes 'endpoints' a Python package containing specific endpoint routers diff --git a/backend/api/v1/endpoints/affiliation_algorithms.py b/backend/api/v1/endpoints/affiliation_algorithms.py index c979d97..2ac46c3 100644 --- a/backend/api/v1/endpoints/affiliation_algorithms.py +++ 
b/backend/api/v1/endpoints/affiliation_algorithms.py @@ -16,15 +16,23 @@ # Internal dependencies for database session management, configuration, and utilities from backend.api.deps import get_db_session from backend.config.settings import settings + # Uses generalized discover_recipes and specific dir constant -from backend.utils.recipe_utils import discover_recipes, CONTRIB_AFFILIATION_ALGOS_DIR, RecipeMetadata +from backend.utils.recipe_utils import ( + discover_recipes, + CONTRIB_AFFILIATION_ALGOS_DIR, + RecipeMetadata, +) from backend.utils.recipe_executor import execute_recipe + # Import request/response schemas and database repository from backend.schemas.requests import AffiliationExecutionRequest -from backend.schemas.responses import AffiliationExecutionResponse, RecipeMetadataResponse +from backend.schemas.responses import ( + AffiliationExecutionResponse, + RecipeMetadataResponse, +) from backend.data.repositories import RepositoryInstitutionAffiliationRepository # Keep for constant def if needed elsewhere, though not directly used in this endpoint logic -from backend.utils.recipe_utils import PROJECT_ROOT_UTIL # Logger setup for this module logger = logging.getLogger(__name__) @@ -32,6 +40,7 @@ # API Router instance for affiliation algorithms router = APIRouter() + # --- AFFILIATION ALGORITHM DISCOVERY ENDPOINT --- @router.get( "/", @@ -55,23 +64,29 @@ def get_available_affiliation_algorithms(): Raises: HTTPException: 500 Internal Server Error if scanning or parsing fails unexpectedly. 
""" - logger.info(f"Request received: Discover affiliation algorithms from {CONTRIB_AFFILIATION_ALGOS_DIR}") + logger.info( + f"Request received: Discover affiliation algorithms from {CONTRIB_AFFILIATION_ALGOS_DIR}" + ) try: # Utilize the shared recipe discovery utility, specifying the target directory and function name discovered_algorithms = discover_recipes( recipes_base_dir=CONTRIB_AFFILIATION_ALGOS_DIR, - target_function_name="calculate_affiliations" # Target function specific to affiliation logic + target_function_name="calculate_affiliations", # Target function specific to affiliation logic ) # Convert the internal RecipeMetadata objects to the standardized response model - response_data = [RecipeMetadataResponse(**algo.to_dict()) for algo in discovered_algorithms] + response_data = [ + RecipeMetadataResponse(**algo.to_dict()) for algo in discovered_algorithms + ] return response_data - except Exception as e: + except Exception: logger.exception("Error occurred during affiliation algorithm discovery.") # Raise a generic server error if any part of the discovery process fails raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to discover affiliation algorithms." 
+ detail="Failed to discover affiliation algorithms.", ) + + # --- END DISCOVERY ENDPOINT --- @@ -80,13 +95,19 @@ def get_available_affiliation_algorithms(): "/execute/{algorithm_name}/{algorithm_version}", response_model=AffiliationExecutionResponse, summary="Execute an Affiliation Algorithm", - status_code=status.HTTP_200_OK # Use 200 OK as the operation aims for completion and result reporting + status_code=status.HTTP_200_OK, # Use 200 OK as the operation aims for completion and result reporting ) def execute_affiliation_algorithm( - algorithm_name: str = FastApiPath(..., description="Name of the affiliation algorithm to execute."), - algorithm_version: str = FastApiPath(..., description="Version of the affiliation algorithm to execute."), - request_body: AffiliationExecutionRequest = Body(...), # Contains institution_id and algorithm-specific params - db: Session = Depends(get_db_session) # Database session dependency + algorithm_name: str = FastApiPath( + ..., description="Name of the affiliation algorithm to execute." + ), + algorithm_version: str = FastApiPath( + ..., description="Version of the affiliation algorithm to execute." + ), + request_body: AffiliationExecutionRequest = Body( + ... + ), # Contains institution_id and algorithm-specific params + db: Session = Depends(get_db_session), # Database session dependency ): """ Executes a specific affiliation algorithm script identified by its name and version. @@ -124,14 +145,16 @@ def execute_affiliation_algorithm( - 500 Internal Server Error: If database connection is missing, script execution fails, or results cannot be stored in the database. """ - logger.info(f"Request received: Execute affiliation algorithm '{algorithm_name}' version '{algorithm_version}' for institution {request_body.institution_id}") + logger.info( + f"Request received: Execute affiliation algorithm '{algorithm_name}' version '{algorithm_version}' for institution {request_body.institution_id}" + ) # 1. 
Find Algorithm Metadata by scanning the directory again # (Consider caching this discovery result in a production environment for performance) try: discovered_algorithms = discover_recipes( recipes_base_dir=CONTRIB_AFFILIATION_ALGOS_DIR, - target_function_name="calculate_affiliations" + target_function_name="calculate_affiliations", ) algo_meta: RecipeMetadata | None = None # Find the specific algorithm matching the request path parameters @@ -139,193 +162,234 @@ def execute_affiliation_algorithm( if algo.name == algorithm_name and algo.version == algorithm_version: algo_meta = algo break - except Exception as discovery_err: - logger.exception("Error during affiliation algorithm lookup for execution.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to look up affiliation algorithm for execution." - ) + except Exception: + logger.exception("Error during affiliation algorithm lookup for execution.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to look up affiliation algorithm for execution.", + ) # Handle case where the algorithm is not found if not algo_meta: - logger.warning(f"Affiliation algorithm not found: {algorithm_name} v{algorithm_version}") + logger.warning( + f"Affiliation algorithm not found: {algorithm_name} v{algorithm_version}" + ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Affiliation algorithm '{algorithm_name}' version '{algorithm_version}' not found." + detail=f"Affiliation algorithm '{algorithm_name}' version '{algorithm_version}' not found.", ) # 2. 
Parameter Validation against the discovered metadata required_params_from_docstring = {p.name for p in algo_meta.parameters} provided_params_in_body = set(request_body.parameters.keys()) # Parameters injected by the runner or part of the main request body, not the 'parameters' dict - internal_or_request_params = {'db_conn_str', 'institution_id'} + internal_or_request_params = {"db_conn_str", "institution_id"} # Determine which parameters expected by the script's function *must* be in the 'parameters' part of the request body - required_params_for_body = required_params_from_docstring - internal_or_request_params + required_params_for_body = ( + required_params_from_docstring - internal_or_request_params + ) missing_params_in_body = required_params_for_body - provided_params_in_body # Ensure the algorithm's docstring includes 'institution_id' as it's fundamental - if 'institution_id' not in required_params_from_docstring: - logger.error(f"Algorithm {algorithm_name} v{algorithm_version} docstring missing required 'institution_id' parameter definition.") - # Note: This is a developer error in the script, raising 500 as the system can't proceed correctly. - # Alternatively, could raise 422 if treated as a client error trying to use a badly defined script. - # Choosing 500 as it indicates a problem with the algorithm definition itself. - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Algorithm definition for '{algorithm_name}' v'{algorithm_version}' is missing the 'institution_id' parameter." - ) + if "institution_id" not in required_params_from_docstring: + logger.error( + f"Algorithm {algorithm_name} v{algorithm_version} docstring missing required 'institution_id' parameter definition." + ) + # Note: This is a developer error in the script, raising 500 as the system can't proceed correctly. + # Alternatively, could raise 422 if treated as a client error trying to use a badly defined script. 
+ # Choosing 500 as it indicates a problem with the algorithm definition itself. + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Algorithm definition for '{algorithm_name}' v'{algorithm_version}' is missing the 'institution_id' parameter.", + ) # Raise error if any required parameters for the body dict are missing if missing_params_in_body: - logger.warning(f"Missing required parameters in request body 'parameters' field for {algorithm_name} v{algorithm_version}: {missing_params_in_body}") + logger.warning( + f"Missing required parameters in request body 'parameters' field for {algorithm_name} v{algorithm_version}: {missing_params_in_body}" + ) raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Missing required parameters in request body 'parameters' field: {', '.join(missing_params_in_body)}" + detail=f"Missing required parameters in request body 'parameters' field: {', '.join(missing_params_in_body)}", ) # Combine institution ID with other parameters for the script execution context - execution_params = {"institution_id": request_body.institution_id, **request_body.parameters} + execution_params = { + "institution_id": request_body.institution_id, + **request_body.parameters, + } # 3. Get DB Connection String from application settings db_connection_string = settings.DATABASE_URL if not db_connection_string: - logger.error("DATABASE_URL is not configured in settings.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Database connection is not configured." - ) + logger.error("DATABASE_URL is not configured in settings.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Database connection is not configured.", + ) # 4. 
Execute Algorithm Script via the recipe executor utility - logger.info(f"Calling recipe executor for affiliation algorithm: {algo_meta.file_path}") + logger.info( + f"Calling recipe executor for affiliation algorithm: {algo_meta.file_path}" + ) try: # The executor handles running the script's target function in a separate process execution_result = execute_recipe( - recipe_path_relative=algo_meta.file_path, # Path to the script file - recipe_params=execution_params, # Parameters for the script's function - db_conn_str=db_connection_string, # Database connection string - script_type='affiliation', # Type indicator for the executor - function_name='calculate_affiliations' # Target function within the script + recipe_path_relative=algo_meta.file_path, # Path to the script file + recipe_params=execution_params, # Parameters for the script's function + db_conn_str=db_connection_string, # Database connection string + script_type="affiliation", # Type indicator for the executor + function_name="calculate_affiliations", # Target function within the script ) except Exception as exec_api_err: # Catch unexpected errors during the invocation of the executor itself - logger.exception(f"Unexpected error calling recipe executor for {algorithm_name} v{algorithm_version}") + logger.exception( + f"Unexpected error calling recipe executor for {algorithm_name} v{algorithm_version}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to invoke algorithm execution: {exec_api_err}" + detail=f"Failed to invoke algorithm execution: {exec_api_err}", ) # 5. Process Execution Results # Check if the execution itself reported failure if not execution_result or execution_result.get("success") is not True: - error_detail = execution_result.get("error", {"message": "Unknown execution error"}) - logger.error(f"Affiliation algorithm execution failed for {algorithm_name} v{algorithm_version}. 
Error: {error_detail}") + error_detail = execution_result.get( + "error", {"message": "Unknown execution error"} + ) + logger.error( + f"Affiliation algorithm execution failed for {algorithm_name} v{algorithm_version}. Error: {error_detail}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, # Provide the error message from the script if available - detail=f"Affiliation algorithm execution failed: {error_detail.get('message', 'Unknown error')}" + detail=f"Affiliation algorithm execution failed: {error_detail.get('message', 'Unknown error')}", ) # Extract the data payload, expected to be a list of dictionaries affiliation_results: List[Dict[str, Any]] = execution_result.get("data", []) # Validate the structure of the returned data if not isinstance(affiliation_results, list): - logger.error(f"Affiliation algorithm {algorithm_name} v{algorithm_version} returned unexpected data type: {type(affiliation_results)}. Expected List[Dict].") + logger.error( + f"Affiliation algorithm {algorithm_name} v{algorithm_version} returned unexpected data type: {type(affiliation_results)}. Expected List[Dict]." + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Affiliation algorithm returned data in an unexpected format." 
+ detail="Affiliation algorithm returned data in an unexpected format.", ) # Initialize counters for the response summary - processed_count = len(affiliation_results) # Total results returned by the script + processed_count = len(affiliation_results) # Total results returned by the script created_count = 0 updated_count = 0 # Handle the case where the algorithm runs successfully but finds no affiliations if not affiliation_results: - logger.info(f"Affiliation algorithm {algorithm_name} v{algorithm_version} returned 0 results for institution {request_body.institution_id}.") + logger.info( + f"Affiliation algorithm {algorithm_name} v{algorithm_version} returned 0 results for institution {request_body.institution_id}." + ) return AffiliationExecutionResponse( status="COMPLETED", message="Affiliation calculation completed. Algorithm returned 0 results.", processed_count=0, created_count=0, - updated_count=0 + updated_count=0, ) # 6. Store Results in Database affiliation_repo = RepositoryInstitutionAffiliationRepository(db) try: - successful_items_stored = 0 # Count items successfully processed and prepared for commit + successful_items_stored = ( + 0 # Count items successfully processed and prepared for commit + ) # Iterate through each result dictionary returned by the algorithm for result_item in affiliation_results: # Extract required fields, handling potential missing keys gracefully repo_id = result_item.get("repository_id") confidence = result_item.get("confidence_score") # Evidence might be optional or structured differently depending on the algorithm - evidence = result_item.get("evidence") # Can be None or any JSON-serializable structure + evidence = result_item.get( + "evidence" + ) # Can be None or any JSON-serializable structure # Basic validation of required fields if repo_id is None or confidence is None: - logger.warning(f"Skipping affiliation result due to missing 'repository_id' or 'confidence_score': {result_item}") - continue # Skip this potentially 
malformed result item + logger.warning( + f"Skipping affiliation result due to missing 'repository_id' or 'confidence_score': {result_item}" + ) + continue # Skip this potentially malformed result item # Ensure confidence score is a float try: - confidence_float = float(confidence) + confidence_float = float(confidence) except (ValueError, TypeError): - logger.warning(f"Skipping affiliation result due to invalid 'confidence_score' type ({type(confidence)}): {result_item}") - continue # Skip item if confidence score is not convertible to float + logger.warning( + f"Skipping affiliation result due to invalid 'confidence_score' type ({type(confidence)}): {result_item}" + ) + continue # Skip item if confidence score is not convertible to float # Attempt to create or update the affiliation record in the database try: # The repository method handles the logic of finding existing records or creating new ones _, created = affiliation_repo.create_or_update_affiliation( - repository_id=int(repo_id), # Ensure repo_id is integer - institution_id=request_body.institution_id, # The target institution for this run - algorithm_name=algorithm_name, # Store which algorithm generated this result - algorithm_version=algorithm_version, # Store the specific version - confidence_score=confidence_float, # The calculated score - evidence=evidence, # Supporting evidence (JSON compatible) - parameters_used=request_body.parameters # Store parameters used for this run for traceability + repository_id=int(repo_id), # Ensure repo_id is integer + institution_id=request_body.institution_id, # The target institution for this run + algorithm_name=algorithm_name, # Store which algorithm generated this result + algorithm_version=algorithm_version, # Store the specific version + confidence_score=confidence_float, # The calculated score + evidence=evidence, # Supporting evidence (JSON compatible) + parameters_used=request_body.parameters, # Store parameters used for this run for traceability ) # Update 
counters based on whether a new record was created or an existing one updated if created: created_count += 1 else: updated_count += 1 - successful_items_stored += 1 # Increment count of successfully processed items + successful_items_stored += ( + 1 # Increment count of successfully processed items + ) except Exception as item_db_err: - # Log errors occurring during the processing of a single item, but allow the loop to continue - # This prevents one bad result from stopping the processing of others. - logger.error(f"Database error storing single affiliation result for Repo ID {repo_id}, Inst ID {request_body.institution_id}: {item_db_err}", exc_info=True) - # Do not increment successful_items_stored for this item + # Log errors occurring during the processing of a single item, but allow the loop to continue + # This prevents one bad result from stopping the processing of others. + logger.error( + f"Database error storing single affiliation result for Repo ID {repo_id}, Inst ID {request_body.institution_id}: {item_db_err}", + exc_info=True, + ) + # Do not increment successful_items_stored for this item # Commit the transaction only if at least one item was successfully processed and staged for commit if successful_items_stored > 0: - db.commit() - logger.info(f"Successfully processed and stored {successful_items_stored} affiliation results for Inst {request_body.institution_id} (Created: {created_count}, Updated: {updated_count}).") + db.commit() + logger.info( + f"Successfully processed and stored {successful_items_stored} affiliation results for Inst {request_body.institution_id} (Created: {created_count}, Updated: {updated_count})." + ) else: - # If no items were successfully processed (e.g., all had validation errors or DB errors), log this. - # A rollback might be implicitly handled by the session context manager or error handling above, - # but explicitly rolling back ensures no partial state if individual errors occurred but weren't caught cleanly. 
- logger.warning(f"No affiliation results were successfully processed for database storage for Inst {request_body.institution_id}.") - db.rollback() - + # If no items were successfully processed (e.g., all had validation errors or DB errors), log this. + # A rollback might be implicitly handled by the session context manager or error handling above, + # but explicitly rolling back ensures no partial state if individual errors occurred but weren't caught cleanly. + logger.warning( + f"No affiliation results were successfully processed for database storage for Inst {request_body.institution_id}." + ) + db.rollback() # Return the final summary response return AffiliationExecutionResponse( status="COMPLETED", message=f"Affiliation calculation completed. Items returned by script: {processed_count}. Successfully stored/updated in DB: {successful_items_stored}. Created: {created_count}, Updated: {updated_count}.", - processed_count=processed_count, # Total items the script *returned* - created_count=created_count, # Count of new DB records - updated_count=updated_count # Count of updated DB records + processed_count=processed_count, # Total items the script *returned* + created_count=created_count, # Count of new DB records + updated_count=updated_count, # Count of updated DB records ) except Exception as db_err: # Catch broader errors that might occur outside the loop (e.g., during commit if not caught earlier) - logger.exception(f"Database error storing affiliation results batch for Inst {request_body.institution_id}, Algo {algorithm_name} v{algorithm_version}") + logger.exception( + f"Database error storing affiliation results batch for Inst {request_body.institution_id}, Algo {algorithm_name} v{algorithm_version}" + ) # Ensure any partial changes from the loop are rolled back db.rollback() raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to store affiliation results in database: {db_err}" - ) \ No newline at end of file + 
detail=f"Failed to store affiliation results in database: {db_err}", + ) diff --git a/backend/api/v1/endpoints/discovery_algorithms.py b/backend/api/v1/endpoints/discovery_algorithms.py index ec0953b..a8011de 100644 --- a/backend/api/v1/endpoints/discovery_algorithms.py +++ b/backend/api/v1/endpoints/discovery_algorithms.py @@ -7,28 +7,34 @@ """ import logging -from typing import List, Dict, Any +from typing import List, Dict -from fastapi import APIRouter, HTTPException, status, Depends, Body, Path as FastApiPath -from sqlalchemy.orm import Session +from fastapi import APIRouter, HTTPException, status, Body, Path as FastApiPath # Internal dependencies for utilities, configuration, schemas, and database access from backend.utils.recipe_utils import ( - discover_recipes, RecipeMetadata # Import RecipeMetadata class for type hinting + discover_recipes, + RecipeMetadata, # Import RecipeMetadata class for type hinting ) + # --- Define discovery directory constant relative to project root --- # This ensures the path is consistent regardless of where the application is run from from backend.utils.recipe_utils import PROJECT_ROOT_UTIL + CONTRIB_DISCOVERY_ALGOS_DIR = PROJECT_ROOT_UTIL / "contrib" / "discovery_algorithms" # --- End constant definition --- from backend.utils.recipe_executor import execute_recipe -from backend.config.settings import settings # Import application settings instance -from backend.schemas.requests import RecipeExecutionRequest # Use generic request schema for parameters -from backend.schemas.responses import RecipeMetadataResponse, DiscoveryExecutionResponse # Import specific response schemas +from backend.config.settings import settings # Import application settings instance +from backend.schemas.requests import ( + RecipeExecutionRequest, +) # Use generic request schema for parameters +from backend.schemas.responses import ( + RecipeMetadataResponse, + DiscoveryExecutionResponse, +) # Import specific response schemas # Import DB dependency; 
although not directly used in this endpoint's logic, # it might be required by future algorithms or for consistency. -from backend.api.deps import get_db_session # Logger setup for this module logger = logging.getLogger(__name__) @@ -36,6 +42,7 @@ # API Router instance for discovery algorithms router = APIRouter() + # --- Discovery Algorithm Discovery Endpoint --- @router.get( "/", @@ -58,41 +65,54 @@ def get_available_discovery_algorithms(): Raises: HTTPException: 500 Internal Server Error if the discovery process fails unexpectedly. """ - logger.info(f"Request received: Discover discovery algorithms from {CONTRIB_DISCOVERY_ALGOS_DIR}") + logger.info( + f"Request received: Discover discovery algorithms from {CONTRIB_DISCOVERY_ALGOS_DIR}" + ) # Check if the designated directory actually exists if not CONTRIB_DISCOVERY_ALGOS_DIR.is_dir(): - logger.warning(f"Discovery algorithms directory not found: {CONTRIB_DISCOVERY_ALGOS_DIR}") - return [] # Return empty list as per the spec if directory doesn't exist + logger.warning( + f"Discovery algorithms directory not found: {CONTRIB_DISCOVERY_ALGOS_DIR}" + ) + return [] # Return empty list as per the spec if directory doesn't exist try: # Use the generalized recipe discovery function, pointing it to the correct directory # and specifying the target function name expected within discovery scripts. 
discovered_algorithms = discover_recipes( recipes_base_dir=CONTRIB_DISCOVERY_ALGOS_DIR, - target_function_name="find_candidate_repos" # Function name specific to discovery algorithms + target_function_name="find_candidate_repos", # Function name specific to discovery algorithms ) # Convert internal metadata objects to the standard response format - response_data = [RecipeMetadataResponse(**algo.to_dict()) for algo in discovered_algorithms] + response_data = [ + RecipeMetadataResponse(**algo.to_dict()) for algo in discovered_algorithms + ] return response_data - except Exception as e: + except Exception: logger.exception("Error occurred during discovery algorithm discovery.") # Raise a generic server error if discovery fails raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to discover discovery algorithms." + detail="Failed to discover discovery algorithms.", ) + # --- Discovery Algorithm Execution Endpoint --- @router.post( "/execute/{algorithm_name}/{algorithm_version}", - response_model=DiscoveryExecutionResponse, # Expecting a list of strings (URLs) + response_model=DiscoveryExecutionResponse, # Expecting a list of strings (URLs) summary="Execute a Discovery Algorithm", - status_code=status.HTTP_200_OK # Use 200 OK as the operation aims for completion and result reporting + status_code=status.HTTP_200_OK, # Use 200 OK as the operation aims for completion and result reporting ) def execute_discovery_algorithm( - algorithm_name: str = FastApiPath(..., description="Name of the discovery algorithm to execute."), - algorithm_version: str = FastApiPath(..., description="Version of the discovery algorithm to execute."), - request_body: RecipeExecutionRequest = Body(...), # Contains algorithm-specific parameters + algorithm_name: str = FastApiPath( + ..., description="Name of the discovery algorithm to execute." + ), + algorithm_version: str = FastApiPath( + ..., description="Version of the discovery algorithm to execute." 
+ ), + request_body: RecipeExecutionRequest = Body( + ... + ), # Contains algorithm-specific parameters # db: Session = Depends(get_db_session) # DB session currently unused here, keep commented for potential future use ): """ @@ -130,14 +150,16 @@ def execute_discovery_algorithm( if script execution fails, or if the script returns data in an unexpected format. """ - logger.info(f"Request received: Execute discovery algorithm '{algorithm_name}' version '{algorithm_version}'") + logger.info( + f"Request received: Execute discovery algorithm '{algorithm_name}' version '{algorithm_version}'" + ) # 1. Find Algorithm Metadata (Rescan for execution context) # (Consider caching this discovery result in production) try: discovered_algorithms = discover_recipes( recipes_base_dir=CONTRIB_DISCOVERY_ALGOS_DIR, - target_function_name="find_candidate_repos" + target_function_name="find_candidate_repos", ) algo_meta: RecipeMetadata | None = None # Locate the metadata for the requested algorithm @@ -145,36 +167,42 @@ def execute_discovery_algorithm( if algo.name == algorithm_name and algo.version == algorithm_version: algo_meta = algo break - except Exception as discovery_err: - logger.exception("Error during discovery algorithm lookup for execution.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to look up discovery algorithm for execution." 
- ) + except Exception: + logger.exception("Error during discovery algorithm lookup for execution.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to look up discovery algorithm for execution.", + ) # Handle case where algorithm is not found if not algo_meta: - logger.warning(f"Discovery algorithm not found: {algorithm_name} v{algorithm_version}") + logger.warning( + f"Discovery algorithm not found: {algorithm_name} v{algorithm_version}" + ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Discovery algorithm '{algorithm_name}' version '{algorithm_version}' not found." + detail=f"Discovery algorithm '{algorithm_name}' version '{algorithm_version}' not found.", ) # 2. Parameter Validation against discovered metadata required_params_from_docstring = {p.name for p in algo_meta.parameters} provided_params_in_body = set(request_body.parameters.keys()) # Parameters that are handled internally by the executor or are optional for the *user* to provide via the body - internal_or_optional_params = {'db_conn_str', 'github_api_token'} + internal_or_optional_params = {"db_conn_str", "github_api_token"} # Determine parameters the user *must* supply within the request_body.parameters field - required_params_for_body = required_params_from_docstring - internal_or_optional_params + required_params_for_body = ( + required_params_from_docstring - internal_or_optional_params + ) missing_params_in_body = required_params_for_body - provided_params_in_body # Raise error if required parameters are missing from the request body if missing_params_in_body: - logger.warning(f"Missing required parameters in request body 'parameters' field for discovery algorithm {algorithm_name} v{algorithm_version}: {missing_params_in_body}") + logger.warning( + f"Missing required parameters in request body 'parameters' field for discovery algorithm {algorithm_name} v{algorithm_version}: {missing_params_in_body}" + ) raise HTTPException( 
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Missing required parameters in request body 'parameters' field: {', '.join(missing_params_in_body)}" + detail=f"Missing required parameters in request body 'parameters' field: {', '.join(missing_params_in_body)}", ) # Start building the parameters dictionary for the execution context @@ -182,22 +210,31 @@ def execute_discovery_algorithm( # 3. Get Secrets (GitHub Token) from application settings github_token = settings.GITHUB_API_TOKEN - secrets_dict: Dict[str, str] = {} # Dictionary to pass secrets securely to the executor + secrets_dict: Dict[ + str, str + ] = {} # Dictionary to pass secrets securely to the executor if not github_token: - # Check if the algorithm's function signature *requires* the token (i.e., not typed as Optional) - token_required = any(p.name == 'github_api_token' and not p.type.startswith('Optional') for p in algo_meta.parameters) - if token_required: - # If required by signature but not configured in settings, it's an operational error - logger.error("GITHUB_API_TOKEN is required by this algorithm's definition but not configured in application settings.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="GitHub API Token is required for this discovery algorithm but is not configured in the server environment." - ) - else: - # Token not configured, but the algorithm declares it as optional. Allow execution to proceed. - # The script itself should handle anonymous operation if applicable. - logger.warning("GITHUB_API_TOKEN not configured in settings. 
Discovery algorithm will run anonymously if it supports it.") + # Check if the algorithm's function signature *requires* the token (i.e., not typed as Optional) + token_required = any( + p.name == "github_api_token" and not p.type.startswith("Optional") + for p in algo_meta.parameters + ) + if token_required: + # If required by signature but not configured in settings, it's an operational error + logger.error( + "GITHUB_API_TOKEN is required by this algorithm's definition but not configured in application settings." + ) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="GitHub API Token is required for this discovery algorithm but is not configured in the server environment.", + ) + else: + # Token not configured, but the algorithm declares it as optional. Allow execution to proceed. + # The script itself should handle anonymous operation if applicable. + logger.warning( + "GITHUB_API_TOKEN not configured in settings. Discovery algorithm will run anonymously if it supports it." + ) else: # Token is available, add it to the secrets dictionary secrets_dict["github_api_token"] = github_token @@ -205,55 +242,69 @@ def execute_discovery_algorithm( # 4. Get DB Connection String (pass to executor for consistency, even if unused by this specific script) db_connection_string = settings.DATABASE_URL if not db_connection_string: - # Database connection is generally expected to be available - logger.error("DATABASE_URL is not configured in settings.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Database connection is not configured." - ) + # Database connection is generally expected to be available + logger.error("DATABASE_URL is not configured in settings.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Database connection is not configured.", + ) # 5. 
Execute Algorithm Script via the recipe executor - logger.info(f"Calling recipe executor for discovery algorithm: {algo_meta.file_path}") + logger.info( + f"Calling recipe executor for discovery algorithm: {algo_meta.file_path}" + ) try: # Pass user parameters, DB string, secrets, and function/type info to the executor execution_result = execute_recipe( recipe_path_relative=algo_meta.file_path, recipe_params=execution_params, db_conn_str=db_connection_string, - script_type='discovery', # Indicate type for executor context - function_name='find_candidate_repos', # Target function in the script - secrets=secrets_dict # Pass secrets dictionary securely + script_type="discovery", # Indicate type for executor context + function_name="find_candidate_repos", # Target function in the script + secrets=secrets_dict, # Pass secrets dictionary securely ) except Exception as exec_api_err: # Catch unexpected errors during the invocation of the executor - logger.exception(f"Unexpected error calling recipe executor for discovery algorithm {algorithm_name} v{algorithm_version}") + logger.exception( + f"Unexpected error calling recipe executor for discovery algorithm {algorithm_name} v{algorithm_version}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to invoke discovery algorithm execution: {exec_api_err}" + detail=f"Failed to invoke discovery algorithm execution: {exec_api_err}", ) # 6. Process Execution Results # Check if the execution result indicates failure if not execution_result or execution_result.get("success") is not True: - error_detail = execution_result.get("error", {"message": "Unknown execution error"}) - logger.error(f"Discovery algorithm execution failed for {algorithm_name} v{algorithm_version}. Error: {error_detail}") + error_detail = execution_result.get( + "error", {"message": "Unknown execution error"} + ) + logger.error( + f"Discovery algorithm execution failed for {algorithm_name} v{algorithm_version}. 
Error: {error_detail}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, # Report the error message from the script if available - detail=f"Discovery algorithm execution failed: {error_detail.get('message', 'Unknown error')}" + detail=f"Discovery algorithm execution failed: {error_detail.get('message', 'Unknown error')}", ) # Extract the result data, expected to be a list of strings (URLs) candidate_urls = execution_result.get("data", []) # Validate the format of the returned data - if not isinstance(candidate_urls, list) or not all(isinstance(url, str) for url in candidate_urls): - logger.error(f"Discovery algorithm {algorithm_name} v{algorithm_version} returned unexpected data type: {type(candidate_urls)}. Expected List[str]. Data sample: {str(candidate_urls)[:500]}") + if not isinstance(candidate_urls, list) or not all( + isinstance(url, str) for url in candidate_urls + ): + logger.error( + f"Discovery algorithm {algorithm_name} v{algorithm_version} returned unexpected data type: {type(candidate_urls)}. Expected List[str]. Data sample: {str(candidate_urls)[:500]}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Discovery algorithm returned data in an unexpected format (expected a list of URL strings)." + detail="Discovery algorithm returned data in an unexpected format (expected a list of URL strings).", ) # Log success and return the validated list of URLs - logger.info(f"Discovery algorithm {algorithm_name} v{algorithm_version} executed successfully, found {len(candidate_urls)} candidate URLs.") + logger.info( + f"Discovery algorithm {algorithm_name} v{algorithm_version} executed successfully, found {len(candidate_urls)} candidate URLs." 
+ ) # FastAPI automatically uses the DiscoveryExecutionResponse (which is essentially List[str]) - return candidate_urls \ No newline at end of file + return candidate_urls diff --git a/backend/api/v1/endpoints/history.py b/backend/api/v1/endpoints/history.py index b093284..5095c9a 100644 --- a/backend/api/v1/endpoints/history.py +++ b/backend/api/v1/endpoints/history.py @@ -7,13 +7,16 @@ """ import logging -from typing import Optional, List +from typing import Optional from datetime import datetime from fastapi import APIRouter, Depends, HTTPException, status, Query from sqlalchemy.orm import Session + # Import necessary SQLAlchemy functions for querying -from sqlalchemy import desc, or_, func # Import desc (ordering), or_ (conditional logic), func (database functions) +from sqlalchemy import ( + desc, +) # Import desc (ordering), or_ (conditional logic), func (database functions) # Internal dependencies for database access and data models/schemas from backend.api.deps import get_db_session @@ -26,15 +29,22 @@ # API Router instance for history endpoints router = APIRouter() + @router.get( "/context", response_model=IngestionHistoryContextResponse, - summary="Get Context on Last Relevant Ingestion" + summary="Get Context on Last Relevant Ingestion", ) def get_ingestion_history_context( - param_type: str = Query(..., description="Type of parameter to match (e.g., 'keyword', 'url_pattern'). Indicates which table and field to search."), - param_value: str = Query(..., description="Value of the parameter to match (e.g., a specific keyword or a URL pattern)."), - db: Session = Depends(get_db_session) # Database session dependency + param_type: str = Query( + ..., + description="Type of parameter to match (e.g., 'keyword', 'url_pattern'). 
Indicates which table and field to search.", + ), + param_value: str = Query( + ..., + description="Value of the parameter to match (e.g., a specific keyword or a URL pattern).", + ), + db: Session = Depends(get_db_session), # Database session dependency ): """ Finds the timestamp and type of the most recently *completed* ingestion event @@ -71,77 +81,115 @@ def get_ingestion_history_context( - 400 Bad Request: If an unsupported `param_type` is provided. - 500 Internal Server Error: If a database query or other processing fails. """ - logger.info(f"Fetching ingestion history context for type '{param_type}' value '{param_value}'") + logger.info( + f"Fetching ingestion history context for type '{param_type}' value '{param_value}'" + ) # Initialize variables to store the result last_ingested_at: Optional[datetime] = None - ingestion_type: Optional[str] = None # Describes the source of the timestamp (e.g., KEYWORD_SEARCH, DIRECT_URL) + ingestion_type: Optional[str] = ( + None # Describes the source of the timestamp (e.g., KEYWORD_SEARCH, DIRECT_URL) + ) try: # --- Keyword Search History --- - if param_type == 'keyword': + if param_type == "keyword": # Primary Query: Find the most recent *completed* keyword search session matching the value. # Uses case-insensitive matching (`ilike`) on the raw keywords string. 
primary_keyword_query = ( - db.query(KeywordSearchSession.completed_at) # Select only the completion timestamp - .filter(KeywordSearchSession.keywords_raw.ilike(f"%{param_value}%")) # Case-insensitive substring match - .filter(KeywordSearchSession.status == 'COMPLETED') # Must be completed - .order_by(desc(KeywordSearchSession.completed_at)) # Get the most recent first + db.query( + KeywordSearchSession.completed_at + ) # Select only the completion timestamp + .filter( + KeywordSearchSession.keywords_raw.ilike(f"%{param_value}%") + ) # Case-insensitive substring match + .filter(KeywordSearchSession.status == "COMPLETED") # Must be completed + .order_by( + desc(KeywordSearchSession.completed_at) + ) # Get the most recent first ) - completed_result = primary_keyword_query.first() # Fetch the first result (most recent) + completed_result = ( + primary_keyword_query.first() + ) # Fetch the first result (most recent) if completed_result and completed_result.completed_at: last_ingested_at = completed_result.completed_at - ingestion_type = 'KEYWORD_SEARCH' # Indicates a completed keyword search session + ingestion_type = ( + "KEYWORD_SEARCH" # Indicates a completed keyword search session + ) else: - # Fallback Query: If no completed session found, find the most recent session - # matching the keyword, regardless of status, and use its creation time. - # This indicates when such a search was last *initiated*. 
- fallback_keyword_query = ( - db.query(KeywordSearchSession.created_at) # Select creation timestamp - .filter(KeywordSearchSession.keywords_raw.ilike(f"%{param_value}%")) # Match keyword - .order_by(desc(KeywordSearchSession.created_at)) # Most recent created first - ) - fallback_result = fallback_keyword_query.first() - if fallback_result and fallback_result.created_at: - last_ingested_at = fallback_result.created_at - # Use a distinct type to indicate it wasn't necessarily completed - ingestion_type = 'KEYWORD_SEARCH_INITIATED' + # Fallback Query: If no completed session found, find the most recent session + # matching the keyword, regardless of status, and use its creation time. + # This indicates when such a search was last *initiated*. + fallback_keyword_query = ( + db.query( + KeywordSearchSession.created_at + ) # Select creation timestamp + .filter( + KeywordSearchSession.keywords_raw.ilike(f"%{param_value}%") + ) # Match keyword + .order_by( + desc(KeywordSearchSession.created_at) + ) # Most recent created first + ) + fallback_result = fallback_keyword_query.first() + if fallback_result and fallback_result.created_at: + last_ingested_at = fallback_result.created_at + # Use a distinct type to indicate it wasn't necessarily completed + ingestion_type = "KEYWORD_SEARCH_INITIATED" # --- URL Pattern Search History --- - elif param_type == 'url_pattern': + elif param_type == "url_pattern": # Primary Query: Find the most recent *completed* root DiscoveryChain # of type DIRECT_URL where the 'url' parameter matches the pattern. # Uses JSONB operators (`->>`) for text extraction and `ilike`. 
primary_url_query = ( - db.query(DiscoveryChain.completed_at) # Select completion timestamp - .filter(DiscoveryChain.parent_chain_id.is_(None)) # Must be a root chain (no parent) - .filter(DiscoveryChain.discovery_type == 'DIRECT_URL') # Must be a direct URL ingestion + db.query(DiscoveryChain.completed_at) # Select completion timestamp + .filter( + DiscoveryChain.parent_chain_id.is_(None) + ) # Must be a root chain (no parent) + .filter( + DiscoveryChain.discovery_type == "DIRECT_URL" + ) # Must be a direct URL ingestion # Access the 'url' key within the JSONB 'parameters' field, cast to text, and perform case-insensitive match - .filter(DiscoveryChain.parameters['url'].astext.ilike(f"%{param_value}%")) - .filter(DiscoveryChain.status == 'COMPLETED') # Must be completed - .order_by(desc(DiscoveryChain.completed_at)) # Most recent completed first + .filter( + DiscoveryChain.parameters["url"].astext.ilike(f"%{param_value}%") + ) + .filter(DiscoveryChain.status == "COMPLETED") # Must be completed + .order_by( + desc(DiscoveryChain.completed_at) + ) # Most recent completed first ) completed_result = primary_url_query.first() if completed_result and completed_result.completed_at: last_ingested_at = completed_result.completed_at - ingestion_type = 'DIRECT_URL' # Indicates a completed direct URL ingestion + ingestion_type = ( + "DIRECT_URL" # Indicates a completed direct URL ingestion + ) else: # Fallback Query: If no completed chain found, find the most recent root DIRECT_URL chain # matching the pattern, regardless of status, and use its creation time. 
fallback_url_query = ( - db.query(DiscoveryChain.created_at) # Select creation timestamp - .filter(DiscoveryChain.parent_chain_id.is_(None)) # Root chain - .filter(DiscoveryChain.discovery_type == 'DIRECT_URL') # Direct URL type - .filter(DiscoveryChain.parameters['url'].astext.ilike(f"%{param_value}%")) # Match pattern - .order_by(desc(DiscoveryChain.created_at)) # Most recent created first - ) + db.query(DiscoveryChain.created_at) # Select creation timestamp + .filter(DiscoveryChain.parent_chain_id.is_(None)) # Root chain + .filter( + DiscoveryChain.discovery_type == "DIRECT_URL" + ) # Direct URL type + .filter( + DiscoveryChain.parameters["url"].astext.ilike( + f"%{param_value}%" + ) + ) # Match pattern + .order_by( + desc(DiscoveryChain.created_at) + ) # Most recent created first + ) fallback_result = fallback_url_query.first() if fallback_result and fallback_result.created_at: - last_ingested_at = fallback_result.created_at - # Use distinct type for initiated but not necessarily completed - ingestion_type = 'DIRECT_URL_INITIATED' + last_ingested_at = fallback_result.created_at + # Use distinct type for initiated but not necessarily completed + ingestion_type = "DIRECT_URL_INITIATED" # --- Unsupported Parameter Type --- else: @@ -149,21 +197,23 @@ def get_ingestion_history_context( logger.warning(f"Unsupported param_type requested: '{param_type}'") raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Unsupported param_type: '{param_type}'. Valid types are 'keyword' or 'url_pattern'." + detail=f"Unsupported param_type: '{param_type}'. 
Valid types are 'keyword' or 'url_pattern'.", ) # Construct and return the response object return IngestionHistoryContextResponse( param_type=param_type, param_value=param_value, - last_ingested_at=last_ingested_at, # Will be None if no match found - ingestion_type=ingestion_type # Will be None if no match found + last_ingested_at=last_ingested_at, # Will be None if no match found + ingestion_type=ingestion_type, # Will be None if no match found ) - except Exception as e: + except Exception: # Catch any unexpected database or processing errors - logger.exception(f"Error fetching ingestion history context for type '{param_type}' value '{param_value}'") + logger.exception( + f"Error fetching ingestion history context for type '{param_type}' value '{param_value}'" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="An unexpected error occurred while retrieving ingestion history context." - ) \ No newline at end of file + detail="An unexpected error occurred while retrieving ingestion history context.", + ) diff --git a/backend/api/v1/endpoints/ingestion.py b/backend/api/v1/endpoints/ingestion.py index 54abb44..6ffa728 100644 --- a/backend/api/v1/endpoints/ingestion.py +++ b/backend/api/v1/endpoints/ingestion.py @@ -8,7 +8,7 @@ import logging from datetime import datetime, timezone -from typing import Optional + # BackgroundTasks is no longer used as keyword ingestion is handled by Celery from fastapi import APIRouter, Depends, HTTPException, status from sqlalchemy.orm import Session @@ -18,12 +18,17 @@ # Internal dependencies for database session, models, schemas, services, and task definitions from backend.api.deps import get_db_session + # Import SessionLocal directly for creating isolated sessions in error handling from backend.data.database import SessionLocal from backend.schemas.requests import IngestionRequest, KeywordIngestionRequest -from backend.schemas.responses import DiscoveryChainSummary, KeywordSearchSessionResponse 
+from backend.schemas.responses import ( + DiscoveryChainSummary, + KeywordSearchSessionResponse, +) from backend.data.repositories import KeywordSearchSessionRepository from backend.data.models import KeywordSearchSession + # Import IngestionService, primarily used by the synchronous URL endpoint from backend.services.ingestion_service import IngestionService # Ensure task module is implicitly loaded if not explicitly imported elsewhere, @@ -45,11 +50,11 @@ # 202 Accepted is appropriate as the request is accepted, and while the main # URL processing might be synchronous, subsequent background tasks (like DOI processing) # might still occur. It signals initiation rather than immediate completion of *all* work. - status_code=status.HTTP_202_ACCEPTED + status_code=status.HTTP_202_ACCEPTED, ) def ingest_by_url( request: IngestionRequest, - db: Session = Depends(get_db_session) # Database session dependency + db: Session = Depends(get_db_session), # Database session dependency ): """ Accepts a GitHub repository URL and triggers the core ingestion process *synchronously* @@ -89,7 +94,10 @@ def ingest_by_url( # Check if the service method indicated failure to even start (e.g., invalid URL format) if root_chain is None: # This indicates an early failure within the service, likely validation. - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid repository URL format or unable to initiate ingestion.") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Invalid repository URL format or unable to initiate ingestion.", + ) # Re-fetch the state from the database. While less critical in a purely sync flow, # it's good practice if the service *could* have modified it commit/flush happened. @@ -97,39 +105,64 @@ def ingest_by_url( # Explicitly check the final status recorded in the database after the service call returns. if root_chain.status == "FAILED": - logger.error(f"Synchronous part of ingestion failed for URL {url_str}. 
Root chain ID: {root_chain.id}. Check service logs for details.") - # Return the summary of the failed chain. The HTTP status remains 202 (Accepted), - # but the response body indicates the failure outcome. - return root_chain + logger.error( + f"Synchronous part of ingestion failed for URL {url_str}. Root chain ID: {root_chain.id}. Check service logs for details." + ) + # Return the summary of the failed chain. The HTTP status remains 202 (Accepted), + # but the response body indicates the failure outcome. + return root_chain elif root_chain.status != "COMPLETED": - # Log if the synchronous part finished with an unexpected status (e.g., PENDING if workflow changed) - logger.warning(f"Synchronous part of ingestion for URL {url_str} finished with unexpected status '{root_chain.status}'. Chain ID: {root_chain.id}.") - return root_chain # Return the chain summary with its current status + # Log if the synchronous part finished with an unexpected status (e.g., PENDING if workflow changed) + logger.warning( + f"Synchronous part of ingestion for URL {url_str} finished with unexpected status '{root_chain.status}'. Chain ID: {root_chain.id}." 
+ ) + return root_chain # Return the chain summary with its current status # Log successful completion of the synchronous part - logger.info(f"Synchronous part of ingestion completed successfully for {url_str}, root chain ID: {root_chain.id}") + logger.info( + f"Synchronous part of ingestion completed successfully for {url_str}, root chain ID: {root_chain.id}" + ) # Return the summary of the successfully completed root chain return root_chain except ValueError as ve: - # Catch specific validation errors raised potentially by Pydantic or service logic - logger.error(f"Value error during ingestion request for {request.url}: {ve}", exc_info=True) - # Ensure transaction rollback on error - try: db.rollback() - except Exception: logger.error("Failed to rollback transaction after ValueError.") - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) + # Catch specific validation errors raised potentially by Pydantic or service logic + logger.error( + f"Value error during ingestion request for {request.url}: {ve}", + exc_info=True, + ) + # Ensure transaction rollback on error + try: + db.rollback() + except Exception: + logger.error("Failed to rollback transaction after ValueError.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) except RuntimeError as re: # Catch runtime errors that might indicate deeper issues in the service - logger.error(f"Runtime error during ingestion for {request.url}: {re}", exc_info=True) - try: db.rollback() - except Exception: logger.error("Failed to rollback transaction after RuntimeError.") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Ingestion process encountered a runtime error for URL {request.url}. 
Check server logs.") - except Exception as e: + logger.error( + f"Runtime error during ingestion for {request.url}: {re}", exc_info=True + ) + try: + db.rollback() + except Exception: + logger.error("Failed to rollback transaction after RuntimeError.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Ingestion process encountered a runtime error for URL {request.url}. Check server logs.", + ) + except Exception: # Catch any other unexpected exceptions during the endpoint execution - logger.exception(f"Unexpected error during /ingest/url endpoint for {request.url}") - try: db.rollback() - except Exception: logger.error("Failed to rollback transaction after unexpected exception.") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="An unexpected error occurred during URL ingestion.") + logger.exception( + f"Unexpected error during /ingest/url endpoint for {request.url}" + ) + try: + db.rollback() + except Exception: + logger.error("Failed to rollback transaction after unexpected exception.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="An unexpected error occurred during URL ingestion.", + ) # --- Endpoint for Keyword Ingestion (Asynchronous via Celery) --- @@ -137,11 +170,11 @@ def ingest_by_url( "/keywords", response_model=KeywordSearchSessionResponse, summary="Trigger discovery and ingestion by Keywords (Async via Celery)", - status_code=status.HTTP_202_ACCEPTED # 202 Accepted indicates the task is queued, not completed + status_code=status.HTTP_202_ACCEPTED, # 202 Accepted indicates the task is queued, not completed ) def ingest_by_keywords( request: KeywordIngestionRequest, - db: Session = Depends(get_db_session) # Database session dependency + db: Session = Depends(get_db_session), # Database session dependency ): """ Accepts keywords, initiates a keyword search session, and queues an asynchronous @@ -183,19 +216,20 @@ def ingest_by_keywords( # Basic 
validation if not request.keywords: raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail="Keywords cannot be empty." + status_code=status.HTTP_400_BAD_REQUEST, detail="Keywords cannot be empty." ) session_repo = KeywordSearchSessionRepository(db) - search_session: KeywordSearchSession | None = None # Initialize for potential use in error handling + search_session: KeywordSearchSession | None = ( + None # Initialize for potential use in error handling + ) try: # 1. Create the initial KeywordSearchSession record in the database search_session = KeywordSearchSession( keywords_raw=request.keywords, - status="QUEUED", # Set initial status - started_at=None, # Task will set this when it starts + status="QUEUED", # Set initial status + started_at=None, # Task will set this when it starts # completed_at=None, # Task will set this on completion/failure # created_at is handled by the model default timestamp ) @@ -210,7 +244,9 @@ def ingest_by_keywords( session_id = search_session.id # --- End Commit --- - logger.info(f"Created KeywordSearchSession {session_id} with status QUEUED for keywords: '{request.keywords}'.") + logger.info( + f"Created KeywordSearchSession {session_id} with status QUEUED for keywords: '{request.keywords}'." + ) # 2. Enqueue the Celery task to perform the discovery and ingestion try: @@ -218,54 +254,71 @@ def ingest_by_keywords( # Pass necessary arguments (session ID, keywords) for the task function. # Note: The task name format is typically 'module.path.to.function'. celery_app.send_task( - 'backend.tasks.discovery_tasks.keyword_discovery_celery_task', - args=[session_id, request.keywords] + "backend.tasks.discovery_tasks.keyword_discovery_celery_task", + args=[session_id, request.keywords], # Optionally add kwargs={}, countdown=, eta=, etc. 
) - logger.info(f"Successfully enqueued Celery task 'keyword_discovery_celery_task' for session {session_id}.") - except Exception as celery_err: - # Handle potential errors during communication with the Celery broker (e.g., connection refused) - logger.exception(f"Failed to send task to Celery for session {session_id}. Attempting to mark session as FAILED.") - - # --- Best-effort attempt to mark the session as FAILED --- - # Use a new, independent database session for this update to avoid interfering - # with the main request's session state, especially in error scenarios. - try: - # Create a new session scope using SessionLocal factory - with SessionLocal() as temp_db: - # Retrieve the session record within the new session - failed_session = temp_db.get(KeywordSearchSession, session_id) - if failed_session: - # Update status and completion time - failed_session.status = "FAILED" - failed_session.completed_at = datetime.now(timezone.utc) - # Add and commit within the temporary session - temp_db.add(failed_session) - temp_db.commit() - logger.warning(f"Successfully marked session {session_id} as FAILED in DB due to Celery enqueue error.") - else: - # This case should be rare if commit succeeded earlier, but log if it happens - logger.error(f"Could not find session {session_id} in temporary session to mark as FAILED after Celery error.") - except Exception as fail_update_err: - # Log errors during the failure update attempt itself - logger.error(f"Error occurred while trying to mark session {session_id} as FAILED via temporary session: {fail_update_err}") - # Note: We don't rollback temp_db here as context manager handles it. - - # Raise an HTTP exception to signal the failure to the client - raise HTTPException(status_code=500, detail="Failed to enqueue the background discovery task. The process could not be started.") + logger.info( + f"Successfully enqueued Celery task 'keyword_discovery_celery_task' for session {session_id}." 
+ ) + except Exception: + # Handle potential errors during communication with the Celery broker (e.g., connection refused) + logger.exception( + f"Failed to send task to Celery for session {session_id}. Attempting to mark session as FAILED." + ) + + # --- Best-effort attempt to mark the session as FAILED --- + # Use a new, independent database session for this update to avoid interfering + # with the main request's session state, especially in error scenarios. + try: + # Create a new session scope using SessionLocal factory + with SessionLocal() as temp_db: + # Retrieve the session record within the new session + failed_session = temp_db.get(KeywordSearchSession, session_id) + if failed_session: + # Update status and completion time + failed_session.status = "FAILED" + failed_session.completed_at = datetime.now(timezone.utc) + # Add and commit within the temporary session + temp_db.add(failed_session) + temp_db.commit() + logger.warning( + f"Successfully marked session {session_id} as FAILED in DB due to Celery enqueue error." + ) + else: + # This case should be rare if commit succeeded earlier, but log if it happens + logger.error( + f"Could not find session {session_id} in temporary session to mark as FAILED after Celery error." + ) + except Exception as fail_update_err: + # Log errors during the failure update attempt itself + logger.error( + f"Error occurred while trying to mark session {session_id} as FAILED via temporary session: {fail_update_err}" + ) + # Note: We don't rollback temp_db here as context manager handles it. + + # Raise an HTTP exception to signal the failure to the client + raise HTTPException( + status_code=500, + detail="Failed to enqueue the background discovery task. The process could not be started.", + ) # 3. Return the initially created session details (status is 'QUEUED') # The client now knows the task is accepted and has the ID to track it. 
return search_session - except Exception as e: + except Exception: # Catch errors during the initial database interaction (session creation/commit) - logger.exception(f"Error creating initial KeywordSearchSession or committing for keywords: '{request.keywords}'") + logger.exception( + f"Error creating initial KeywordSearchSession or committing for keywords: '{request.keywords}'" + ) # Rollback the main transaction if session creation failed before commit try: db.rollback() except Exception as rb_err: - logger.error(f"Error during rollback after failing to create session: {rb_err}") + logger.error( + f"Error during rollback after failing to create session: {rb_err}" + ) # Signal internal server error raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, @@ -277,11 +330,11 @@ def ingest_by_keywords( @router.get( "/keywords/status/{session_id}", response_model=KeywordSearchSessionResponse, - summary="Get status of a Keyword Search Session" + summary="Get status of a Keyword Search Session", ) def get_keyword_session_status( session_id: int, - db: Session = Depends(get_db_session) # Database session dependency + db: Session = Depends(get_db_session), # Database session dependency ): """ Retrieves the current status and details of a specific KeywordSearchSession @@ -319,4 +372,6 @@ def get_keyword_session_status( logger.debug(f"Returning status '{search_session.status}' for session {session_id}") # Return the full session details matching the response model return search_session -# --- END --- \ No newline at end of file + + +# --- END --- diff --git a/backend/api/v1/endpoints/retrieval.py b/backend/api/v1/endpoints/retrieval.py index 80a69ee..c1ae290 100644 --- a/backend/api/v1/endpoints/retrieval.py +++ b/backend/api/v1/endpoints/retrieval.py @@ -7,25 +7,43 @@ """ import logging -from sqlalchemy.orm import Session, joinedload, selectinload -from sqlalchemy import func, select +from sqlalchemy.orm import Session, joinedload from fastapi import 
APIRouter, Depends, HTTPException, status from typing import List, Optional # Internal dependencies for database access, schemas, repositories, and models from backend.api.deps import get_db_session from backend.schemas.responses import ( - RepositoryResponse, OwnerResponse, ContributorResponse, WorkResponse, - PersonResponse, InstitutionResponse, - TopicSummary, SubfieldSummary, FieldSummary, DomainSummary, PrimaryTopicResponse + RepositoryResponse, + OwnerResponse, + ContributorResponse, + WorkResponse, + PersonResponse, + InstitutionResponse, + TopicSummary, + SubfieldSummary, + FieldSummary, + DomainSummary, + PrimaryTopicResponse, ) from backend.data.repositories import ( - RepositoryRepository, OwnerRepository, ContributorRepository, WorkRepository, - PersonRepository, InstitutionRepository + RepositoryRepository, + OwnerRepository, + ContributorRepository, + WorkRepository, + PersonRepository, + InstitutionRepository, ) from backend.data.models import ( - Work, WorkTopic, Topic, Subfield, Field, Domain, - Person, Institution, Contributor, Repository + Work, + WorkTopic, + Topic, + Subfield, + Field, + Person, + Institution, + Contributor, + Repository, ) # Logger setup for this module @@ -39,6 +57,7 @@ # These functions provide a standard way to fetch an entity by ID # or raise an HTTP 404 Not Found error if it doesn't exist. 
+ def _get_repository_or_404(db: Session, repo_id: int) -> Repository: """Fetches a Repository by ID or raises HTTP 404.""" repo_repo = RepositoryRepository(db=db) @@ -51,6 +70,7 @@ def _get_repository_or_404(db: Session, repo_id: int) -> Repository: ) return repository + def _get_work_or_404(db: Session, work_id: int) -> Work: """Fetches a Work by ID or raises HTTP 404.""" # Note: This specific helper might not be used by the main get_work below @@ -65,6 +85,7 @@ def _get_work_or_404(db: Session, work_id: int) -> Work: ) return work + def _get_institution_or_404(db: Session, institution_id: int) -> Institution: """Fetches an Institution by ID or raises HTTP 404.""" inst_repo = InstitutionRepository(db=db) @@ -77,6 +98,7 @@ def _get_institution_or_404(db: Session, institution_id: int) -> Institution: ) return institution + def _get_person_or_404(db: Session, person_id: int) -> Person: """Fetches a Person by ID or raises HTTP 404.""" person_repo = PersonRepository(db=db) @@ -89,6 +111,7 @@ def _get_person_or_404(db: Session, person_id: int) -> Person: ) return person + def _get_contributor_or_404(db: Session, contributor_id: int) -> Contributor: """Fetches a Contributor by ID or raises HTTP 404.""" contrib_repo = ContributorRepository(db=db) @@ -100,20 +123,20 @@ def _get_contributor_or_404(db: Session, contributor_id: int) -> Contributor: detail=f"Contributor with id {contributor_id} not found", ) return contributor + + # --- End Helper Functions --- # --- Entity Retrieval Endpoints --- + @router.get( "/repositories/{id}", - response_model=RepositoryResponse, # Use the detailed response model - summary="Get Repository by ID" + response_model=RepositoryResponse, # Use the detailed response model + summary="Get Repository by ID", ) -def get_repository( - id: int, - db: Session = Depends(get_db_session) -): +def get_repository(id: int, db: Session = Depends(get_db_session)): """ Retrieves detailed information for a specific repository using its internal database ID. 
@@ -134,15 +157,9 @@ def get_repository( # FastAPI automatically maps the SQLAlchemy model to the Pydantic response model return repository -@router.get( - "/owners/{id}", - response_model=OwnerResponse, - summary="Get Owner by ID" -) -def get_owner( - id: int, - db: Session = Depends(get_db_session) -): + +@router.get("/owners/{id}", response_model=OwnerResponse, summary="Get Owner by ID") +def get_owner(id: int, db: Session = Depends(get_db_session)): """ Retrieves detailed information for a specific repository owner (User or Organization) using its internal database ID. @@ -168,15 +185,13 @@ def get_owner( ) return owner + @router.get( "/contributors/{id}", response_model=ContributorResponse, - summary="Get Contributor by ID" + summary="Get Contributor by ID", ) -def get_contributor( - id: int, - db: Session = Depends(get_db_session) -): +def get_contributor(id: int, db: Session = Depends(get_db_session)): """ Retrieves detailed information for a specific contributor (GitHub user linked to a repository) using its internal database ID. @@ -196,16 +211,16 @@ def get_contributor( contributor = _get_contributor_or_404(db, id) return contributor + # --- FINAL REVISED /works/{id} ENDPOINT --- @router.get( "/works/{id}", - response_model=WorkResponse, # Use the detailed Work response model - summary="Get Work by ID" + response_model=WorkResponse, # Use the detailed Work response model + summary="Get Work by ID", ) def get_work( - id: int, - db: Session = Depends(get_db_session) -) -> WorkResponse: # Explicitly type hint the return as the Pydantic model for clarity + id: int, db: Session = Depends(get_db_session) +) -> WorkResponse: # Explicitly type hint the return as the Pydantic model for clarity """ Retrieves detailed information for a specific scholarly work by its internal database ID. 
This includes the work's metadata, its primary topic (with its @@ -240,7 +255,9 @@ def get_work( # Step 2: Initialize structures to hold topic information primary_topic_response: Optional[PrimaryTopicResponse] = None topic_summaries: List[TopicSummary] = [] - processed_topic_ids: set[int] = set() # Track processed topics to avoid duplicates if needed + processed_topic_ids: set[int] = ( + set() + ) # Track processed topics to avoid duplicates if needed try: # Step 2a: Query for all WorkTopic associations for this work. @@ -261,11 +278,11 @@ def get_work( # Filter for the specific work ID .filter(WorkTopic.work_id == id) ) - work_topic_associations = work_topic_query.all() # Execute the query + work_topic_associations = work_topic_query.all() # Execute the query # Step 3: Process the fetched associations to build the response structure for wt in work_topic_associations: - topic = wt.topic # The actual Topic object + topic = wt.topic # The actual Topic object # Ensure the topic exists and hasn't been processed already if topic and topic.id not in processed_topic_ids: processed_topic_ids.add(topic.id) @@ -287,28 +304,37 @@ def get_work( # Build summaries for each level of the hierarchy if they exist if topic.subfield: # Validate each level against its Pydantic summary model - subfield_summary = SubfieldSummary.model_validate(topic.subfield) + subfield_summary = SubfieldSummary.model_validate( + topic.subfield + ) if topic.subfield.field: - field_summary = FieldSummary.model_validate(topic.subfield.field) + field_summary = FieldSummary.model_validate( + topic.subfield.field + ) if topic.subfield.field.domain: - domain_summary = DomainSummary.model_validate(topic.subfield.field.domain) + domain_summary = DomainSummary.model_validate( + topic.subfield.field.domain + ) # Construct the PrimaryTopicResponse using the validated topic summary # and the hierarchy summaries. Include the score from the association. 
primary_topic_response = PrimaryTopicResponse( - id=topic_summary.id, # From validated summary - openalex_id=topic_summary.openalex_id, # From validated summary - display_name=topic_summary.display_name, # From validated summary - created_at=topic_summary.created_at, # From validated summary - updated_at=topic_summary.updated_at, # From validated summary - score=wt.score, # Score from the WorkTopic link - subfield=subfield_summary, # Populated if exists - field=field_summary, # Populated if exists - domain=domain_summary # Populated if exists + id=topic_summary.id, # From validated summary + openalex_id=topic_summary.openalex_id, # From validated summary + display_name=topic_summary.display_name, # From validated summary + created_at=topic_summary.created_at, # From validated summary + updated_at=topic_summary.updated_at, # From validated summary + score=wt.score, # Score from the WorkTopic link + subfield=subfield_summary, # Populated if exists + field=field_summary, # Populated if exists + domain=domain_summary, # Populated if exists ) except Exception as e: # Log errors during processing/validation of a single topic, but continue - logger.error(f"Error processing/validating topic {getattr(topic, 'id', 'N/A')} for work {id}: {e}", exc_info=True) + logger.error( + f"Error processing/validating topic {getattr(topic, 'id', 'N/A')} for work {id}: {e}", + exc_info=True, + ) # Decide whether to raise, skip, or partially include data based on requirements except Exception as e: @@ -316,7 +342,7 @@ def get_work( logger.exception(f"Database error fetching topic data for work {id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve associated topic data for the work." + detail="Failed to retrieve associated topic data for the work.", ) # Step 4: Manually construct the dictionary for the final WorkResponse. 
@@ -335,11 +361,12 @@ def get_work( "host_venue_display_name": work.host_venue_display_name, "openalex_url": work.openalex_url, # Add the processed topic data - "primary_topic": primary_topic_response, # Populated if a primary topic was found - "topics": topic_summaries if topic_summaries else None # List of all topic summaries, or None if empty + "primary_topic": primary_topic_response, # Populated if a primary topic was found + "topics": topic_summaries + if topic_summaries + else None, # List of all topic summaries, or None if empty } - # Step 5: Validate the constructed dictionary against the WorkResponse Pydantic model. # This ensures the final structure matches the defined schema before returning. try: @@ -351,20 +378,15 @@ def get_work( logger.exception(f"Error validating final WorkResponse data for work {id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to format the final work data into the expected response structure." + detail="Failed to format the final work data into the expected response structure.", ) + + # --- END FINAL REVISED ENDPOINT --- -@router.get( - "/persons/{id}", - response_model=PersonResponse, - summary="Get Person by ID" -) -def get_person( - id: int, - db: Session = Depends(get_db_session) -): +@router.get("/persons/{id}", response_model=PersonResponse, summary="Get Person by ID") +def get_person(id: int, db: Session = Depends(get_db_session)): """ Retrieves detailed information for a specific person (author/researcher) using their internal database ID. 
@@ -384,15 +406,13 @@ def get_person( person = _get_person_or_404(db, id) return person + @router.get( "/institutions/{id}", response_model=InstitutionResponse, - summary="Get Institution by ID" + summary="Get Institution by ID", ) -def get_institution( - id: int, - db: Session = Depends(get_db_session) -): +def get_institution(id: int, db: Session = Depends(get_db_session)): """ Retrieves detailed information for a specific institution using its internal database ID. @@ -410,4 +430,4 @@ def get_institution( logger.debug(f"Retrieving institution with id: {id}") # Use the helper to fetch or raise 404 institution = _get_institution_or_404(db, id) - return institution \ No newline at end of file + return institution diff --git a/backend/api/v1/endpoints/search.py b/backend/api/v1/endpoints/search.py index 66ea641..9bfdf2c 100644 --- a/backend/api/v1/endpoints/search.py +++ b/backend/api/v1/endpoints/search.py @@ -11,14 +11,18 @@ from fastapi import APIRouter, Depends, Query, HTTPException, status from sqlalchemy.orm import Session + # Import necessary SQLAlchemy functions for searching and ordering -from sqlalchemy import or_, func +from sqlalchemy import or_ # Internal dependencies for database access, models, and response schemas from backend.api.deps import get_db_session from backend.data.models import Repository, Work, Person, Institution from backend.schemas.responses import ( - RepositorySummary, WorkSummary, PersonSummary, InstitutionSummary # Use summary schemas for search results + RepositorySummary, + WorkSummary, + PersonSummary, + InstitutionSummary, # Use summary schemas for search results ) # Logger setup for this module @@ -30,19 +34,32 @@ # Default pagination parameters for search results DEFAULT_SEARCH_SKIP = 0 DEFAULT_SEARCH_LIMIT = 100 -MAX_SEARCH_LIMIT = 200 # Define a maximum limit for safety/performance +MAX_SEARCH_LIMIT = 200 # Define a maximum limit for safety/performance @router.get( "/repositories", - response_model=List[RepositorySummary], 
# Return a list of summaries - summary="Search Repositories" + response_model=List[RepositorySummary], # Return a list of summaries + summary="Search Repositories", ) def search_repositories( - q: str = Query(..., min_length=1, description="Search query string used to match repository name or description."), - skip: int = Query(DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip (for pagination)."), - limit: int = Query(DEFAULT_SEARCH_LIMIT, ge=1, le=MAX_SEARCH_LIMIT, description="Maximum number of results to return."), - db: Session = Depends(get_db_session) # Database session dependency + q: str = Query( + ..., + min_length=1, + description="Search query string used to match repository name or description.", + ), + skip: int = Query( + DEFAULT_SEARCH_SKIP, + ge=0, + description="Number of results to skip (for pagination).", + ), + limit: int = Query( + DEFAULT_SEARCH_LIMIT, + ge=1, + le=MAX_SEARCH_LIMIT, + description="Maximum number of results to return.", + ), + db: Session = Depends(get_db_session), # Database session dependency ): """ Searches for repositories where the query string `q` appears in the @@ -62,7 +79,9 @@ def search_repositories( Raises: HTTPException: 500 Internal Server Error if the search query fails. 
""" - logger.info(f"Searching repositories with query: '{q}', skip: {skip}, limit: {limit}") + logger.info( + f"Searching repositories with query: '{q}', skip: {skip}, limit: {limit}" + ) # Prepare the search term for use with ILIKE (case-insensitive LIKE) search_term = f"%{q}%" @@ -73,39 +92,54 @@ def search_repositories( .filter( # Use 'or_' to match the search term in either field or_( - Repository.full_name.ilike(search_term), # Case-insensitive match on full name - Repository.description.ilike(search_term) # Case-insensitive match on description + Repository.full_name.ilike( + search_term + ), # Case-insensitive match on full name + Repository.description.ilike( + search_term + ), # Case-insensitive match on description ) ) # Order results: repositories with more stars appear first. # `nullslast()` ensures repositories without star counts appear at the end. .order_by(Repository.stargazers_count.desc().nullslast()) - .offset(skip) # Apply pagination offset - .limit(limit) # Apply pagination limit + .offset(skip) # Apply pagination offset + .limit(limit) # Apply pagination limit ) # Execute the query and get results results = query.all() # FastAPI handles mapping the results to the response model (List[RepositorySummary]) return results - except Exception as e: + except Exception: # Log unexpected errors during the search logger.exception(f"Error during repository search for query '{q}'") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="An error occurred while searching for repositories." 
+ detail="An error occurred while searching for repositories.", ) @router.get( "/works", - response_model=List[WorkSummary], # Return a list of summaries - summary="Search Works" + response_model=List[WorkSummary], # Return a list of summaries + summary="Search Works", ) def search_works( - q: str = Query(..., min_length=1, description="Search query string used to match work title or DOI."), - skip: int = Query(DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip."), - limit: int = Query(DEFAULT_SEARCH_LIMIT, ge=1, le=MAX_SEARCH_LIMIT, description="Maximum number of results to return."), - db: Session = Depends(get_db_session) # Database session dependency + q: str = Query( + ..., + min_length=1, + description="Search query string used to match work title or DOI.", + ), + skip: int = Query( + DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip." + ), + limit: int = Query( + DEFAULT_SEARCH_LIMIT, + ge=1, + le=MAX_SEARCH_LIMIT, + description="Maximum number of results to return.", + ), + db: Session = Depends(get_db_session), # Database session dependency ): """ Searches for scholarly works where the query string `q` appears in the @@ -126,7 +160,7 @@ def search_works( HTTPException: 500 Internal Server Error if the search query fails. """ logger.info(f"Searching works with query: '{q}', skip: {skip}, limit: {limit}") - search_term = f"%{q}%" # Prepare term for ILIKE + search_term = f"%{q}%" # Prepare term for ILIKE try: query = ( @@ -134,8 +168,8 @@ def search_works( .filter( # Match the search term in either title or DOI or_( - Work.title.ilike(search_term), # Case-insensitive match on title - Work.doi.ilike(search_term) # Case-insensitive match on DOI + Work.title.ilike(search_term), # Case-insensitive match on title + Work.doi.ilike(search_term), # Case-insensitive match on DOI ) ) # Order results: more cited works appear first. 
@@ -145,24 +179,35 @@ def search_works( ) results = query.all() return results - except Exception as e: + except Exception: logger.exception(f"Error during work search for query '{q}'") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="An error occurred while searching for works." + detail="An error occurred while searching for works.", ) @router.get( "/people", - response_model=List[PersonSummary], # Return a list of summaries - summary="Search People" + response_model=List[PersonSummary], # Return a list of summaries + summary="Search People", ) def search_people( - q: str = Query(..., min_length=1, description="Search query string used to match person display name or ORCID."), - skip: int = Query(DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip."), - limit: int = Query(DEFAULT_SEARCH_LIMIT, ge=1, le=MAX_SEARCH_LIMIT, description="Maximum number of results to return."), - db: Session = Depends(get_db_session) # Database session dependency + q: str = Query( + ..., + min_length=1, + description="Search query string used to match person display name or ORCID.", + ), + skip: int = Query( + DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip." + ), + limit: int = Query( + DEFAULT_SEARCH_LIMIT, + ge=1, + le=MAX_SEARCH_LIMIT, + description="Maximum number of results to return.", + ), + db: Session = Depends(get_db_session), # Database session dependency ): """ Searches for people (authors/researchers) where the query string `q` @@ -184,7 +229,7 @@ def search_people( HTTPException: 500 Internal Server Error if the search query fails. 
""" logger.info(f"Searching people with query: '{q}', skip: {skip}, limit: {limit}") - search_term = f"%{q}%" # Prepare term for ILIKE + search_term = f"%{q}%" # Prepare term for ILIKE try: query = ( @@ -192,8 +237,10 @@ def search_people( .filter( # Match the search term in either display name or ORCID or_( - Person.display_name.ilike(search_term), # Case-insensitive match on display name - Person.orcid.ilike(search_term) # Case-insensitive match on ORCID + Person.display_name.ilike( + search_term + ), # Case-insensitive match on display name + Person.orcid.ilike(search_term), # Case-insensitive match on ORCID # Future enhancement: Add search on Person.display_name_alternatives (JSONB array) # This would require database-specific JSON functions, e.g., for PostgreSQL: # func.lower(Person.display_name_alternatives::text).contains(q.lower()) @@ -206,24 +253,36 @@ def search_people( ) results = query.all() return results - except Exception as e: + except Exception: logger.exception(f"Error during people search for query '{q}'") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="An error occurred while searching for people." 
+ detail="An error occurred while searching for people.", ) + @router.get( "/institutions", - response_model=List[InstitutionSummary], # Return a list of summaries - summary="Search Institutions" + response_model=List[InstitutionSummary], # Return a list of summaries + summary="Search Institutions", ) def search_institutions( - q: str = Query(..., min_length=1, description="Search query string used to match institution display name or ROR ID."), + q: str = Query( + ..., + min_length=1, + description="Search query string used to match institution display name or ROR ID.", + ), # Corrected default skip value for consistency - skip: int = Query(DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip."), - limit: int = Query(DEFAULT_SEARCH_LIMIT, ge=1, le=MAX_SEARCH_LIMIT, description="Maximum number of results to return."), - db: Session = Depends(get_db_session) # Database session dependency + skip: int = Query( + DEFAULT_SEARCH_SKIP, ge=0, description="Number of results to skip." + ), + limit: int = Query( + DEFAULT_SEARCH_LIMIT, + ge=1, + le=MAX_SEARCH_LIMIT, + description="Maximum number of results to return.", + ), + db: Session = Depends(get_db_session), # Database session dependency ): """ Searches for institutions where the query string `q` appears in the @@ -243,8 +302,10 @@ def search_institutions( Raises: HTTPException: 500 Internal Server Error if the search query fails. 
""" - logger.info(f"Searching institutions with query: '{q}', skip: {skip}, limit: {limit}") - search_term = f"%{q}%" # Prepare term for ILIKE + logger.info( + f"Searching institutions with query: '{q}', skip: {skip}, limit: {limit}" + ) + search_term = f"%{q}%" # Prepare term for ILIKE try: query = ( @@ -252,8 +313,12 @@ def search_institutions( .filter( # Match the search term in either display name or ROR ID or_( - Institution.display_name.ilike(search_term), # Case-insensitive match on display name - Institution.ror.ilike(search_term) # Case-insensitive match on ROR ID + Institution.display_name.ilike( + search_term + ), # Case-insensitive match on display name + Institution.ror.ilike( + search_term + ), # Case-insensitive match on ROR ID ) ) # Order results alphabetically by name @@ -263,9 +328,9 @@ def search_institutions( ) results = query.all() return results - except Exception as e: + except Exception: logger.exception(f"Error during institution search for query '{q}'") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="An error occurred while searching for institutions." 
- ) \ No newline at end of file + detail="An error occurred while searching for institutions.", + ) diff --git a/backend/api/v1/endpoints/shared_recipes.py b/backend/api/v1/endpoints/shared_recipes.py index e9f8e25..dda36a0 100644 --- a/backend/api/v1/endpoints/shared_recipes.py +++ b/backend/api/v1/endpoints/shared_recipes.py @@ -7,19 +7,29 @@ """ import logging -from pathlib import Path -from typing import List, Dict, Any +from typing import List -from fastapi import APIRouter, HTTPException, status, Depends, Body, Path as FastApiPath -from sqlalchemy.orm import Session +from fastapi import APIRouter, HTTPException, status, Body, Path as FastApiPath # Internal dependencies for recipe discovery, execution, configuration, schemas, and DB access -from backend.utils.recipe_utils import discover_recipes, CONTRIB_QUERIES_DIR, RecipeMetadata, RecipeParameterMetadata # Import utility and constants -from backend.utils.recipe_executor import execute_recipe # Utility to run scripts safely -from backend.config.settings import settings # Access to application settings (e.g., DB URL) -from backend.schemas.requests import RecipeExecutionRequest # Standard request body for execution -from backend.schemas.responses import RecipeMetadataResponse, RecipeExecutionResponse # Standard response models -from backend.api.deps import get_db_session # Database session dependency (though not directly used here) +from backend.utils.recipe_utils import ( + discover_recipes, + CONTRIB_QUERIES_DIR, + RecipeMetadata, +) # Import utility and constants +from backend.utils.recipe_executor import ( + execute_recipe, +) # Utility to run scripts safely +from backend.config.settings import ( + settings, +) # Access to application settings (e.g., DB URL) +from backend.schemas.requests import ( + RecipeExecutionRequest, +) # Standard request body for execution +from backend.schemas.responses import ( + RecipeMetadataResponse, + RecipeExecutionResponse, +) # Standard response models # Logger setup for 
this module logger = logging.getLogger(__name__) @@ -27,6 +37,7 @@ # API Router instance for shared recipe endpoints router = APIRouter() + # --- Recipe Discovery Endpoint --- @router.get( "/", @@ -50,36 +61,47 @@ def get_available_analysis_recipes(): Raises: HTTPException: 500 Internal Server Error if scanning or parsing fails unexpectedly. """ - logger.info(f"Request received: Discover analysis recipes from {CONTRIB_QUERIES_DIR}") + logger.info( + f"Request received: Discover analysis recipes from {CONTRIB_QUERIES_DIR}" + ) try: # Use the shared discovery utility, targeting the 'queries' directory # and the specific function name expected in analysis recipes. discovered_recipes = discover_recipes( recipes_base_dir=CONTRIB_QUERIES_DIR, - target_function_name="run_analysis" # Target function for analysis scripts + target_function_name="run_analysis", # Target function for analysis scripts ) # Convert internal metadata objects to the standardized response model - response_data = [RecipeMetadataResponse(**recipe.to_dict()) for recipe in discovered_recipes] + response_data = [ + RecipeMetadataResponse(**recipe.to_dict()) for recipe in discovered_recipes + ] return response_data - except Exception as e: + except Exception: # Log and raise generic error if discovery fails logger.exception("Error occurred during analysis recipe discovery.") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to discover analysis recipes." 
+ detail="Failed to discover analysis recipes.", ) + # --- Recipe Execution Endpoint --- @router.post( "/execute/{recipe_name}/{recipe_version}", - response_model=RecipeExecutionResponse, # Expected response structure after execution + response_model=RecipeExecutionResponse, # Expected response structure after execution summary="Execute an Analysis Recipe", - status_code=status.HTTP_200_OK # Use 200 OK for successful execution initiation and result return + status_code=status.HTTP_200_OK, # Use 200 OK for successful execution initiation and result return ) def execute_analysis_recipe( - recipe_name: str = FastApiPath(..., description="Name of the recipe script (without .py or version)."), - recipe_version: str = FastApiPath(..., description="Version identifier of the recipe (e.g., 'v1')."), - request_body: RecipeExecutionRequest = Body(...), # Contains recipe-specific parameters + recipe_name: str = FastApiPath( + ..., description="Name of the recipe script (without .py or version)." + ), + recipe_version: str = FastApiPath( + ..., description="Version identifier of the recipe (e.g., 'v1')." + ), + request_body: RecipeExecutionRequest = Body( + ... + ), # Contains recipe-specific parameters # Note: get_db_session is not directly used here as the connection string is passed # to the executor, but it ensures DB is accessible if needed. # db: Session = Depends(get_db_session) @@ -120,14 +142,15 @@ def execute_analysis_recipe( - 500 Internal Server Error: If database connection is missing, script execution fails, or an unexpected error occurs during the process. """ - logger.info(f"Request received: Execute recipe '{recipe_name}' version '{recipe_version}' with params: {list(request_body.parameters.keys())}") + logger.info( + f"Request received: Execute recipe '{recipe_name}' version '{recipe_version}' with params: {list(request_body.parameters.keys())}" + ) # 1. 
Find Recipe Metadata (Rescan directory for execution context) # (Consider caching this discovery result in production) try: discovered_recipes = discover_recipes( - recipes_base_dir=CONTRIB_QUERIES_DIR, - target_function_name="run_analysis" + recipes_base_dir=CONTRIB_QUERIES_DIR, target_function_name="run_analysis" ) recipe_meta: RecipeMetadata | None = None # Find the specific recipe matching the request path parameters @@ -135,20 +158,20 @@ def execute_analysis_recipe( if recipe.name == recipe_name and recipe.version == recipe_version: recipe_meta = recipe break - except Exception as discovery_err: - # Handle errors during the lookup process itself - logger.exception("Error during recipe lookup for execution.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to look up recipe for execution." - ) + except Exception: + # Handle errors during the lookup process itself + logger.exception("Error during recipe lookup for execution.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to look up recipe for execution.", + ) # Handle case where the recipe is not found if not recipe_meta: logger.warning(f"Recipe not found: {recipe_name} v{recipe_version}") raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Recipe '{recipe_name}' version '{recipe_version}' not found." + detail=f"Recipe '{recipe_name}' version '{recipe_version}' not found.", ) # --- 2. FIXED Parameter validation against discovered metadata --- @@ -156,52 +179,61 @@ def execute_analysis_recipe( missing_required_params = set() # Check if the user is providing a list of repository IDs, which might affect # whether a single 'repository_id' parameter is still required. 
- providing_multiple_repos = 'repository_ids' in provided_params + providing_multiple_repos = "repository_ids" in provided_params # Iterate through parameters defined in the recipe's docstring metadata for param_meta in recipe_meta.parameters: # Ignore parameters managed internally by the execution environment - if param_meta.name == 'db_conn_str': + if param_meta.name == "db_conn_str": continue # If the user provides 'repository_ids', skip checking requirement for 'repository_id' # This allows recipes to accept either a single ID or a list. - if param_meta.name == 'repository_id' and providing_multiple_repos: - logger.debug(f"Ignoring requirement check for '{param_meta.name}' because 'repository_ids' was provided.") + if param_meta.name == "repository_id" and providing_multiple_repos: + logger.debug( + f"Ignoring requirement check for '{param_meta.name}' because 'repository_ids' was provided." + ) continue # Also skip requirement check for 'repository_ids' itself if provided (handled above) - if param_meta.name == 'repository_ids' and providing_multiple_repos: - continue + if param_meta.name == "repository_ids" and providing_multiple_repos: + continue # Determine if the parameter is optional based on its type hint in the docstring metadata. # Checks for standard 'Optional[...]' syntax or '... | None'. - is_optional = param_meta.type.startswith('Optional[') or ' | None' in param_meta.type or 'Optional' in param_meta.type + is_optional = ( + param_meta.type.startswith("Optional[") + or " | None" in param_meta.type + or "Optional" in param_meta.type + ) # If the parameter is NOT optional AND it was NOT provided in the request body, mark it as missing. if not is_optional and param_meta.name not in provided_params: missing_required_params.add(param_meta.name) - logger.debug(f"Parameter '{param_meta.name}' (Type: {param_meta.type}) identified as required but missing. 
Optional: {is_optional}, Provided: {provided_params}") - + logger.debug( + f"Parameter '{param_meta.name}' (Type: {param_meta.type}) identified as required but missing. Optional: {is_optional}, Provided: {provided_params}" + ) # If any required parameters were found missing, raise a validation error. if missing_required_params: - missing_params_str = ', '.join(sorted(list(missing_required_params))) - logger.warning(f"Missing required parameters for recipe {recipe_name} v{recipe_version}: {missing_params_str}") + missing_params_str = ", ".join(sorted(list(missing_required_params))) + logger.warning( + f"Missing required parameters for recipe {recipe_name} v{recipe_version}: {missing_params_str}" + ) raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Missing required parameters: {missing_params_str}" + detail=f"Missing required parameters: {missing_params_str}", ) # --- END FIXED VALIDATION --- # 3. Get Database Connection String from application settings db_connection_string = settings.DATABASE_URL if not db_connection_string: - # DB connection is essential for recipes interacting with data - logger.error("DATABASE_URL is not configured in settings.") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Database connection string is not configured." - ) + # DB connection is essential for recipes interacting with data + logger.error("DATABASE_URL is not configured in settings.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Database connection string is not configured.", + ) # 4. Prepare final parameters for the executor # Start with the parameters provided by the user @@ -210,25 +242,31 @@ def execute_analysis_recipe( # unless the recipe script *explicitly* defines 'db_conn_str' as one of its function arguments. # 5. 
Execute the recipe script via the executor utility - logger.info(f"Calling recipe executor for: {recipe_meta.file_path} with params keys: {list(final_params.keys())}") + logger.info( + f"Calling recipe executor for: {recipe_meta.file_path} with params keys: {list(final_params.keys())}" + ) try: # The executor handles running the script's 'run_analysis' function execution_result = execute_recipe( - recipe_path_relative=recipe_meta.file_path, # Path to the script - recipe_params=final_params, # User-provided parameters - db_conn_str=db_connection_string, # DB connection string for the script - script_type='analysis', # Type indicator for the executor - function_name='run_analysis' # Target function within the script + recipe_path_relative=recipe_meta.file_path, # Path to the script + recipe_params=final_params, # User-provided parameters + db_conn_str=db_connection_string, # DB connection string for the script + script_type="analysis", # Type indicator for the executor + function_name="run_analysis", # Target function within the script # secrets={} # Pass secrets dictionary if analysis recipes need them ) # Log the outcome reported by the executor - logger.info(f"Recipe executor finished for: {recipe_meta.file_path}. Reported success: {execution_result.get('success')}") + logger.info( + f"Recipe executor finished for: {recipe_meta.file_path}. 
Reported success: {execution_result.get('success')}" + ) # Return the entire result object from the executor (contains success, data/error) return execution_result except Exception as exec_err: # Catch unexpected errors during the API endpoint's attempt to call the executor - logger.exception(f"Unexpected error in API endpoint while trying to execute recipe {recipe_name} v{recipe_version}: {exec_err}") + logger.exception( + f"Unexpected error in API endpoint while trying to execute recipe {recipe_name} v{recipe_version}: {exec_err}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"An unexpected server error occurred during recipe execution: {exec_err}" - ) \ No newline at end of file + detail=f"An unexpected server error occurred during recipe execution: {exec_err}", + ) diff --git a/backend/api/v1/endpoints/surfacing.py b/backend/api/v1/endpoints/surfacing.py index 5ebdd7a..33eecda 100644 --- a/backend/api/v1/endpoints/surfacing.py +++ b/backend/api/v1/endpoints/surfacing.py @@ -15,23 +15,39 @@ # Internal dependencies for database access, schemas, services, repositories, and models from backend.api.deps import get_db_session + # Import required Pydantic response schemas for surfacing results from backend.schemas.responses import ( - WorkSummary, RepositorySummary, RepositoryCitationCountResponse, - PersonSummary, InstitutionSummary, + WorkSummary, + RepositorySummary, + RepositoryCitationCountResponse, + PersonSummary, + InstitutionSummary, AffiliationResultResponse, - ContributorResponse, # Used for shared contributor details - SoftwareDependencyResponse # Used for repository dependencies + ContributorResponse, # Used for shared contributor details + SoftwareDependencyResponse, # Used for repository dependencies ) + # Service layer containing the business logic for surfacing relationships from backend.services.surfacing_service import SurfacingService + # Repositories are primarily used by helper functions for 404 checks from 
backend.data.repositories import ( - RepositoryRepository, WorkRepository, InstitutionRepository, PersonRepository, - ContributorRepository # Needed for _get_contributor_or_404 + RepositoryRepository, + WorkRepository, + InstitutionRepository, + PersonRepository, + ContributorRepository, # Needed for _get_contributor_or_404 ) + # Models needed for helper function type hints and potentially by the service -from backend.data.models import Repository, Work, Institution, Person, Contributor, SoftwareDependency # Ensure Contributor is imported +from backend.data.models import ( + Repository, + Work, + Institution, + Person, + Contributor, +) # Ensure Contributor is imported # Logger setup for this module logger = logging.getLogger(__name__) @@ -43,18 +59,22 @@ # These ensure that the primary entity ID provided in the path exists before # attempting to find related entities. + def _get_repository_or_404(db: Session, repo_id: int) -> Repository: """Fetches a Repository by ID or raises HTTP 404.""" repo_repo = RepositoryRepository(db=db) repository = repo_repo.get(id=repo_id) if not repository: - logger.warning(f"Repository with id {repo_id} not found for surfacing operation.") + logger.warning( + f"Repository with id {repo_id} not found for surfacing operation." 
+ ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"Repository with id {repo_id} not found", ) return repository + def _get_work_or_404(db: Session, work_id: int) -> Work: """Fetches a Work by ID or raises HTTP 404.""" work_repo = WorkRepository(db=db) @@ -67,18 +87,22 @@ def _get_work_or_404(db: Session, work_id: int) -> Work: ) return work + def _get_institution_or_404(db: Session, institution_id: int) -> Institution: """Fetches an Institution by ID or raises HTTP 404.""" inst_repo = InstitutionRepository(db=db) institution = inst_repo.get(id=institution_id) if not institution: - logger.warning(f"Institution with id {institution_id} not found for surfacing operation.") + logger.warning( + f"Institution with id {institution_id} not found for surfacing operation." + ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"Institution with id {institution_id} not found", ) return institution + def _get_person_or_404(db: Session, person_id: int) -> Person: """Fetches a Person by ID or raises HTTP 404.""" person_repo = PersonRepository(db=db) @@ -91,31 +115,37 @@ def _get_person_or_404(db: Session, person_id: int) -> Person: ) return person + def _get_contributor_or_404(db: Session, contributor_id: int) -> Contributor: """Fetches a Contributor by ID or raises HTTP 404.""" contrib_repo = ContributorRepository(db=db) contributor = contrib_repo.get(id=contributor_id) if not contributor: - logger.warning(f"Contributor with id {contributor_id} not found for surfacing operation.") + logger.warning( + f"Contributor with id {contributor_id} not found for surfacing operation." 
+ ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"Contributor with id {contributor_id} not found", ) return contributor + + # --- End Helper Functions --- # --- Surfacing Endpoints --- + @router.get( "/repositories/{repo_id}/works", - response_model=List[WorkSummary], # Returns summaries of related works - summary="Get Works associated with a Repository" + response_model=List[WorkSummary], # Returns summaries of related works + summary="Get Works associated with a Repository", ) def get_repository_works( repo_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject SurfacingService dependency + service: SurfacingService = Depends(), # Inject SurfacingService dependency ): """ Retrieves a list of scholarly works (summaries) that have been linked @@ -134,7 +164,7 @@ def get_repository_works( 500 if an error occurs during retrieval. """ logger.info(f"Request received: Get works for repository ID {repo_id}") - _get_repository_or_404(db, repo_id) # Ensure repository exists + _get_repository_or_404(db, repo_id) # Ensure repository exists try: # Delegate the core logic to the surfacing service works = service.get_works_for_repository(db=db, repository_id=repo_id) @@ -144,18 +174,19 @@ def get_repository_works( logger.exception(f"Error retrieving works for repository {repo_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve associated works." 
+ detail="Failed to retrieve associated works.", ) + @router.get( "/works/{work_id}/repositories", - response_model=List[RepositorySummary], # Returns summaries of related repositories - summary="Get Repositories associated with a Work" + response_model=List[RepositorySummary], # Returns summaries of related repositories + summary="Get Repositories associated with a Work", ) def get_work_repositories( work_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of repositories (summaries) that have been linked @@ -174,7 +205,7 @@ def get_work_repositories( 500 if an error occurs during retrieval. """ logger.info(f"Request received: Get repositories for work ID {work_id}") - _get_work_or_404(db, work_id) # Ensure work exists + _get_work_or_404(db, work_id) # Ensure work exists try: repositories = service.get_repositories_for_work(db=db, work_id=work_id) return repositories @@ -182,18 +213,19 @@ def get_work_repositories( logger.exception(f"Error retrieving repositories for work {work_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve associated repositories." + detail="Failed to retrieve associated repositories.", ) + @router.get( "/works/{work_id}/citations", - response_model=List[WorkSummary], # Returns summaries of citing works - summary="Get Works citing a specific Work" + response_model=List[WorkSummary], # Returns summaries of citing works + summary="Get Works citing a specific Work", ) def get_work_citations( work_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of scholarly works (summaries) that cite the specified work ID. @@ -212,7 +244,7 @@ def get_work_citations( 500 if an error occurs during retrieval. 
""" logger.info(f"Request received: Get citations for work ID {work_id}") - _get_work_or_404(db, work_id) # Ensure the cited work exists + _get_work_or_404(db, work_id) # Ensure the cited work exists try: # Service method likely looks up citing works based on stored relationships citing_works = service.get_works_cited_by(db=db, work_id=work_id) @@ -221,18 +253,19 @@ def get_work_citations( logger.exception(f"Error retrieving citations for work {work_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve citing works." + detail="Failed to retrieve citing works.", ) + @router.get( "/works/{work_id}/references", - response_model=List[WorkSummary], # Returns summaries of referenced works - summary="Get Works referenced by a specific Work" + response_model=List[WorkSummary], # Returns summaries of referenced works + summary="Get Works referenced by a specific Work", ) def get_work_references( work_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of scholarly works (summaries) that are referenced by @@ -251,18 +284,21 @@ def get_work_references( 500 if an error occurs during retrieval. 
""" logger.info(f"Request received: Get references for work ID {work_id}") - _get_work_or_404(db, work_id) # Ensure the citing work exists + _get_work_or_404(db, work_id) # Ensure the citing work exists try: # Service method likely looks up referenced works based on stored relationships - referenced_works = service.get_works_citing(db=db, work_id=work_id) # Note: Service method name might seem reversed but implies "works that this work cites" + referenced_works = service.get_works_citing( + db=db, work_id=work_id + ) # Note: Service method name might seem reversed but implies "works that this work cites" return referenced_works except Exception as e: logger.exception(f"Error retrieving references for work {work_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve referenced works." + detail="Failed to retrieve referenced works.", ) + @router.get( "/repositories/{repo_id}/citation_count", response_model=RepositoryCitationCountResponse, @@ -271,12 +307,12 @@ def get_work_references( "Retrieves citation metrics for a repository: " "1. `aggregated_citation_count`: Sum of 'cited_by_count' from OpenAlex for all works linked to the repository. " "2. `discovered_citation_count`: Count of unique citing works found within the MOSS database itself that cite any work linked to the repository." - ) + ), ) def get_repository_citation_counts( repo_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Calculates and retrieves citation counts for a given repository. This includes @@ -297,27 +333,30 @@ def get_repository_citation_counts( 500 if an error occurs during calculation. 
""" logger.info(f"Request received: Get citation counts for repository ID {repo_id}") - _get_repository_or_404(db, repo_id) # Ensure repository exists + _get_repository_or_404(db, repo_id) # Ensure repository exists try: - citation_counts_dict = service.get_repository_aggregated_citations(db=db, repository_id=repo_id) + citation_counts_dict = service.get_repository_aggregated_citations( + db=db, repository_id=repo_id + ) # The service returns a dictionary suitable for the response model return citation_counts_dict except Exception as e: logger.exception(f"Error calculating citation counts for repo {repo_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to calculate citation counts." + detail="Failed to calculate citation counts.", ) + @router.get( "/repositories/{repo_id}/shared_contributors", - response_model=List[RepositorySummary], # Returns summaries of related repositories - summary="Get Repositories sharing Contributors" + response_model=List[RepositorySummary], # Returns summaries of related repositories + summary="Get Repositories sharing Contributors", ) def get_shared_contributors_repositories( repo_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of other repositories (summaries) that share at least one @@ -336,29 +375,36 @@ def get_shared_contributors_repositories( HTTPException: 404 if the repository ID is not found. 500 if an error occurs during retrieval. 
""" - logger.info(f"Request received: Get repositories sharing contributors with repo ID {repo_id}") - _get_repository_or_404(db, repo_id) # Ensure source repository exists + logger.info( + f"Request received: Get repositories sharing contributors with repo ID {repo_id}" + ) + _get_repository_or_404(db, repo_id) # Ensure source repository exists try: - shared_repos = service.get_repositories_sharing_contributors(db=db, repository_id=repo_id) + shared_repos = service.get_repositories_sharing_contributors( + db=db, repository_id=repo_id + ) return shared_repos except Exception as e: - logger.exception(f"Error finding repositories sharing contributors with repo {repo_id}: {e}") + logger.exception( + f"Error finding repositories sharing contributors with repo {repo_id}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find repositories sharing contributors." + detail="Failed to find repositories sharing contributors.", ) + @router.get( "/repositories/{repo_id_1}/shared_contributors_with/{repo_id_2}", - response_model=List[ContributorResponse], # Returns detailed contributor info + response_model=List[ContributorResponse], # Returns detailed contributor info summary="Get Specific Contributors Shared Between Two Repositories", - tags=["Surfacing", "Contributors"] # Add relevant tags for API documentation + tags=["Surfacing", "Contributors"], # Add relevant tags for API documentation ) def get_shared_contributor_details_between_repos( repo_id_1: int, repo_id_2: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves the detailed information for contributors who are associated with @@ -378,10 +424,12 @@ def get_shared_contributor_details_between_repos( 404 if either repository ID is not found. 500 if an error occurs during retrieval. 
""" - logger.info(f"Request received: Get shared contributor details between repo {repo_id_1} and {repo_id_2}") + logger.info( + f"Request received: Get shared contributor details between repo {repo_id_1} and {repo_id_2}" + ) # Check for self-comparison if repo_id_1 == repo_id_2: - raise HTTPException( + raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot compare a repository with itself for shared contributors.", ) @@ -396,21 +444,24 @@ def get_shared_contributor_details_between_repos( # FastAPI maps the Contributor models to ContributorResponse return shared_contributors except Exception as e: - logger.exception(f"Error getting shared contributors between {repo_id_1} and {repo_id_2}: {e}") + logger.exception( + f"Error getting shared contributors between {repo_id_1} and {repo_id_2}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve shared contributor details." + detail="Failed to retrieve shared contributor details.", ) + @router.get( "/repositories/{repo_id}/shared_works", - response_model=List[RepositorySummary], # Returns summaries of related repositories - summary="Get Repositories sharing linked Works" + response_model=List[RepositorySummary], # Returns summaries of related repositories + summary="Get Repositories sharing linked Works", ) def get_shared_works_repositories( repo_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of other repositories (summaries) that share at least one @@ -429,27 +480,34 @@ def get_shared_works_repositories( HTTPException: 404 if the repository ID is not found. 500 if an error occurs during retrieval. 
""" - logger.info(f"Request received: Get repositories sharing works with repo ID {repo_id}") - _get_repository_or_404(db, repo_id) # Ensure source repository exists + logger.info( + f"Request received: Get repositories sharing works with repo ID {repo_id}" + ) + _get_repository_or_404(db, repo_id) # Ensure source repository exists try: - shared_repos = service.get_repositories_sharing_works(db=db, repository_id=repo_id) + shared_repos = service.get_repositories_sharing_works( + db=db, repository_id=repo_id + ) return shared_repos except Exception as e: - logger.exception(f"Error finding repositories sharing works with repo {repo_id}: {e}") + logger.exception( + f"Error finding repositories sharing works with repo {repo_id}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find repositories sharing linked works." + detail="Failed to find repositories sharing linked works.", ) + @router.get( "/works/{work_id}/citing_people", - response_model=List[PersonSummary], # Returns summaries of people - summary="Get People who authored works citing this Work" + response_model=List[PersonSummary], # Returns summaries of people + summary="Get People who authored works citing this Work", ) def get_work_citing_people( work_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of people (summaries) who are authors of scholarly works @@ -468,7 +526,7 @@ def get_work_citing_people( 500 if an error occurs during retrieval. 
""" logger.info(f"Request received: Get people citing work ID {work_id}") - _get_work_or_404(db, work_id) # Ensure the cited work exists + _get_work_or_404(db, work_id) # Ensure the cited work exists try: people = service.get_people_citing_work(db=db, work_id=work_id) return people @@ -476,18 +534,19 @@ def get_work_citing_people( logger.exception(f"Error finding people citing work {work_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find people associated with citing works." + detail="Failed to find people associated with citing works.", ) + @router.get( "/works/{work_id}/citing_institutions", - response_model=List[InstitutionSummary], # Returns summaries of institutions - summary="Get Institutions affiliated with authors citing this Work" + response_model=List[InstitutionSummary], # Returns summaries of institutions + summary="Get Institutions affiliated with authors citing this Work", ) def get_work_citing_institutions( work_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of institutions (summaries) that are affiliated with authors @@ -506,7 +565,7 @@ def get_work_citing_institutions( 500 if an error occurs during retrieval. """ logger.info(f"Request received: Get institutions citing work ID {work_id}") - _get_work_or_404(db, work_id) # Ensure the cited work exists + _get_work_or_404(db, work_id) # Ensure the cited work exists try: institutions = service.get_institutions_citing_work(db=db, work_id=work_id) return institutions @@ -514,18 +573,19 @@ def get_work_citing_institutions( logger.exception(f"Error finding institutions citing work {work_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find institutions associated with citing works." 
+ detail="Failed to find institutions associated with citing works.", ) + @router.get( "/institutions/{institution_id}/repositories", - response_model=List[RepositorySummary], # Returns summaries of repositories - summary="Get Repositories linked to an Institution" + response_model=List[RepositorySummary], # Returns summaries of repositories + summary="Get Repositories linked to an Institution", ) def get_institution_repositories( institution_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of repositories (summaries) that have been linked to the @@ -543,27 +603,34 @@ def get_institution_repositories( HTTPException: 404 if the institution ID is not found. 500 if an error occurs during retrieval. """ - logger.info(f"Request received: Get repositories for institution ID {institution_id}") - _get_institution_or_404(db, institution_id) # Ensure institution exists + logger.info( + f"Request received: Get repositories for institution ID {institution_id}" + ) + _get_institution_or_404(db, institution_id) # Ensure institution exists try: - repositories = service.get_repositories_by_institution(db=db, institution_id=institution_id) + repositories = service.get_repositories_by_institution( + db=db, institution_id=institution_id + ) return repositories except Exception as e: - logger.exception(f"Error finding repositories for institution {institution_id}: {e}") + logger.exception( + f"Error finding repositories for institution {institution_id}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find linked repositories for the institution." 
+ detail="Failed to find linked repositories for the institution.", ) + @router.get( "/persons/{person_id}/works", - response_model=List[WorkSummary], # Returns summaries of works - summary="Get Works associated with a Person" + response_model=List[WorkSummary], # Returns summaries of works + summary="Get Works associated with a Person", ) def get_person_works( person_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of scholarly works (summaries) authored by or associated @@ -582,7 +649,7 @@ def get_person_works( 500 if an error occurs during retrieval. """ logger.info(f"Request received: Get works for person ID {person_id}") - _get_person_or_404(db, person_id) # Ensure person exists + _get_person_or_404(db, person_id) # Ensure person exists try: works = service.get_works_by_person(db=db, person_id=person_id) return works @@ -590,19 +657,20 @@ def get_person_works( logger.exception(f"Error finding works for person {person_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find works associated with the person." 
+ detail="Failed to find works associated with the person.", ) + @router.get( "/contributors/{contributor_id}/repositories", - response_model=List[RepositorySummary], # Returns summaries of repositories + response_model=List[RepositorySummary], # Returns summaries of repositories summary="Get Repositories associated with a Contributor", - tags=["Surfacing", "Contributors"] + tags=["Surfacing", "Contributors"], ) def get_contributor_repositories( contributor_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of repositories (summaries) that the specified contributor @@ -620,32 +688,44 @@ def get_contributor_repositories( HTTPException: 404 if the contributor ID is not found. 500 if an error occurs during retrieval. """ - logger.info(f"Request received: Get repositories for contributor ID {contributor_id}") - _get_contributor_or_404(db, contributor_id) # Ensure contributor link exists + logger.info( + f"Request received: Get repositories for contributor ID {contributor_id}" + ) + _get_contributor_or_404(db, contributor_id) # Ensure contributor link exists try: - repositories = service.get_repositories_by_contributor(db=db, contributor_id=contributor_id) + repositories = service.get_repositories_by_contributor( + db=db, contributor_id=contributor_id + ) # FastAPI handles mapping Repository models to RepositorySummary return repositories except Exception as e: - logger.exception(f"Error finding repositories for contributor {contributor_id}: {e}") + logger.exception( + f"Error finding repositories for contributor {contributor_id}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find repositories associated with the contributor." 
+ detail="Failed to find repositories associated with the contributor.", ) # --- Endpoints related to Affiliations --- + @router.get( "/repositories/{repo_id}/affiliations", response_model=List[AffiliationResultResponse], - summary="Get Affiliations for a Repository" + summary="Get Affiliations for a Repository", ) def get_repository_affiliations( repo_id: int, - min_confidence: Optional[float] = Query(0.0, ge=0.0, le=1.0, description="Optional minimum confidence score [0.0, 1.0] to filter results."), + min_confidence: Optional[float] = Query( + 0.0, + ge=0.0, + le=1.0, + description="Optional minimum confidence score [0.0, 1.0] to filter results.", + ), db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of repository-institution affiliations calculated for the @@ -665,30 +745,40 @@ def get_repository_affiliations( HTTPException: 404 if the repository ID is not found. 500 if an error occurs during retrieval. """ - logger.info(f"Request received: Get affiliations for repository ID {repo_id} (min_conf: {min_confidence})") - _get_repository_or_404(db, repo_id) # Ensure repository exists + logger.info( + f"Request received: Get affiliations for repository ID {repo_id} (min_conf: {min_confidence})" + ) + _get_repository_or_404(db, repo_id) # Ensure repository exists try: affiliations = service.get_affiliations_for_repository( - db=db, repository_id=repo_id, min_confidence=min_confidence or 0.0 # Use 0.0 if None + db=db, + repository_id=repo_id, + min_confidence=min_confidence or 0.0, # Use 0.0 if None ) return affiliations except Exception as e: logger.exception(f"Error getting affiliations for repository {repo_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve repository affiliations." 
+ detail="Failed to retrieve repository affiliations.", ) + @router.get( "/institutions/{inst_id}/affiliations", response_model=List[AffiliationResultResponse], - summary="Get Affiliations for an Institution (Filtered)" + summary="Get Affiliations for an Institution (Filtered)", ) def get_institution_affiliations_filtered( inst_id: int, - min_confidence: Optional[float] = Query(0.0, ge=0.0, le=1.0, description="Optional minimum confidence score [0.0, 1.0] to filter results."), + min_confidence: Optional[float] = Query( + 0.0, + ge=0.0, + le=1.0, + description="Optional minimum confidence score [0.0, 1.0] to filter results.", + ), db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of repository-institution affiliations calculated for the @@ -711,29 +801,36 @@ def get_institution_affiliations_filtered( HTTPException: 404 if the institution ID is not found. 500 if an error occurs during retrieval. 
""" - logger.info(f"Request received: Get filtered affiliations for institution ID {inst_id} (min_conf: {min_confidence})") - _get_institution_or_404(db, inst_id) # Ensure institution exists + logger.info( + f"Request received: Get filtered affiliations for institution ID {inst_id} (min_conf: {min_confidence})" + ) + _get_institution_or_404(db, inst_id) # Ensure institution exists try: affiliations = service.get_affiliations_for_institution( - db=db, institution_id=inst_id, min_confidence=min_confidence or 0.0 # Use 0.0 if None + db=db, + institution_id=inst_id, + min_confidence=min_confidence or 0.0, # Use 0.0 if None ) return affiliations except Exception as e: - logger.exception(f"Error getting filtered affiliations for institution {inst_id}: {e}") + logger.exception( + f"Error getting filtered affiliations for institution {inst_id}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve filtered institution affiliations." + detail="Failed to retrieve filtered institution affiliations.", ) + @router.get( "/institutions/{inst_id}/affiliation_results", response_model=List[AffiliationResultResponse], - summary="Get All Stored Affiliation Results for an Institution" + summary="Get All Stored Affiliation Results for an Institution", ) def get_all_institution_affiliation_results( inst_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves *all* stored repository-institution affiliation results associated @@ -751,8 +848,10 @@ def get_all_institution_affiliation_results( HTTPException: 404 if the institution ID is not found. 500 if an error occurs during retrieval. 
""" - logger.info(f"Request received: Get ALL affiliation results for institution ID {inst_id}") - _get_institution_or_404(db, inst_id) # Ensure institution exists + logger.info( + f"Request received: Get ALL affiliation results for institution ID {inst_id}" + ) + _get_institution_or_404(db, inst_id) # Ensure institution exists try: # Call the service method with minimum confidence set to 0 to retrieve all results affiliations = service.get_affiliations_for_institution( @@ -760,23 +859,26 @@ def get_all_institution_affiliation_results( ) return affiliations except Exception as e: - logger.exception(f"Error getting all affiliation results for institution {inst_id}: {e}") + logger.exception( + f"Error getting all affiliation results for institution {inst_id}: {e}" + ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to retrieve all affiliation results for the institution." + detail="Failed to retrieve all affiliation results for the institution.", ) + # --- Endpoint for Software Dependencies --- @router.get( "/repositories/{repo_id}/dependencies", response_model=List[SoftwareDependencyResponse], summary="Get Software Dependencies for a Repository", - tags=["Surfacing", "Dependencies"] + tags=["Surfacing", "Dependencies"], ) def get_repository_dependencies( repo_id: int, db: Session = Depends(get_db_session), - service: SurfacingService = Depends() # Inject service + service: SurfacingService = Depends(), # Inject service ): """ Retrieves a list of software dependencies (e.g., libraries, packages) @@ -796,15 +898,19 @@ def get_repository_dependencies( 500 if an error occurs during retrieval. 
""" logger.info(f"Request received: Get dependencies for repository ID {repo_id}") - _get_repository_or_404(db, repo_id) # Ensure repository exists + _get_repository_or_404(db, repo_id) # Ensure repository exists try: - dependencies = service.get_dependencies_for_repository(db=db, repository_id=repo_id) + dependencies = service.get_dependencies_for_repository( + db=db, repository_id=repo_id + ) # FastAPI handles mapping SoftwareDependency models to SoftwareDependencyResponse return dependencies except Exception as e: logger.exception(f"Error finding dependencies for repository {repo_id}: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Failed to find dependencies for the repository." + detail="Failed to find dependencies for the repository.", ) -# --- END ADDED ENDPOINT --- \ No newline at end of file + + +# --- END ADDED ENDPOINT --- diff --git a/backend/celery_app.py b/backend/celery_app.py index 52e3ff6..6286a13 100644 --- a/backend/celery_app.py +++ b/backend/celery_app.py @@ -17,14 +17,16 @@ import logging from celery import Celery + # Import Celery signals for hooking into its logging setup process. from celery.signals import setup_logging as setup_celery_logging_signal + # Import custom logging setup functions and handlers. from backend.config.logging_config import ( setup_logging, - ConcurrentRotatingFileHandler, # Process-safe handler (if available). - RotatingFileHandler, # Standard library fallback handler. - CONCURRENT_HANDLER_AVAILABLE # Flag indicating which handler is used. + ConcurrentRotatingFileHandler, # Process-safe handler (if available). + RotatingFileHandler, # Standard library fallback handler. + CONCURRENT_HANDLER_AVAILABLE, # Flag indicating which handler is used. ) # Import application settings to access configuration values like broker URLs. @@ -58,7 +60,7 @@ # Logs for Celery workers will be directed to 'moss_celery.log'. 
setup_logging( log_file_name="moss_celery.log", - handler_class=celery_handler_class # Pass the chosen handler class. + handler_class=celery_handler_class, # Pass the chosen handler class. ) # Obtain the application's logger instance *after* the setup is complete. @@ -79,39 +81,41 @@ def configure_celery_logging(**kwargs): its own log handlers and ensuring our custom setup via `setup_logging` persists. """ - logger.info("Celery 'setup_logging' signal intercepted. Skipping Celery's default logger setup.") + logger.info( + "Celery 'setup_logging' signal intercepted. Skipping Celery's default logger setup." + ) pass + + # --- End Signal Handler --- # --- Initialize Celery Application --- # Create the Celery application instance. celery_app = Celery( - __name__, # Use the current module name as the app name. - broker=settings.CELERY_BROKER_URL, # URL for the message broker (e.g., Redis, RabbitMQ). - backend=settings.CELERY_RESULT_BACKEND_URL, # URL for storing task results. + __name__, # Use the current module name as the app name. + broker=settings.CELERY_BROKER_URL, # URL for the message broker (e.g., Redis, RabbitMQ). + backend=settings.CELERY_RESULT_BACKEND_URL, # URL for storing task results. # List of modules Celery should inspect to discover task definitions. include=[ - 'backend.tasks.scholarly_tasks', # Tasks related to scholarly data processing. - 'backend.tasks.discovery_tasks', # Tasks related to repository/keyword discovery. + "backend.tasks.scholarly_tasks", # Tasks related to scholarly data processing. + "backend.tasks.discovery_tasks", # Tasks related to repository/keyword discovery. # Add other modules containing Celery tasks here. - ] + ], ) # --- Apply Celery Configuration --- # Update the Celery application configuration with specific settings. celery_app.conf.update( - task_serializer='json', # Use JSON for serializing task messages. - accept_content=['json'], # Only accept JSON-formatted task messages. 
- result_serializer='json', # Use JSON for serializing task results. - timezone='UTC', # Standardize on UTC for time-related operations. - enable_utc=True, # Ensure UTC is enabled for scheduling and timestamps. - task_track_started=True, # Record when a task begins execution (useful for monitoring). - + task_serializer="json", # Use JSON for serializing task messages. + accept_content=["json"], # Only accept JSON-formatted task messages. + result_serializer="json", # Use JSON for serializing task results. + timezone="UTC", # Standardize on UTC for time-related operations. + enable_utc=True, # Ensure UTC is enabled for scheduling and timestamps. + task_track_started=True, # Record when a task begins execution (useful for monitoring). # Optional: Retry connecting to the broker on startup if it's not immediately available. # Useful in containerized environments where services might start in parallel. # broker_connection_retry_on_startup=True, - # Note: Worker pool and concurrency are often configured via command-line arguments # (e.g., `celery -A ... worker -P eventlet -c 4`), but can be set here as defaults. # worker_concurrency=4, # Example: Default number of concurrent worker processes/threads. @@ -132,4 +136,4 @@ def configure_celery_logging(**kwargs): # logger.warning("Attempting to start Celery worker directly from script execution. 
" # "Use the 'celery' command-line interface instead.") # celery_app.start() -# --- END OF FILE celery_app.py --- \ No newline at end of file +# --- END OF FILE celery_app.py --- diff --git a/backend/config/__init__.py b/backend/config/__init__.py index 9a8bb4a..4dc99be 100644 --- a/backend/config/__init__.py +++ b/backend/config/__init__.py @@ -1 +1 @@ -# Makes 'config' a Python package \ No newline at end of file +# Makes 'config' a Python package diff --git a/backend/config/logging_config.py b/backend/config/logging_config.py index ae716e9..053da68 100644 --- a/backend/config/logging_config.py +++ b/backend/config/logging_config.py @@ -11,7 +11,6 @@ import logging import sys -import os from logging.handlers import RotatingFileHandler from pathlib import Path @@ -20,6 +19,7 @@ # especially on Windows. Fall back to standard RotatingFileHandler if unavailable. try: from concurrent_log_handler import ConcurrentRotatingFileHandler + CONCURRENT_HANDLER_AVAILABLE = True except ImportError: # Use standard RotatingFileHandler as a fallback if concurrent_log_handler is not installed. 
@@ -41,15 +41,18 @@ "%(asctime)s [%(levelname)-5.5s] [%(name)s] [%(process)d] - %(message)s" ) + # --- Function to configure a specific logger --- def configure_logger( logger_instance: logging.Logger, log_level_console: int = logging.INFO, log_level_file: int = logging.DEBUG, log_file_name: str = "moss_app.log", - max_bytes: int = 10*1024*1024, # 10 MB log file size limit before rotation - backup_count: int = 5, # Number of backup log files to keep - handler_class: type[logging.FileHandler] = RotatingFileHandler # Handler class to use (allows selecting ConcurrentRotatingFileHandler) + max_bytes: int = 10 * 1024 * 1024, # 10 MB log file size limit before rotation + backup_count: int = 5, # Number of backup log files to keep + handler_class: type[ + logging.FileHandler + ] = RotatingFileHandler, # Handler class to use (allows selecting ConcurrentRotatingFileHandler) ): """ Configures console and file handlers for a given logger instance. @@ -88,7 +91,10 @@ def configure_logger( log_file_path = LOG_DIR / log_file_name # Determine which handler class to actually use, falling back if necessary. selected_handler_class = handler_class - if handler_class is ConcurrentRotatingFileHandler and not CONCURRENT_HANDLER_AVAILABLE: + if ( + handler_class is ConcurrentRotatingFileHandler + and not CONCURRENT_HANDLER_AVAILABLE + ): # Log a warning if the preferred concurrent handler isn't available and we're falling back. # This primarily affects multi-process scenarios on Windows. logging.warning( @@ -101,7 +107,8 @@ def configure_logger( # Check if a file handler of the *selected type* pointing to the *same file* # already exists for this logger instance to prevent duplicates. 
handler_exists = any( - isinstance(h, selected_handler_class) and getattr(h, 'baseFilename', None) == str(log_file_path) + isinstance(h, selected_handler_class) + and getattr(h, "baseFilename", None) == str(log_file_path) for h in logger_instance.handlers ) @@ -114,7 +121,7 @@ def configure_logger( filename=str(log_file_path), maxBytes=max_bytes, backupCount=backup_count, - encoding='utf-8', + encoding="utf-8", # delay=True # Optional: Set to True if experiencing file locking issues with ConcurrentRotatingFileHandler ) file_handler.setFormatter(log_formatter) @@ -124,7 +131,7 @@ def configure_logger( # Log configuration details only if the logger actually has handlers now. # Use basicConfig as a last resort if no handlers were added (shouldn't normally happen here). if not logger_instance.hasHandlers(): - logging.basicConfig(level=logging.INFO) # Fallback basic config + logging.basicConfig(level=logging.INFO) # Fallback basic config logger_instance.info( f"Logger '{logger_instance.name}' configured using {selected_handler_class.__name__}. " f"Console Level: {logging.getLevelName(log_level_console)}, " @@ -137,10 +144,12 @@ def configure_logger( def setup_logging( root_log_level_console=logging.INFO, root_log_level_file=logging.DEBUG, - app_log_level_console=logging.INFO, # Parameter kept for potential future granular configuration - app_log_level_file=logging.DEBUG, # Parameter kept for potential future granular configuration + app_log_level_console=logging.INFO, # Parameter kept for potential future granular configuration + app_log_level_file=logging.DEBUG, # Parameter kept for potential future granular configuration log_file_name="moss_app.log", - handler_class: type[logging.FileHandler] = RotatingFileHandler # Default to standard rotating handler + handler_class: type[ + logging.FileHandler + ] = RotatingFileHandler, # Default to standard rotating handler ): """ Configures the root logger for the application. 
@@ -160,51 +169,56 @@ def setup_logging( root_logger = logging.getLogger() if not root_logger.hasHandlers(): - # If the root logger has no handlers, configure it from scratch. - # Pass the desired handler class to the configuration function. - configure_logger( - root_logger, - root_log_level_console, - root_log_level_file, - log_file_name, - handler_class=handler_class + # If the root logger has no handlers, configure it from scratch. + # Pass the desired handler class to the configuration function. + configure_logger( + root_logger, + root_log_level_console, + root_log_level_file, + log_file_name, + handler_class=handler_class, ) else: - # If handlers already exist, check if the file handler needs adjustment. - handler_updated = False - for handler in root_logger.handlers: - # Identify the relevant file handler based on its type and filename. - # Check if it's a FileHandler subclass and has a baseFilename attribute matching the target log file. - if isinstance(handler, logging.FileHandler) and getattr(handler, 'baseFilename', None) and getattr(handler, 'baseFilename', '').endswith(log_file_name): - # Check if the existing handler is of the type we intended to use. - if not isinstance(handler, handler_class): - root_logger.warning( - f"Root logger has existing handler of wrong type ({type(handler).__name__}) " - f"for {log_file_name}. Expected {handler_class.__name__}. " - "Reconfiguration might be needed manually or on restart." - ) - # Check if the existing handler's level matches the desired file level. - elif handler.level != root_log_level_file: - root_logger.info( - f"Updating existing file handler level for root logger to " - f"{logging.getLevelName(root_log_level_file)}" + # If handlers already exist, check if the file handler needs adjustment. + handler_updated = False + for handler in root_logger.handlers: + # Identify the relevant file handler based on its type and filename. 
+ # Check if it's a FileHandler subclass and has a baseFilename attribute matching the target log file. + if ( + isinstance(handler, logging.FileHandler) + and getattr(handler, "baseFilename", None) + and getattr(handler, "baseFilename", "").endswith(log_file_name) + ): + # Check if the existing handler is of the type we intended to use. + if not isinstance(handler, handler_class): + root_logger.warning( + f"Root logger has existing handler of wrong type ({type(handler).__name__}) " + f"for {log_file_name}. Expected {handler_class.__name__}. " + "Reconfiguration might be needed manually or on restart." + ) + # Check if the existing handler's level matches the desired file level. + elif handler.level != root_log_level_file: + root_logger.info( + f"Updating existing file handler level for root logger to " + f"{logging.getLevelName(root_log_level_file)}" ) - handler.setLevel(root_log_level_file) - handler_updated = True - # Assume only one file handler corresponds to this log file name. - break - - if handler_updated: - root_logger.info( - f"Root logger already configured. Ensured file level is " - f"{logging.getLevelName(root_log_level_file)} for handler type {handler_class.__name__}." + handler.setLevel(root_log_level_file) + handler_updated = True + # Assume only one file handler corresponds to this log file name. + break + + if handler_updated: + root_logger.info( + f"Root logger already configured. Ensured file level is " + f"{logging.getLevelName(root_log_level_file)} for handler type {handler_class.__name__}." ) - else: - # Log a warning if root logger was configured but no matching handler was found to update. - root_logger.warning( - f"Root logger already configured, but no matching file handler found " - f"for {log_file_name} and type {handler_class.__name__} to update level." - ) + else: + # Log a warning if root logger was configured but no matching handler was found to update. 
+ root_logger.warning( + f"Root logger already configured, but no matching file handler found " + f"for {log_file_name} and type {handler_class.__name__} to update level." + ) + # --- Example Usage in other modules --- # import logging @@ -217,4 +231,4 @@ def setup_logging( # logger.debug("This is a debug message, typically useful for development.") # logger.warning("This indicates a potential issue.") # logger.error("This signals an error that occurred.") -# logger.critical("This indicates a critical failure.") \ No newline at end of file +# logger.critical("This indicates a critical failure.") diff --git a/backend/config/settings.py b/backend/config/settings.py index 316db4d..f8a7db2 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -15,9 +15,9 @@ # --- Project Root Determination --- # Assume settings.py is located in 'backend/config'. Navigate up two levels to find the project root. -PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) # Construct the full path to the .env file in the project root. -DOTENV_PATH = os.path.join(PROJECT_ROOT, '.env') +DOTENV_PATH = os.path.join(PROJECT_ROOT, ".env") # --- Load Environment Variables --- # Attempt to load the .env file if it exists. Variables defined in the environment @@ -34,6 +34,7 @@ # Get a logger instance specific to this module. logger = logging.getLogger(__name__) + class Settings: """ Application settings loaded from environment variables. @@ -42,16 +43,23 @@ class Settings: using a `.env` file as a potential source. Performs basic validation to ensure critical settings are present. """ + # --- Database Configuration --- - DATABASE_URL: str | None = None # Connection string for the primary database. + DATABASE_URL: str | None = None # Connection string for the primary database. 
# --- External Service API Keys --- - GITHUB_API_TOKEN: str | None = None # Token for authenticating with the GitHub API. - OPENALEX_EMAIL: str | None = None # Email address for identifying requests to the OpenAlex API (polite pool). + GITHUB_API_TOKEN: str | None = None # Token for authenticating with the GitHub API. + OPENALEX_EMAIL: str | None = ( + None # Email address for identifying requests to the OpenAlex API (polite pool). + ) # --- Celery Configuration (Task Queue) --- - CELERY_BROKER_URL: str | None = None # URL for the Celery message broker (e.g., Redis, RabbitMQ). - CELERY_RESULT_BACKEND_URL: str | None = None # URL for the Celery result backend (e.g., Redis, database). + CELERY_BROKER_URL: str | None = ( + None # URL for the Celery message broker (e.g., Redis, RabbitMQ). + ) + CELERY_RESULT_BACKEND_URL: str | None = ( + None # URL for the Celery result backend (e.g., Redis, database). + ) def __init__(self): """ @@ -64,8 +72,12 @@ def __init__(self): self.OPENALEX_EMAIL = os.getenv("OPENALEX_EMAIL") # Load Celery settings, providing defaults suitable for local development if not set. - self.CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0") - self.CELERY_RESULT_BACKEND_URL = os.getenv("CELERY_RESULT_BACKEND_URL", "redis://localhost:6379/1") + self.CELERY_BROKER_URL = os.getenv( + "CELERY_BROKER_URL", "redis://localhost:6379/0" + ) + self.CELERY_RESULT_BACKEND_URL = os.getenv( + "CELERY_RESULT_BACKEND_URL", "redis://localhost:6379/1" + ) # --- Validation --- # Define settings considered essential for the application to run correctly. @@ -91,12 +103,18 @@ def __init__(self): # Log the status of loaded settings for debugging, avoiding sensitive values. # Indicate whether a value was explicitly set or if a default is being used (for Celery). 
logger.debug(f"DATABASE_URL: {'Set' if self.DATABASE_URL else 'Not Set'}") - logger.debug(f"GITHUB_API_TOKEN: {'Set' if self.GITHUB_API_TOKEN else 'Not Set'}") + logger.debug( + f"GITHUB_API_TOKEN: {'Set' if self.GITHUB_API_TOKEN else 'Not Set'}" + ) logger.debug(f"OPENALEX_EMAIL: {self.OPENALEX_EMAIL or 'Not Set'}") - logger.debug(f"CELERY_BROKER_URL: {'Set from environment' if os.getenv('CELERY_BROKER_URL') else 'Using Default/Loaded'}") - logger.debug(f"CELERY_RESULT_BACKEND_URL: {'Set from environment' if os.getenv('CELERY_RESULT_BACKEND_URL') else 'Using Default/Loaded'}") + logger.debug( + f"CELERY_BROKER_URL: {'Set from environment' if os.getenv('CELERY_BROKER_URL') else 'Using Default/Loaded'}" + ) + logger.debug( + f"CELERY_RESULT_BACKEND_URL: {'Set from environment' if os.getenv('CELERY_RESULT_BACKEND_URL') else 'Using Default/Loaded'}" + ) # Create a single, globally accessible instance of the Settings class. # Other modules can import this instance directly: `from backend.config.settings import settings` -settings = Settings() \ No newline at end of file +settings = Settings() diff --git a/backend/data/__init__.py b/backend/data/__init__.py index 1de139b..ba911fb 100644 --- a/backend/data/__init__.py +++ b/backend/data/__init__.py @@ -1 +1 @@ -# Makes 'data' a Python package \ No newline at end of file +# Makes 'data' a Python package diff --git a/backend/data/database.py b/backend/data/database.py index 6ccea45..b32ecfa 100644 --- a/backend/data/database.py +++ b/backend/data/database.py @@ -10,9 +10,10 @@ import logging from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker + # Use declarative_base from sqlalchemy.orm as recommended in modern SQLAlchemy from sqlalchemy.orm import declarative_base -from sqlalchemy.exc import SQLAlchemyError # Specific exception for database errors +from sqlalchemy.exc import SQLAlchemyError # Specific exception for database errors # Import application settings, expected to contain the DATABASE_URL 
from backend.config.settings import settings @@ -37,13 +38,13 @@ # --- Connection Pool Configuration --- # These parameters tune the connection pool behavior for performance and reliability. # pool_size: The target number of connections to keep readily available in the pool. - pool_size=20, # Increased from default (often 5) to handle more concurrent requests. + pool_size=20, # Increased from default (often 5) to handle more concurrent requests. # max_overflow: The maximum number of additional connections allowed beyond 'pool_size' # during peak load before requests start waiting. - max_overflow=30, # Allows for bursts of activity. (default often 10) + max_overflow=30, # Allows for bursts of activity. (default often 10) # pool_timeout: The number of seconds to wait when trying to get a connection from the # pool before raising a TimeoutError. - pool_timeout=30 # Standard timeout duration. + pool_timeout=30, # Standard timeout duration. ) # --- Optional: Connection Event Logging --- @@ -59,9 +60,15 @@ # Log essential information about the engine setup for monitoring. # Avoid logging the full DATABASE_URL for security, show only the end part. - log_url_display = f"{'*' * 5}{SQLALCHEMY_DATABASE_URL[-5:]}" if SQLALCHEMY_DATABASE_URL else "Not Set" + log_url_display = ( + f"{'*' * 5}{SQLALCHEMY_DATABASE_URL[-5:]}" + if SQLALCHEMY_DATABASE_URL + else "Not Set" + ) logger.info(f"SQLAlchemy engine created for URL ending in: {log_url_display}") - logger.info(f"SQLAlchemy pool settings: size={engine.pool.size()}, overflow={engine.pool.overflow()}, timeout={engine.pool.timeout()}") + logger.info( + f"SQLAlchemy pool settings: size={engine.pool.size()}, overflow={engine.pool.overflow()}, timeout={engine.pool.timeout()}" + ) # --- Robust Error Handling --- # Catch specific errors during engine creation to provide informative logs and fail gracefully. @@ -86,7 +93,7 @@ # autoflush=False: Prevents automatic flushing of changes before queries, giving more control. 
autoflush=False, # bind=engine: Associates this session factory with our configured database engine. - bind=engine + bind=engine, ) # --- Declarative Base --- @@ -94,6 +101,7 @@ # All application data models should inherit from this 'Base'. Base = declarative_base() + # --- Dependency for Web Frameworks (e.g., FastAPI) --- def get_db(): """ @@ -108,7 +116,7 @@ def get_db(): Yields: sqlalchemy.orm.Session: A database session instance. """ - db = SessionLocal() # Create a new session instance from the factory. + db = SessionLocal() # Create a new session instance from the factory. try: # Yield the session to the part of the code that depends on it (e.g., a request handler). yield db @@ -117,6 +125,7 @@ def get_db(): # It's crucial to close the session to release the database connection back to the pool. db.close() + # --- Example Standalone Usage (Commented Out) --- # This section demonstrates how to use the SessionLocal directly, # typically needed in scripts, background tasks, or tests outside the @@ -145,4 +154,4 @@ def get_db(): # raise # Re-raise the exception after rollback if necessary # finally: # # Always ensure the session is closed to free up resources. 
-# db.close() \ No newline at end of file +# db.close() diff --git a/backend/data/migrations/env.py b/backend/data/migrations/env.py index ca921d5..5472576 100644 --- a/backend/data/migrations/env.py +++ b/backend/data/migrations/env.py @@ -1,6 +1,5 @@ import os import sys -from logging.config import fileConfig from sqlalchemy import engine_from_config from sqlalchemy import pool @@ -9,12 +8,15 @@ # --- MOSS CONFIGURATION START --- # Add the project's root directory to the Python path -PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) +PROJECT_ROOT = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "..") +) if PROJECT_ROOT not in sys.path: sys.path.insert(0, PROJECT_ROOT) # Import the Base FIRST from backend.data.database import Base + # Import your application settings from backend.config.settings import settings @@ -22,7 +24,8 @@ # This ensures they register with Base.metadata *before* we assign it below # Wrapped in a try-except just in case there's an import error during testing try: - import backend.data.models # This should trigger models/__init__.py + import backend.data.models # This should trigger models/__init__.py + print("Models package imported successfully in env.py") except ImportError as e: print(f"ERROR importing models package in env.py: {e}", file=sys.stderr) @@ -53,13 +56,14 @@ # (Rest of the file remains the same - run_migrations_offline / run_migrations_online) # ... + def run_migrations_offline() -> None: """Run migrations in 'offline' mode. # ... (rest of docstring) ... 
""" # --- MOSS MODIFICATION START --- if not settings.DATABASE_URL: - raise ValueError("DATABASE_URL not found in settings for offline migration.") + raise ValueError("DATABASE_URL not found in settings for offline migration.") url = settings.DATABASE_URL # --- MOSS MODIFICATION END --- @@ -81,7 +85,7 @@ def run_migrations_online() -> None: # --- MOSS MODIFICATION START --- configuration = config.get_section(config.config_ini_section) if configuration is None: - raise Exception("Alembic config section [alembic] not found in alembic.ini") + raise Exception("Alembic config section [alembic] not found in alembic.ini") if not settings.DATABASE_URL: raise ValueError("DATABASE_URL not found in settings for online migration.") @@ -94,7 +98,6 @@ def run_migrations_online() -> None: ) # --- MOSS MODIFICATION END --- - with connectable.connect() as connection: context.configure( connection=connection, @@ -108,4 +111,4 @@ def run_migrations_online() -> None: if context.is_offline_mode(): run_migrations_offline() else: - run_migrations_online() \ No newline at end of file + run_migrations_online() diff --git a/backend/data/migrations/versions/1b4fdd19cc31_phase_10_add_repository_institution_.py b/backend/data/migrations/versions/1b4fdd19cc31_phase_10_add_repository_institution_.py index 452b780..ea10bc9 100644 --- a/backend/data/migrations/versions/1b4fdd19cc31_phase_10_add_repository_institution_.py +++ b/backend/data/migrations/versions/1b4fdd19cc31_phase_10_add_repository_institution_.py @@ -8,6 +8,7 @@ Create Date: YYYY-MM-DD HH:MM:SS.ffffff # Replace with actual new timestamp """ + from typing import Sequence, Union from alembic import op @@ -15,8 +16,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. 
-revision: str = '' # Replace with actual new ID -down_revision: Union[str, None] = 'c9b46f9c64e5' +revision: str = "" # Replace with actual new ID +down_revision: Union[str, None] = "c9b46f9c64e5" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -24,30 +25,65 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - START ### - op.create_table('repository_institution_affiliations', - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('institution_id', sa.Integer(), nullable=False), - sa.Column('algorithm_name', sa.String(), nullable=False), - sa.Column('algorithm_version', sa.String(), nullable=False), - sa.Column('confidence_score', sa.Float(), nullable=False), - sa.Column('evidence', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('parameters_used', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('calculated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['institution_id'], ['institutions.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('repository_id', 'institution_id', 'algorithm_name', 'algorithm_version') + op.create_table( + "repository_institution_affiliations", + sa.Column("repository_id", sa.Integer(), nullable=False), + sa.Column("institution_id", sa.Integer(), nullable=False), + sa.Column("algorithm_name", sa.String(), nullable=False), + sa.Column("algorithm_version", sa.String(), nullable=False), + sa.Column("confidence_score", sa.Float(), nullable=False), + sa.Column("evidence", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column( + "parameters_used", postgresql.JSONB(astext_type=sa.Text()), nullable=True + ), + sa.Column( + "calculated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + 
sa.ForeignKeyConstraint( + ["institution_id"], ["institutions.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["repository_id"], ["repositories.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint( + "repository_id", "institution_id", "algorithm_name", "algorithm_version" + ), + ) + op.create_index( + "ix_repo_inst_affil_algo_name", + "repository_institution_affiliations", + ["algorithm_name"], + unique=False, + ) + op.create_index( + "ix_repo_inst_affil_inst_id", + "repository_institution_affiliations", + ["institution_id"], + unique=False, + ) + op.create_index( + "ix_repo_inst_affil_repo_id", + "repository_institution_affiliations", + ["repository_id"], + unique=False, ) - op.create_index('ix_repo_inst_affil_algo_name', 'repository_institution_affiliations', ['algorithm_name'], unique=False) - op.create_index('ix_repo_inst_affil_inst_id', 'repository_institution_affiliations', ['institution_id'], unique=False) - op.create_index('ix_repo_inst_affil_repo_id', 'repository_institution_affiliations', ['repository_id'], unique=False) # ### end Alembic commands ### def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - START ### - op.drop_index('ix_repo_inst_affil_repo_id', table_name='repository_institution_affiliations') - op.drop_index('ix_repo_inst_affil_inst_id', table_name='repository_institution_affiliations') - op.drop_index('ix_repo_inst_affil_algo_name', table_name='repository_institution_affiliations') - op.drop_table('repository_institution_affiliations') - # ### end Alembic commands ### \ No newline at end of file + op.drop_index( + "ix_repo_inst_affil_repo_id", table_name="repository_institution_affiliations" + ) + op.drop_index( + "ix_repo_inst_affil_inst_id", table_name="repository_institution_affiliations" + ) + op.drop_index( + "ix_repo_inst_affil_algo_name", table_name="repository_institution_affiliations" + ) + op.drop_table("repository_institution_affiliations") + # ### end Alembic commands ### 
diff --git a/backend/data/migrations/versions/1cc8bbb9702b_phase_2_add_keyword_search_models.py b/backend/data/migrations/versions/1cc8bbb9702b_phase_2_add_keyword_search_models.py index abe36bf..0dd88d2 100644 --- a/backend/data/migrations/versions/1cc8bbb9702b_phase_2_add_keyword_search_models.py +++ b/backend/data/migrations/versions/1cc8bbb9702b_phase_2_add_keyword_search_models.py @@ -5,6 +5,7 @@ Create Date: 2025-04-07 06:37:53.016044 """ + from typing import Sequence, Union from alembic import op @@ -12,8 +13,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision: str = '1cc8bbb9702b' -down_revision: Union[str, None] = 'ac00d539ca94' +revision: str = "1cc8bbb9702b" +down_revision: Union[str, None] = "ac00d539ca94" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,29 +22,74 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('keyword_search_sessions', - sa.Column('keywords_raw', sa.Text(), nullable=False), - sa.Column('status', sa.String(), nullable=False), - sa.Column('results_count', sa.Integer(), nullable=True), - sa.Column('started_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_keyword_search_sessions_id'), 'keyword_search_sessions', ['id'], unique=False) - op.create_index('ix_keyword_search_sessions_status', 'keyword_search_sessions', ['status'], unique=False) - op.create_table('keyword_repository_associations', - sa.Column('keyword_search_session_id', sa.Integer(), nullable=False), - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('match_details', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.ForeignKeyConstraint(['keyword_search_session_id'], ['keyword_search_sessions.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('keyword_search_session_id', 'repository_id') - ) - op.create_index(op.f('ix_keyword_repository_associations_keyword_search_session_id'), 'keyword_repository_associations', ['keyword_search_session_id'], unique=False) - op.create_index(op.f('ix_keyword_repository_associations_repository_id'), 'keyword_repository_associations', ['repository_id'], unique=False) + op.create_table( + "keyword_search_sessions", + sa.Column("keywords_raw", sa.Text(), nullable=False), + sa.Column("status", sa.String(), nullable=False), + sa.Column("results_count", sa.Integer(), nullable=True), + 
sa.Column( + "started_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_keyword_search_sessions_id"), + "keyword_search_sessions", + ["id"], + unique=False, + ) + op.create_index( + "ix_keyword_search_sessions_status", + "keyword_search_sessions", + ["status"], + unique=False, + ) + op.create_table( + "keyword_repository_associations", + sa.Column("keyword_search_session_id", sa.Integer(), nullable=False), + sa.Column("repository_id", sa.Integer(), nullable=False), + sa.Column( + "match_details", postgresql.JSONB(astext_type=sa.Text()), nullable=True + ), + sa.ForeignKeyConstraint( + ["keyword_search_session_id"], + ["keyword_search_sessions.id"], + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["repository_id"], ["repositories.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("keyword_search_session_id", "repository_id"), + ) + op.create_index( + op.f("ix_keyword_repository_associations_keyword_search_session_id"), + "keyword_repository_associations", + ["keyword_search_session_id"], + unique=False, + ) + op.create_index( + op.f("ix_keyword_repository_associations_repository_id"), + "keyword_repository_associations", + ["repository_id"], + unique=False, + ) # op.create_unique_constraint('uq_repo_contrib', 'repository_contributors', ['repository_id', 'contributor_id']) # ### end Alembic commands ### @@ -51,11 +97,21 @@ def upgrade() -> None: def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.drop_constraint('uq_repo_contrib', 'repository_contributors', type_='unique') - op.drop_index(op.f('ix_keyword_repository_associations_repository_id'), table_name='keyword_repository_associations') - op.drop_index(op.f('ix_keyword_repository_associations_keyword_search_session_id'), table_name='keyword_repository_associations') - op.drop_table('keyword_repository_associations') - op.drop_index('ix_keyword_search_sessions_status', table_name='keyword_search_sessions') - op.drop_index(op.f('ix_keyword_search_sessions_id'), table_name='keyword_search_sessions') - op.drop_table('keyword_search_sessions') + op.drop_constraint("uq_repo_contrib", "repository_contributors", type_="unique") + op.drop_index( + op.f("ix_keyword_repository_associations_repository_id"), + table_name="keyword_repository_associations", + ) + op.drop_index( + op.f("ix_keyword_repository_associations_keyword_search_session_id"), + table_name="keyword_repository_associations", + ) + op.drop_table("keyword_repository_associations") + op.drop_index( + "ix_keyword_search_sessions_status", table_name="keyword_search_sessions" + ) + op.drop_index( + op.f("ix_keyword_search_sessions_id"), table_name="keyword_search_sessions" + ) + op.drop_table("keyword_search_sessions") # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/3ab81a4cf052_allow_null_entity_id_in_entity_.py b/backend/data/migrations/versions/3ab81a4cf052_allow_null_entity_id_in_entity_.py index f7bc2e9..fc0197e 100644 --- a/backend/data/migrations/versions/3ab81a4cf052_allow_null_entity_id_in_entity_.py +++ b/backend/data/migrations/versions/3ab81a4cf052_allow_null_entity_id_in_entity_.py @@ -5,6 +5,7 @@ Create Date: 2025-04-07 14:32:07.068379 """ + from typing import Sequence, Union from alembic import op @@ -12,8 +13,8 @@ # revision identifiers, used by Alembic. 
-revision: str = '3ab81a4cf052' -down_revision: Union[str, None] = 'a7e01fc1d2e8' +revision: str = "3ab81a4cf052" +down_revision: Union[str, None] = "a7e01fc1d2e8" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,9 +22,12 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! ### - op.alter_column('entity_discovery_associations', 'entity_id', - existing_type=sa.INTEGER(), - nullable=True) + op.alter_column( + "entity_discovery_associations", + "entity_id", + existing_type=sa.INTEGER(), + nullable=True, + ) # --- REMOVE THIS LINE --- # op.create_unique_constraint('uq_repo_contrib', 'repository_contributors', ['repository_id', 'contributor_id']) # --- END REMOVAL --- @@ -36,7 +40,10 @@ def downgrade() -> None: # --- REMOVE THIS LINE --- # op.drop_constraint('uq_repo_contrib', 'repository_contributors', type_='unique') # --- END REMOVAL --- - op.alter_column('entity_discovery_associations', 'entity_id', - existing_type=sa.INTEGER(), - nullable=False) - # ### end Alembic commands ### \ No newline at end of file + op.alter_column( + "entity_discovery_associations", + "entity_id", + existing_type=sa.INTEGER(), + nullable=False, + ) + # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/4c5ec8e48a9c_phase_19_add_domain_field_subfield_.py b/backend/data/migrations/versions/4c5ec8e48a9c_phase_19_add_domain_field_subfield_.py index 8e7532b..9acaad7 100644 --- a/backend/data/migrations/versions/4c5ec8e48a9c_phase_19_add_domain_field_subfield_.py +++ b/backend/data/migrations/versions/4c5ec8e48a9c_phase_19_add_domain_field_subfield_.py @@ -7,6 +7,7 @@ Create Date: 2025-04-15 21:48:21.467935 # Or your actual timestamp """ + from typing import Sequence, Union from alembic import op @@ -14,8 +15,8 @@ # revision identifiers, used by Alembic. 
-revision: str = '4c5ec8e48a9c' -down_revision: Union[str, None] = 'dd1449ba853a' +revision: str = "4c5ec8e48a9c" +down_revision: Union[str, None] = "dd1449ba853a" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -23,79 +24,148 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - START ### - op.create_table('domains', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('display_name', sa.String(), nullable=False), - sa.Column('description', sa.Text(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_domains_display_name'), 'domains', ['display_name'], unique=False) - op.create_index(op.f('ix_domains_id'), 'domains', ['id'], unique=False) - op.create_index(op.f('ix_domains_openalex_id'), 'domains', ['openalex_id'], unique=True) - op.create_table('fields', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('display_name', sa.String(), nullable=False), - sa.Column('description', sa.Text(), nullable=True), - sa.Column('domain_id', sa.Integer(), nullable=False), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['domain_id'], ['domains.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id') + op.create_table( + "domains", + sa.Column("openalex_id", sa.String(), nullable=False), + sa.Column("display_name", sa.String(), nullable=False), + sa.Column("description", sa.Text(), nullable=True), 
+ sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_domains_display_name"), "domains", ["display_name"], unique=False + ) + op.create_index(op.f("ix_domains_id"), "domains", ["id"], unique=False) + op.create_index( + op.f("ix_domains_openalex_id"), "domains", ["openalex_id"], unique=True + ) + op.create_table( + "fields", + sa.Column("openalex_id", sa.String(), nullable=False), + sa.Column("display_name", sa.String(), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("domain_id", sa.Integer(), nullable=False), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint(["domain_id"], ["domains.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), ) # Using op.f for consistency where possible - op.create_index(op.f('ix_fields_display_name'), 'fields', ['display_name'], unique=False) - op.create_index(op.f('ix_fields_domain_id'), 'fields', ['domain_id'], unique=False) - op.create_index(op.f('ix_fields_id'), 'fields', ['id'], unique=False) - op.create_index(op.f('ix_fields_openalex_id'), 'fields', ['openalex_id'], unique=True) - op.create_table('subfields', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('display_name', sa.String(), nullable=False), - sa.Column('description', sa.Text(), nullable=True), - sa.Column('field_id', sa.Integer(), nullable=False), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - 
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['field_id'], ['fields.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id') + op.create_index( + op.f("ix_fields_display_name"), "fields", ["display_name"], unique=False + ) + op.create_index(op.f("ix_fields_domain_id"), "fields", ["domain_id"], unique=False) + op.create_index(op.f("ix_fields_id"), "fields", ["id"], unique=False) + op.create_index( + op.f("ix_fields_openalex_id"), "fields", ["openalex_id"], unique=True + ) + op.create_table( + "subfields", + sa.Column("openalex_id", sa.String(), nullable=False), + sa.Column("display_name", sa.String(), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("field_id", sa.Integer(), nullable=False), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint(["field_id"], ["fields.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), ) # Using op.f for consistency where possible - op.create_index(op.f('ix_subfields_display_name'), 'subfields', ['display_name'], unique=False) - op.create_index(op.f('ix_subfields_field_id'), 'subfields', ['field_id'], unique=False) - op.create_index(op.f('ix_subfields_id'), 'subfields', ['id'], unique=False) + op.create_index( + op.f("ix_subfields_display_name"), "subfields", ["display_name"], unique=False + ) + op.create_index( + op.f("ix_subfields_field_id"), "subfields", ["field_id"], unique=False + ) + op.create_index(op.f("ix_subfields_id"), "subfields", ["id"], unique=False) # --- CORRECTION HERE: unique=True --- - 
op.create_index(op.f('ix_subfields_openalex_id'), 'subfields', ['openalex_id'], unique=True) + op.create_index( + op.f("ix_subfields_openalex_id"), "subfields", ["openalex_id"], unique=True + ) # --- END CORRECTION --- - op.create_table('topics', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('display_name', sa.String(), nullable=False), - sa.Column('description', sa.Text(), nullable=True), - sa.Column('subfield_id', sa.Integer(), nullable=False), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['subfield_id'], ['subfields.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_topics_display_name'), 'topics', ['display_name'], unique=False) - op.create_index(op.f('ix_topics_id'), 'topics', ['id'], unique=False) - op.create_index(op.f('ix_topics_openalex_id'), 'topics', ['openalex_id'], unique=True) + op.create_table( + "topics", + sa.Column("openalex_id", sa.String(), nullable=False), + sa.Column("display_name", sa.String(), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("subfield_id", sa.Integer(), nullable=False), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint(["subfield_id"], ["subfields.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_topics_display_name"), "topics", ["display_name"], unique=False + ) + op.create_index(op.f("ix_topics_id"), "topics", ["id"], unique=False) + op.create_index( + 
op.f("ix_topics_openalex_id"), "topics", ["openalex_id"], unique=True + ) # Using op.f for consistency where possible - op.create_index(op.f('ix_topics_subfield_id'), 'topics', ['subfield_id'], unique=False) - op.create_table('work_topics', - sa.Column('work_id', sa.Integer(), nullable=False), - sa.Column('topic_id', sa.Integer(), nullable=False), - sa.Column('score', sa.Float(), nullable=True), - sa.Column('is_primary', sa.Boolean(), nullable=False), - sa.ForeignKeyConstraint(['topic_id'], ['topics.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['work_id'], ['works.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('work_id', 'topic_id') - ) - op.create_index(op.f('ix_work_topics_topic_id'), 'work_topics', ['topic_id'], unique=False) - op.create_index(op.f('ix_work_topics_work_id'), 'work_topics', ['work_id'], unique=False) + op.create_index( + op.f("ix_topics_subfield_id"), "topics", ["subfield_id"], unique=False + ) + op.create_table( + "work_topics", + sa.Column("work_id", sa.Integer(), nullable=False), + sa.Column("topic_id", sa.Integer(), nullable=False), + sa.Column("score", sa.Float(), nullable=True), + sa.Column("is_primary", sa.Boolean(), nullable=False), + sa.ForeignKeyConstraint(["topic_id"], ["topics.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["work_id"], ["works.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("work_id", "topic_id"), + ) + op.create_index( + op.f("ix_work_topics_topic_id"), "work_topics", ["topic_id"], unique=False + ) + op.create_index( + op.f("ix_work_topics_work_id"), "work_topics", ["work_id"], unique=False + ) # --- REMOVED UNRELATED PERSONS INDEX CHANGES --- # op.drop_index('ix_persons_orcid', table_name='persons') # op.create_index('ix_persons_orcid', 'persons', ['orcid'], unique=False) @@ -110,26 +180,26 @@ def downgrade() -> None: # op.drop_index('ix_persons_orcid', table_name='persons') # op.create_index('ix_persons_orcid', 'persons', ['orcid'], unique=True) # --- END REMOVAL --- - 
op.drop_index(op.f('ix_work_topics_work_id'), table_name='work_topics') - op.drop_index(op.f('ix_work_topics_topic_id'), table_name='work_topics') - op.drop_table('work_topics') - op.drop_index(op.f('ix_topics_subfield_id'), table_name='topics') - op.drop_index(op.f('ix_topics_openalex_id'), table_name='topics') - op.drop_index(op.f('ix_topics_id'), table_name='topics') - op.drop_index(op.f('ix_topics_display_name'), table_name='topics') - op.drop_table('topics') - op.drop_index(op.f('ix_subfields_openalex_id'), table_name='subfields') - op.drop_index(op.f('ix_subfields_id'), table_name='subfields') - op.drop_index(op.f('ix_subfields_field_id'), table_name='subfields') - op.drop_index(op.f('ix_subfields_display_name'), table_name='subfields') - op.drop_table('subfields') - op.drop_index(op.f('ix_fields_openalex_id'), table_name='fields') - op.drop_index(op.f('ix_fields_id'), table_name='fields') - op.drop_index(op.f('ix_fields_domain_id'), table_name='fields') - op.drop_index(op.f('ix_fields_display_name'), table_name='fields') - op.drop_table('fields') - op.drop_index(op.f('ix_domains_openalex_id'), table_name='domains') - op.drop_index(op.f('ix_domains_id'), table_name='domains') - op.drop_index(op.f('ix_domains_display_name'), table_name='domains') - op.drop_table('domains') - # ### end Alembic commands ### \ No newline at end of file + op.drop_index(op.f("ix_work_topics_work_id"), table_name="work_topics") + op.drop_index(op.f("ix_work_topics_topic_id"), table_name="work_topics") + op.drop_table("work_topics") + op.drop_index(op.f("ix_topics_subfield_id"), table_name="topics") + op.drop_index(op.f("ix_topics_openalex_id"), table_name="topics") + op.drop_index(op.f("ix_topics_id"), table_name="topics") + op.drop_index(op.f("ix_topics_display_name"), table_name="topics") + op.drop_table("topics") + op.drop_index(op.f("ix_subfields_openalex_id"), table_name="subfields") + op.drop_index(op.f("ix_subfields_id"), table_name="subfields") + 
op.drop_index(op.f("ix_subfields_field_id"), table_name="subfields") + op.drop_index(op.f("ix_subfields_display_name"), table_name="subfields") + op.drop_table("subfields") + op.drop_index(op.f("ix_fields_openalex_id"), table_name="fields") + op.drop_index(op.f("ix_fields_id"), table_name="fields") + op.drop_index(op.f("ix_fields_domain_id"), table_name="fields") + op.drop_index(op.f("ix_fields_display_name"), table_name="fields") + op.drop_table("fields") + op.drop_index(op.f("ix_domains_openalex_id"), table_name="domains") + op.drop_index(op.f("ix_domains_id"), table_name="domains") + op.drop_index(op.f("ix_domains_display_name"), table_name="domains") + op.drop_table("domains") + # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/6caa9c3d1fa0_phase_21_add_pullrequest_issue_comment_.py b/backend/data/migrations/versions/6caa9c3d1fa0_phase_21_add_pullrequest_issue_comment_.py index fd250ce..b83c728 100644 --- a/backend/data/migrations/versions/6caa9c3d1fa0_phase_21_add_pullrequest_issue_comment_.py +++ b/backend/data/migrations/versions/6caa9c3d1fa0_phase_21_add_pullrequest_issue_comment_.py @@ -7,6 +7,7 @@ Create Date: 2025-04-30 17:11:24.546255 """ + from typing import Sequence, Union from alembic import op @@ -14,8 +15,10 @@ # revision identifiers, used by Alembic. 
-revision: str = '6caa9c3d1fa0' -down_revision: Union[str, None] = 'd19968da140c' # Ensure this points to your actual previous revision +revision: str = "6caa9c3d1fa0" +down_revision: Union[str, None] = ( + "d19968da140c" # Ensure this points to your actual previous revision +) branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -23,95 +26,207 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - START ### - op.create_table('issues', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('user_id', sa.Integer(), nullable=False), - sa.Column('number', sa.Integer(), nullable=False), - sa.Column('title', sa.Text(), nullable=True), - sa.Column('state', sa.String(), nullable=False), - sa.Column('gh_created_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_updated_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_closed_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['user_id'], ['contributors.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_issues_github_id'), 'issues', ['github_id'], unique=True) - op.create_index(op.f('ix_issues_id'), 'issues', ['id'], unique=False) - op.create_index('ix_issues_number', 'issues', ['number'], unique=False) - op.create_index('ix_issues_repo_id', 'issues', ['repository_id'], unique=False) - op.create_index('ix_issues_repo_number', 'issues', ['repository_id', 'number'], unique=False) - op.create_index(op.f('ix_issues_repository_id'), 
'issues', ['repository_id'], unique=False) # Keep op.f if generated - op.create_index('ix_issues_state', 'issues', ['state'], unique=False) - op.create_index('ix_issues_user_id', 'issues', ['user_id'], unique=False) - op.create_table('pull_requests', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('user_id', sa.Integer(), nullable=False), - sa.Column('number', sa.Integer(), nullable=False), - sa.Column('title', sa.Text(), nullable=True), - sa.Column('state', sa.String(), nullable=False), - sa.Column('gh_created_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_updated_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_closed_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_merged_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['user_id'], ['contributors.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_pull_requests_github_id'), 'pull_requests', ['github_id'], unique=True) - op.create_index(op.f('ix_pull_requests_id'), 'pull_requests', ['id'], unique=False) - op.create_index('ix_pull_requests_number', 'pull_requests', ['number'], unique=False) - op.create_index('ix_pull_requests_repo_id', 'pull_requests', ['repository_id'], unique=False) - op.create_index('ix_pull_requests_repo_number', 'pull_requests', ['repository_id', 'number'], unique=False) - op.create_index(op.f('ix_pull_requests_repository_id'), 'pull_requests', ['repository_id'], unique=False) # Keep op.f if generated - op.create_index('ix_pull_requests_state', 
'pull_requests', ['state'], unique=False) - op.create_index(op.f('ix_pull_requests_user_id'), 'pull_requests', ['user_id'], unique=False) # Keep op.f if generated - op.create_table('issue_comments', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('issue_id', sa.Integer(), nullable=False), - sa.Column('user_id', sa.Integer(), nullable=False), - sa.Column('body', sa.Text(), nullable=True), - sa.Column('gh_created_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_updated_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['issue_id'], ['issues.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['user_id'], ['contributors.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_issue_comments_github_id'), 'issue_comments', ['github_id'], unique=True) - op.create_index(op.f('ix_issue_comments_id'), 'issue_comments', ['id'], unique=False) - op.create_index('ix_issue_comments_issue_id', 'issue_comments', ['issue_id'], unique=False) - op.create_index('ix_issue_comments_user_id', 'issue_comments', ['user_id'], unique=False) - op.create_table('pr_review_comments', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('pull_request_review_id', sa.BigInteger(), nullable=True), - sa.Column('pr_id', sa.Integer(), nullable=False), - sa.Column('user_id', sa.Integer(), nullable=False), - sa.Column('body', sa.Text(), nullable=True), - sa.Column('gh_created_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_updated_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), 
server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['pr_id'], ['pull_requests.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['user_id'], ['contributors.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_pr_review_comments_github_id'), 'pr_review_comments', ['github_id'], unique=True) - op.create_index(op.f('ix_pr_review_comments_id'), 'pr_review_comments', ['id'], unique=False) - op.create_index(op.f('ix_pr_review_comments_pr_id'), 'pr_review_comments', ['pr_id'], unique=False) # Keep op.f if generated - op.create_index('ix_pr_review_comments_review_id', 'pr_review_comments', ['pull_request_review_id'], unique=False) - op.create_index('ix_pr_review_comments_user_id', 'pr_review_comments', ['user_id'], unique=False) + op.create_table( + "issues", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("repository_id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column("number", sa.Integer(), nullable=False), + sa.Column("title", sa.Text(), nullable=True), + sa.Column("state", sa.String(), nullable=False), + sa.Column("gh_created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_closed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["repository_id"], ["repositories.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["user_id"], + ["contributors.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + 
op.create_index(op.f("ix_issues_github_id"), "issues", ["github_id"], unique=True) + op.create_index(op.f("ix_issues_id"), "issues", ["id"], unique=False) + op.create_index("ix_issues_number", "issues", ["number"], unique=False) + op.create_index("ix_issues_repo_id", "issues", ["repository_id"], unique=False) + op.create_index( + "ix_issues_repo_number", "issues", ["repository_id", "number"], unique=False + ) + op.create_index( + op.f("ix_issues_repository_id"), "issues", ["repository_id"], unique=False + ) # Keep op.f if generated + op.create_index("ix_issues_state", "issues", ["state"], unique=False) + op.create_index("ix_issues_user_id", "issues", ["user_id"], unique=False) + op.create_table( + "pull_requests", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("repository_id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column("number", sa.Integer(), nullable=False), + sa.Column("title", sa.Text(), nullable=True), + sa.Column("state", sa.String(), nullable=False), + sa.Column("gh_created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_closed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_merged_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["repository_id"], ["repositories.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["user_id"], + ["contributors.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_pull_requests_github_id"), "pull_requests", ["github_id"], unique=True + ) + op.create_index(op.f("ix_pull_requests_id"), 
"pull_requests", ["id"], unique=False) + op.create_index( + "ix_pull_requests_number", "pull_requests", ["number"], unique=False + ) + op.create_index( + "ix_pull_requests_repo_id", "pull_requests", ["repository_id"], unique=False + ) + op.create_index( + "ix_pull_requests_repo_number", + "pull_requests", + ["repository_id", "number"], + unique=False, + ) + op.create_index( + op.f("ix_pull_requests_repository_id"), + "pull_requests", + ["repository_id"], + unique=False, + ) # Keep op.f if generated + op.create_index("ix_pull_requests_state", "pull_requests", ["state"], unique=False) + op.create_index( + op.f("ix_pull_requests_user_id"), "pull_requests", ["user_id"], unique=False + ) # Keep op.f if generated + op.create_table( + "issue_comments", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("issue_id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column("body", sa.Text(), nullable=True), + sa.Column("gh_created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint(["issue_id"], ["issues.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint( + ["user_id"], + ["contributors.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_issue_comments_github_id"), + "issue_comments", + ["github_id"], + unique=True, + ) + op.create_index( + op.f("ix_issue_comments_id"), "issue_comments", ["id"], unique=False + ) + op.create_index( + "ix_issue_comments_issue_id", "issue_comments", ["issue_id"], unique=False + ) + op.create_index( + "ix_issue_comments_user_id", "issue_comments", ["user_id"], 
unique=False + ) + op.create_table( + "pr_review_comments", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("pull_request_review_id", sa.BigInteger(), nullable=True), + sa.Column("pr_id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column("body", sa.Text(), nullable=True), + sa.Column("gh_created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint(["pr_id"], ["pull_requests.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint( + ["user_id"], + ["contributors.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_pr_review_comments_github_id"), + "pr_review_comments", + ["github_id"], + unique=True, + ) + op.create_index( + op.f("ix_pr_review_comments_id"), "pr_review_comments", ["id"], unique=False + ) + op.create_index( + op.f("ix_pr_review_comments_pr_id"), + "pr_review_comments", + ["pr_id"], + unique=False, + ) # Keep op.f if generated + op.create_index( + "ix_pr_review_comments_review_id", + "pr_review_comments", + ["pull_request_review_id"], + unique=False, + ) + op.create_index( + "ix_pr_review_comments_user_id", "pr_review_comments", ["user_id"], unique=False + ) # --- REMOVED UNRELATED INDEX CHANGES --- # op.drop_index('ix_fields_openalex_id', table_name='fields') @@ -136,33 +251,35 @@ def downgrade() -> None: # op.create_index('ix_fields_openalex_id', 'fields', ['openalex_id'], unique=True) # --- END REMOVAL --- - op.drop_index('ix_pr_review_comments_user_id', table_name='pr_review_comments') - op.drop_index('ix_pr_review_comments_review_id', 
table_name='pr_review_comments') - op.drop_index(op.f('ix_pr_review_comments_pr_id'), table_name='pr_review_comments') - op.drop_index(op.f('ix_pr_review_comments_id'), table_name='pr_review_comments') - op.drop_index(op.f('ix_pr_review_comments_github_id'), table_name='pr_review_comments') - op.drop_table('pr_review_comments') - op.drop_index('ix_issue_comments_user_id', table_name='issue_comments') - op.drop_index('ix_issue_comments_issue_id', table_name='issue_comments') - op.drop_index(op.f('ix_issue_comments_id'), table_name='issue_comments') - op.drop_index(op.f('ix_issue_comments_github_id'), table_name='issue_comments') - op.drop_table('issue_comments') - op.drop_index(op.f('ix_pull_requests_user_id'), table_name='pull_requests') - op.drop_index('ix_pull_requests_state', table_name='pull_requests') - op.drop_index(op.f('ix_pull_requests_repository_id'), table_name='pull_requests') - op.drop_index('ix_pull_requests_repo_number', table_name='pull_requests') - op.drop_index('ix_pull_requests_repo_id', table_name='pull_requests') - op.drop_index('ix_pull_requests_number', table_name='pull_requests') - op.drop_index(op.f('ix_pull_requests_id'), table_name='pull_requests') - op.drop_index(op.f('ix_pull_requests_github_id'), table_name='pull_requests') - op.drop_table('pull_requests') - op.drop_index('ix_issues_user_id', table_name='issues') - op.drop_index('ix_issues_state', table_name='issues') - op.drop_index(op.f('ix_issues_repository_id'), table_name='issues') - op.drop_index('ix_issues_repo_number', table_name='issues') - op.drop_index('ix_issues_repo_id', table_name='issues') - op.drop_index('ix_issues_number', table_name='issues') - op.drop_index(op.f('ix_issues_id'), table_name='issues') - op.drop_index(op.f('ix_issues_github_id'), table_name='issues') - op.drop_table('issues') - # ### end Alembic commands ### \ No newline at end of file + op.drop_index("ix_pr_review_comments_user_id", table_name="pr_review_comments") + 
op.drop_index("ix_pr_review_comments_review_id", table_name="pr_review_comments") + op.drop_index(op.f("ix_pr_review_comments_pr_id"), table_name="pr_review_comments") + op.drop_index(op.f("ix_pr_review_comments_id"), table_name="pr_review_comments") + op.drop_index( + op.f("ix_pr_review_comments_github_id"), table_name="pr_review_comments" + ) + op.drop_table("pr_review_comments") + op.drop_index("ix_issue_comments_user_id", table_name="issue_comments") + op.drop_index("ix_issue_comments_issue_id", table_name="issue_comments") + op.drop_index(op.f("ix_issue_comments_id"), table_name="issue_comments") + op.drop_index(op.f("ix_issue_comments_github_id"), table_name="issue_comments") + op.drop_table("issue_comments") + op.drop_index(op.f("ix_pull_requests_user_id"), table_name="pull_requests") + op.drop_index("ix_pull_requests_state", table_name="pull_requests") + op.drop_index(op.f("ix_pull_requests_repository_id"), table_name="pull_requests") + op.drop_index("ix_pull_requests_repo_number", table_name="pull_requests") + op.drop_index("ix_pull_requests_repo_id", table_name="pull_requests") + op.drop_index("ix_pull_requests_number", table_name="pull_requests") + op.drop_index(op.f("ix_pull_requests_id"), table_name="pull_requests") + op.drop_index(op.f("ix_pull_requests_github_id"), table_name="pull_requests") + op.drop_table("pull_requests") + op.drop_index("ix_issues_user_id", table_name="issues") + op.drop_index("ix_issues_state", table_name="issues") + op.drop_index(op.f("ix_issues_repository_id"), table_name="issues") + op.drop_index("ix_issues_repo_number", table_name="issues") + op.drop_index("ix_issues_repo_id", table_name="issues") + op.drop_index("ix_issues_number", table_name="issues") + op.drop_index(op.f("ix_issues_id"), table_name="issues") + op.drop_index(op.f("ix_issues_github_id"), table_name="issues") + op.drop_table("issues") + # ### end Alembic commands ### diff --git 
a/backend/data/migrations/versions/a7e01fc1d2e8_phase_3_add_scholarly_entity_models_.py b/backend/data/migrations/versions/a7e01fc1d2e8_phase_3_add_scholarly_entity_models_.py index e2931de..c75d4d0 100644 --- a/backend/data/migrations/versions/a7e01fc1d2e8_phase_3_add_scholarly_entity_models_.py +++ b/backend/data/migrations/versions/a7e01fc1d2e8_phase_3_add_scholarly_entity_models_.py @@ -5,6 +5,7 @@ Create Date: 2025-04-07 11:19:41.305053 """ + from typing import Sequence, Union from alembic import op @@ -12,8 +13,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision: str = 'a7e01fc1d2e8' -down_revision: Union[str, None] = '1cc8bbb9702b' +revision: str = "a7e01fc1d2e8" +down_revision: Union[str, None] = "1cc8bbb9702b" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,69 +22,152 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - START ### - op.create_table('institutions', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('ror', sa.String(), nullable=True), - sa.Column('display_name', sa.String(), nullable=False), - sa.Column('country_code', sa.String(length=2), nullable=True), - sa.Column('type', sa.String(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') + op.create_table( + "institutions", + sa.Column("openalex_id", sa.String(), nullable=False), + sa.Column("ror", sa.String(), nullable=True), + sa.Column("display_name", sa.String(), nullable=False), + sa.Column("country_code", sa.String(length=2), nullable=True), + sa.Column("type", sa.String(), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, 
nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_institutions_display_name"), + "institutions", + ["display_name"], + unique=False, + ) + op.create_index(op.f("ix_institutions_id"), "institutions", ["id"], unique=False) + op.create_index( + op.f("ix_institutions_openalex_id"), + "institutions", + ["openalex_id"], + unique=True, ) - op.create_index(op.f('ix_institutions_display_name'), 'institutions', ['display_name'], unique=False) - op.create_index(op.f('ix_institutions_id'), 'institutions', ['id'], unique=False) - op.create_index(op.f('ix_institutions_openalex_id'), 'institutions', ['openalex_id'], unique=True) - op.create_index(op.f('ix_institutions_ror'), 'institutions', ['ror'], unique=True) - op.create_index('ix_institutions_type', 'institutions', ['type'], unique=False) # Keep non-unique as per model - op.create_table('persons', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('orcid', sa.String(), nullable=True), - sa.Column('display_name', sa.String(), nullable=False), - sa.Column('display_name_alternatives', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') + op.create_index(op.f("ix_institutions_ror"), "institutions", ["ror"], unique=True) + op.create_index( + "ix_institutions_type", "institutions", ["type"], unique=False + ) # Keep non-unique as per model + op.create_table( + "persons", + sa.Column("openalex_id", sa.String(), nullable=False), + 
sa.Column("orcid", sa.String(), nullable=True), + sa.Column("display_name", sa.String(), nullable=False), + sa.Column( + "display_name_alternatives", + postgresql.JSONB(astext_type=sa.Text()), + nullable=True, + ), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), ) # --- CORRECTED INDEXES for persons --- - op.create_index(op.f('ix_persons_display_name'), 'persons', ['display_name'], unique=False) # Keep non-unique as per model - op.create_index(op.f('ix_persons_id'), 'persons', ['id'], unique=False) - op.create_index('ix_persons_openalex_id', 'persons', ['openalex_id'], unique=True) # Changed to unique=True - op.create_index('ix_persons_orcid', 'persons', ['orcid'], unique=True) # Changed to unique=True + op.create_index( + op.f("ix_persons_display_name"), "persons", ["display_name"], unique=False + ) # Keep non-unique as per model + op.create_index(op.f("ix_persons_id"), "persons", ["id"], unique=False) + op.create_index( + "ix_persons_openalex_id", "persons", ["openalex_id"], unique=True + ) # Changed to unique=True + op.create_index( + "ix_persons_orcid", "persons", ["orcid"], unique=True + ) # Changed to unique=True # --- END CORRECTIONS --- - op.create_table('authorships', - sa.Column('work_id', sa.Integer(), nullable=False), - sa.Column('person_id', sa.Integer(), nullable=False), - sa.Column('author_position', sa.String(), nullable=True), - sa.Column('is_corresponding', sa.Boolean(), nullable=True), - sa.ForeignKeyConstraint(['person_id'], ['persons.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['work_id'], ['works.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('work_id', 'person_id') + op.create_table( + "authorships", + sa.Column("work_id", sa.Integer(), 
nullable=False), + sa.Column("person_id", sa.Integer(), nullable=False), + sa.Column("author_position", sa.String(), nullable=True), + sa.Column("is_corresponding", sa.Boolean(), nullable=True), + sa.ForeignKeyConstraint(["person_id"], ["persons.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["work_id"], ["works.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("work_id", "person_id"), + ) + op.create_index( + "ix_authorships_person_id", "authorships", ["person_id"], unique=False + ) + op.create_index("ix_authorships_work_id", "authorships", ["work_id"], unique=False) + op.create_table( + "work_citations", + sa.Column("citing_work_id", sa.Integer(), nullable=False), + sa.Column("cited_work_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint(["cited_work_id"], ["works.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["citing_work_id"], ["works.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("citing_work_id", "cited_work_id"), ) - op.create_index('ix_authorships_person_id', 'authorships', ['person_id'], unique=False) - op.create_index('ix_authorships_work_id', 'authorships', ['work_id'], unique=False) - op.create_table('work_citations', - sa.Column('citing_work_id', sa.Integer(), nullable=False), - sa.Column('cited_work_id', sa.Integer(), nullable=False), - sa.ForeignKeyConstraint(['cited_work_id'], ['works.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['citing_work_id'], ['works.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('citing_work_id', 'cited_work_id') + op.create_index( + "ix_work_citations_cited_work_id", + "work_citations", + ["cited_work_id"], + unique=False, ) - op.create_index('ix_work_citations_cited_work_id', 'work_citations', ['cited_work_id'], unique=False) - op.create_index('ix_work_citations_citing_work_id', 'work_citations', ['citing_work_id'], unique=False) - op.create_table('affiliations', - sa.Column('authorship_work_id', sa.Integer(), nullable=False), - sa.Column('authorship_person_id', sa.Integer(), 
nullable=False), - sa.Column('institution_id', sa.Integer(), nullable=False), - sa.ForeignKeyConstraint(['authorship_work_id', 'authorship_person_id'], ['authorships.work_id', 'authorships.person_id'], name='fk_affiliation_authorship', ondelete='CASCADE'), - sa.ForeignKeyConstraint(['institution_id'], ['institutions.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('authorship_work_id', 'authorship_person_id', 'institution_id') + op.create_index( + "ix_work_citations_citing_work_id", + "work_citations", + ["citing_work_id"], + unique=False, + ) + op.create_table( + "affiliations", + sa.Column("authorship_work_id", sa.Integer(), nullable=False), + sa.Column("authorship_person_id", sa.Integer(), nullable=False), + sa.Column("institution_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint( + ["authorship_work_id", "authorship_person_id"], + ["authorships.work_id", "authorships.person_id"], + name="fk_affiliation_authorship", + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["institution_id"], ["institutions.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint( + "authorship_work_id", "authorship_person_id", "institution_id" + ), + ) + op.create_index( + "ix_affiliations_authorship_person_id", + "affiliations", + ["authorship_person_id"], + unique=False, + ) + op.create_index( + "ix_affiliations_authorship_work_id", + "affiliations", + ["authorship_work_id"], + unique=False, + ) + op.create_index( + "ix_affiliations_institution_id", + "affiliations", + ["institution_id"], + unique=False, ) - op.create_index('ix_affiliations_authorship_person_id', 'affiliations', ['authorship_person_id'], unique=False) - op.create_index('ix_affiliations_authorship_work_id', 'affiliations', ['authorship_work_id'], unique=False) - op.create_index('ix_affiliations_institution_id', 'affiliations', ['institution_id'], unique=False) # --- REMOVED REDUNDANT CONSTRAINT --- # op.create_unique_constraint('uq_repo_contrib', 'repository_contributors', ['repository_id', 
'contributor_id']) # --- END REMOVAL --- @@ -96,27 +180,27 @@ def downgrade() -> None: # --- REMOVED REDUNDANT CONSTRAINT DROP --- # op.drop_constraint('uq_repo_contrib', 'repository_contributors', type_='unique') # --- END REMOVAL --- - op.drop_index('ix_affiliations_institution_id', table_name='affiliations') - op.drop_index('ix_affiliations_authorship_work_id', table_name='affiliations') - op.drop_index('ix_affiliations_authorship_person_id', table_name='affiliations') - op.drop_table('affiliations') - op.drop_index('ix_work_citations_citing_work_id', table_name='work_citations') - op.drop_index('ix_work_citations_cited_work_id', table_name='work_citations') - op.drop_table('work_citations') - op.drop_index('ix_authorships_work_id', table_name='authorships') - op.drop_index('ix_authorships_person_id', table_name='authorships') - op.drop_table('authorships') + op.drop_index("ix_affiliations_institution_id", table_name="affiliations") + op.drop_index("ix_affiliations_authorship_work_id", table_name="affiliations") + op.drop_index("ix_affiliations_authorship_person_id", table_name="affiliations") + op.drop_table("affiliations") + op.drop_index("ix_work_citations_citing_work_id", table_name="work_citations") + op.drop_index("ix_work_citations_cited_work_id", table_name="work_citations") + op.drop_table("work_citations") + op.drop_index("ix_authorships_work_id", table_name="authorships") + op.drop_index("ix_authorships_person_id", table_name="authorships") + op.drop_table("authorships") # --- CORRECTED INDEX DROPS for persons --- - op.drop_index('ix_persons_orcid', table_name='persons') # Was unique=True - op.drop_index('ix_persons_openalex_id', table_name='persons') # Was unique=True - op.drop_index(op.f('ix_persons_id'), table_name='persons') - op.drop_index(op.f('ix_persons_display_name'), table_name='persons') + op.drop_index("ix_persons_orcid", table_name="persons") # Was unique=True + op.drop_index("ix_persons_openalex_id", table_name="persons") # Was 
unique=True + op.drop_index(op.f("ix_persons_id"), table_name="persons") + op.drop_index(op.f("ix_persons_display_name"), table_name="persons") # --- END CORRECTIONS --- - op.drop_table('persons') - op.drop_index('ix_institutions_type', table_name='institutions') - op.drop_index(op.f('ix_institutions_ror'), table_name='institutions') - op.drop_index(op.f('ix_institutions_openalex_id'), table_name='institutions') - op.drop_index(op.f('ix_institutions_id'), table_name='institutions') - op.drop_index(op.f('ix_institutions_display_name'), table_name='institutions') - op.drop_table('institutions') - # ### end Alembic commands ### \ No newline at end of file + op.drop_table("persons") + op.drop_index("ix_institutions_type", table_name="institutions") + op.drop_index(op.f("ix_institutions_ror"), table_name="institutions") + op.drop_index(op.f("ix_institutions_openalex_id"), table_name="institutions") + op.drop_index(op.f("ix_institutions_id"), table_name="institutions") + op.drop_index(op.f("ix_institutions_display_name"), table_name="institutions") + op.drop_table("institutions") + # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/ac00d539ca94_phase_1_initial_core_schema_with_.py b/backend/data/migrations/versions/ac00d539ca94_phase_1_initial_core_schema_with_.py index f515b45..a5d348d 100644 --- a/backend/data/migrations/versions/ac00d539ca94_phase_1_initial_core_schema_with_.py +++ b/backend/data/migrations/versions/ac00d539ca94_phase_1_initial_core_schema_with_.py @@ -1,10 +1,11 @@ """Phase 1: Initial core schema with hierarchical discovery Revision ID: ac00d539ca94 -Revises: +Revises: Create Date: 2025-04-05 09:31:05.488592 """ + from typing import Sequence, Union from alembic import op @@ -12,7 +13,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. 
-revision: str = 'ac00d539ca94' +revision: str = "ac00d539ca94" down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,150 +22,328 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! ### - op.create_table('contributors', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('login', sa.String(), nullable=False), - sa.Column('type', sa.String(), nullable=False), - sa.Column('avatar_url', sa.String(), nullable=True), - sa.Column('html_url', sa.String(), nullable=True), - sa.Column('api_url', sa.String(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_contributors_github_id'), 'contributors', ['github_id'], unique=True) - op.create_index(op.f('ix_contributors_id'), 'contributors', ['id'], unique=False) - op.create_index(op.f('ix_contributors_login'), 'contributors', ['login'], unique=True) - op.create_index(op.f('ix_contributors_type'), 'contributors', ['type'], unique=False) - op.create_table('discovery_chains', - sa.Column('id', sa.UUID(), nullable=False), - sa.Column('parent_chain_id', sa.UUID(), nullable=True), - sa.Column('root_chain_id', sa.UUID(), nullable=False), - sa.Column('level', sa.Integer(), nullable=False), - sa.Column('discovery_type', sa.String(), nullable=False), - sa.Column('parameters', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('status', sa.String(), nullable=False), - sa.Column('started_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True), - 
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['parent_chain_id'], ['discovery_chains.id'], ), - sa.ForeignKeyConstraint(['root_chain_id'], ['discovery_chains.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_discovery_chains_root_chain_id'), 'discovery_chains', ['root_chain_id'], unique=False) - op.create_index('ix_discovery_chains_root_id', 'discovery_chains', ['root_chain_id'], unique=False) - op.create_index(op.f('ix_discovery_chains_status'), 'discovery_chains', ['status'], unique=False) - op.create_table('owners', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('login', sa.String(), nullable=False), - sa.Column('type', sa.String(), nullable=False), - sa.Column('avatar_url', sa.String(), nullable=True), - sa.Column('html_url', sa.String(), nullable=True), - sa.Column('api_url', sa.String(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_owners_github_id'), 'owners', ['github_id'], unique=True) - op.create_index(op.f('ix_owners_id'), 'owners', ['id'], unique=False) - op.create_index(op.f('ix_owners_login'), 'owners', ['login'], unique=True) - op.create_index(op.f('ix_owners_type'), 'owners', ['type'], unique=False) - op.create_table('works', - sa.Column('openalex_id', sa.String(), nullable=False), - sa.Column('doi', sa.String(), nullable=False), - sa.Column('title', sa.Text(), nullable=True), - sa.Column('publication_year', sa.Integer(), nullable=True), - sa.Column('type', sa.String(), nullable=True), - sa.Column('cited_by_count', 
sa.Integer(), nullable=True), - sa.Column('host_venue_display_name', sa.String(), nullable=True), - sa.Column('openalex_url', sa.String(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_works_doi'), 'works', ['doi'], unique=True) - op.create_index(op.f('ix_works_id'), 'works', ['id'], unique=False) - op.create_index(op.f('ix_works_openalex_id'), 'works', ['openalex_id'], unique=True) - op.create_index(op.f('ix_works_publication_year'), 'works', ['publication_year'], unique=False) - op.create_index('ix_works_type', 'works', ['type'], unique=False) - op.create_table('entity_discovery_associations', - sa.Column('discovery_chain_id', sa.UUID(), nullable=False), - sa.Column('entity_type', sa.String(), nullable=False), - sa.Column('entity_id', sa.Integer(), nullable=False), - sa.Column('is_direct_discovery', sa.Boolean(), nullable=False), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['discovery_chain_id'], ['discovery_chains.id'], ), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('discovery_chain_id', 'entity_type', 'entity_id', name='uq_discovery_entity') - ) - op.create_index(op.f('ix_entity_discovery_associations_discovery_chain_id'), 'entity_discovery_associations', ['discovery_chain_id'], unique=False) - op.create_index(op.f('ix_entity_discovery_associations_entity_id'), 'entity_discovery_associations', ['entity_id'], unique=False) - op.create_index(op.f('ix_entity_discovery_associations_entity_type'), 
'entity_discovery_associations', ['entity_type'], unique=False) - op.create_index(op.f('ix_entity_discovery_associations_id'), 'entity_discovery_associations', ['id'], unique=False) - op.create_index('ix_entity_discovery_chain_id', 'entity_discovery_associations', ['discovery_chain_id'], unique=False) - op.create_index('ix_entity_discovery_entity', 'entity_discovery_associations', ['entity_type', 'entity_id'], unique=False) - op.create_table('repositories', - sa.Column('github_id', sa.BigInteger(), nullable=False), - sa.Column('name', sa.String(), nullable=False), - sa.Column('full_name', sa.String(), nullable=False), - sa.Column('description', sa.Text(), nullable=True), - sa.Column('homepage', sa.String(), nullable=True), - sa.Column('html_url', sa.String(), nullable=False), - sa.Column('api_url', sa.String(), nullable=False), - sa.Column('language', sa.String(), nullable=True), - sa.Column('default_branch', sa.String(), nullable=True), - sa.Column('stargazers_count', sa.Integer(), nullable=False), - sa.Column('watchers_count', sa.Integer(), nullable=False), - sa.Column('forks_count', sa.Integer(), nullable=False), - sa.Column('open_issues_count', sa.Integer(), nullable=False), - sa.Column('is_fork', sa.Boolean(), nullable=False), - sa.Column('gh_created_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_updated_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('gh_pushed_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('owner_id', sa.Integer(), nullable=False), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['owner_id'], ['owners.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_repositories_full_name'), 'repositories', ['full_name'], unique=True) - 
op.create_index(op.f('ix_repositories_github_id'), 'repositories', ['github_id'], unique=True) - op.create_index(op.f('ix_repositories_id'), 'repositories', ['id'], unique=False) - op.create_index(op.f('ix_repositories_language'), 'repositories', ['language'], unique=False) - op.create_index(op.f('ix_repositories_owner_id'), 'repositories', ['owner_id'], unique=False) - op.create_table('doi_references', - sa.Column('doi', sa.String(), nullable=False), - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('work_id', sa.Integer(), nullable=True), - sa.Column('source_file', sa.String(), nullable=True), - sa.Column('context', sa.Text(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ), - sa.ForeignKeyConstraint(['work_id'], ['works.id'], ), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('repository_id', 'doi', 'source_file', name='uq_repo_doi_source') - ) - op.create_index('ix_doi_references_doi', 'doi_references', ['doi'], unique=False) - op.create_index(op.f('ix_doi_references_id'), 'doi_references', ['id'], unique=False) - op.create_index(op.f('ix_doi_references_repository_id'), 'doi_references', ['repository_id'], unique=False) - op.create_index(op.f('ix_doi_references_work_id'), 'doi_references', ['work_id'], unique=False) - op.create_table('repository_contributors', - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('contributor_id', sa.Integer(), nullable=False), - sa.Column('contributions_count', sa.Integer(), nullable=True), - sa.ForeignKeyConstraint(['contributor_id'], ['contributors.id'], ), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ), - sa.PrimaryKeyConstraint('repository_id', 
'contributor_id'), - sa.UniqueConstraint('repository_id', 'contributor_id', name='uq_repo_contrib') + op.create_table( + "contributors", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("login", sa.String(), nullable=False), + sa.Column("type", sa.String(), nullable=False), + sa.Column("avatar_url", sa.String(), nullable=True), + sa.Column("html_url", sa.String(), nullable=True), + sa.Column("api_url", sa.String(), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_contributors_github_id"), "contributors", ["github_id"], unique=True + ) + op.create_index(op.f("ix_contributors_id"), "contributors", ["id"], unique=False) + op.create_index( + op.f("ix_contributors_login"), "contributors", ["login"], unique=True + ) + op.create_index( + op.f("ix_contributors_type"), "contributors", ["type"], unique=False + ) + op.create_table( + "discovery_chains", + sa.Column("id", sa.UUID(), nullable=False), + sa.Column("parent_chain_id", sa.UUID(), nullable=True), + sa.Column("root_chain_id", sa.UUID(), nullable=False), + sa.Column("level", sa.Integer(), nullable=False), + sa.Column("discovery_type", sa.String(), nullable=False), + sa.Column("parameters", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("status", sa.String(), nullable=False), + sa.Column( + "started_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + 
server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["parent_chain_id"], + ["discovery_chains.id"], + ), + sa.ForeignKeyConstraint( + ["root_chain_id"], + ["discovery_chains.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_discovery_chains_root_chain_id"), + "discovery_chains", + ["root_chain_id"], + unique=False, + ) + op.create_index( + "ix_discovery_chains_root_id", + "discovery_chains", + ["root_chain_id"], + unique=False, + ) + op.create_index( + op.f("ix_discovery_chains_status"), "discovery_chains", ["status"], unique=False + ) + op.create_table( + "owners", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("login", sa.String(), nullable=False), + sa.Column("type", sa.String(), nullable=False), + sa.Column("avatar_url", sa.String(), nullable=True), + sa.Column("html_url", sa.String(), nullable=True), + sa.Column("api_url", sa.String(), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index(op.f("ix_owners_github_id"), "owners", ["github_id"], unique=True) + op.create_index(op.f("ix_owners_id"), "owners", ["id"], unique=False) + op.create_index(op.f("ix_owners_login"), "owners", ["login"], unique=True) + op.create_index(op.f("ix_owners_type"), "owners", ["type"], unique=False) + op.create_table( + "works", + sa.Column("openalex_id", sa.String(), nullable=False), + sa.Column("doi", sa.String(), nullable=False), + sa.Column("title", sa.Text(), nullable=True), + sa.Column("publication_year", sa.Integer(), nullable=True), + sa.Column("type", sa.String(), nullable=True), + sa.Column("cited_by_count", sa.Integer(), nullable=True), + sa.Column("host_venue_display_name", 
sa.String(), nullable=True), + sa.Column("openalex_url", sa.String(), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index(op.f("ix_works_doi"), "works", ["doi"], unique=True) + op.create_index(op.f("ix_works_id"), "works", ["id"], unique=False) + op.create_index(op.f("ix_works_openalex_id"), "works", ["openalex_id"], unique=True) + op.create_index( + op.f("ix_works_publication_year"), "works", ["publication_year"], unique=False + ) + op.create_index("ix_works_type", "works", ["type"], unique=False) + op.create_table( + "entity_discovery_associations", + sa.Column("discovery_chain_id", sa.UUID(), nullable=False), + sa.Column("entity_type", sa.String(), nullable=False), + sa.Column("entity_id", sa.Integer(), nullable=False), + sa.Column("is_direct_discovery", sa.Boolean(), nullable=False), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["discovery_chain_id"], + ["discovery_chains.id"], + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint( + "discovery_chain_id", "entity_type", "entity_id", name="uq_discovery_entity" + ), + ) + op.create_index( + op.f("ix_entity_discovery_associations_discovery_chain_id"), + "entity_discovery_associations", + ["discovery_chain_id"], + unique=False, + ) + op.create_index( + op.f("ix_entity_discovery_associations_entity_id"), + "entity_discovery_associations", + ["entity_id"], + unique=False, + ) + op.create_index( + 
op.f("ix_entity_discovery_associations_entity_type"), + "entity_discovery_associations", + ["entity_type"], + unique=False, + ) + op.create_index( + op.f("ix_entity_discovery_associations_id"), + "entity_discovery_associations", + ["id"], + unique=False, + ) + op.create_index( + "ix_entity_discovery_chain_id", + "entity_discovery_associations", + ["discovery_chain_id"], + unique=False, + ) + op.create_index( + "ix_entity_discovery_entity", + "entity_discovery_associations", + ["entity_type", "entity_id"], + unique=False, + ) + op.create_table( + "repositories", + sa.Column("github_id", sa.BigInteger(), nullable=False), + sa.Column("name", sa.String(), nullable=False), + sa.Column("full_name", sa.String(), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("homepage", sa.String(), nullable=True), + sa.Column("html_url", sa.String(), nullable=False), + sa.Column("api_url", sa.String(), nullable=False), + sa.Column("language", sa.String(), nullable=True), + sa.Column("default_branch", sa.String(), nullable=True), + sa.Column("stargazers_count", sa.Integer(), nullable=False), + sa.Column("watchers_count", sa.Integer(), nullable=False), + sa.Column("forks_count", sa.Integer(), nullable=False), + sa.Column("open_issues_count", sa.Integer(), nullable=False), + sa.Column("is_fork", sa.Boolean(), nullable=False), + sa.Column("gh_created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("gh_pushed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("owner_id", sa.Integer(), nullable=False), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["owner_id"], + ["owners.id"], + ), 
+ sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_repositories_full_name"), "repositories", ["full_name"], unique=True + ) + op.create_index( + op.f("ix_repositories_github_id"), "repositories", ["github_id"], unique=True + ) + op.create_index(op.f("ix_repositories_id"), "repositories", ["id"], unique=False) + op.create_index( + op.f("ix_repositories_language"), "repositories", ["language"], unique=False + ) + op.create_index( + op.f("ix_repositories_owner_id"), "repositories", ["owner_id"], unique=False + ) + op.create_table( + "doi_references", + sa.Column("doi", sa.String(), nullable=False), + sa.Column("repository_id", sa.Integer(), nullable=False), + sa.Column("work_id", sa.Integer(), nullable=True), + sa.Column("source_file", sa.String(), nullable=True), + sa.Column("context", sa.Text(), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["repository_id"], + ["repositories.id"], + ), + sa.ForeignKeyConstraint( + ["work_id"], + ["works.id"], + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint( + "repository_id", "doi", "source_file", name="uq_repo_doi_source" + ), + ) + op.create_index("ix_doi_references_doi", "doi_references", ["doi"], unique=False) + op.create_index( + op.f("ix_doi_references_id"), "doi_references", ["id"], unique=False + ) + op.create_index( + op.f("ix_doi_references_repository_id"), + "doi_references", + ["repository_id"], + unique=False, + ) + op.create_index( + op.f("ix_doi_references_work_id"), "doi_references", ["work_id"], unique=False + ) + op.create_table( + "repository_contributors", + sa.Column("repository_id", sa.Integer(), nullable=False), + sa.Column("contributor_id", sa.Integer(), nullable=False), + 
sa.Column("contributions_count", sa.Integer(), nullable=True), + sa.ForeignKeyConstraint( + ["contributor_id"], + ["contributors.id"], + ), + sa.ForeignKeyConstraint( + ["repository_id"], + ["repositories.id"], + ), + sa.PrimaryKeyConstraint("repository_id", "contributor_id"), + sa.UniqueConstraint("repository_id", "contributor_id", name="uq_repo_contrib"), ) # ### end Alembic commands ### @@ -172,43 +351,61 @@ def upgrade() -> None: def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - please adjust! ### - op.drop_table('repository_contributors') - op.drop_index(op.f('ix_doi_references_work_id'), table_name='doi_references') - op.drop_index(op.f('ix_doi_references_repository_id'), table_name='doi_references') - op.drop_index(op.f('ix_doi_references_id'), table_name='doi_references') - op.drop_index('ix_doi_references_doi', table_name='doi_references') - op.drop_table('doi_references') - op.drop_index(op.f('ix_repositories_owner_id'), table_name='repositories') - op.drop_index(op.f('ix_repositories_language'), table_name='repositories') - op.drop_index(op.f('ix_repositories_id'), table_name='repositories') - op.drop_index(op.f('ix_repositories_github_id'), table_name='repositories') - op.drop_index(op.f('ix_repositories_full_name'), table_name='repositories') - op.drop_table('repositories') - op.drop_index('ix_entity_discovery_entity', table_name='entity_discovery_associations') - op.drop_index('ix_entity_discovery_chain_id', table_name='entity_discovery_associations') - op.drop_index(op.f('ix_entity_discovery_associations_id'), table_name='entity_discovery_associations') - op.drop_index(op.f('ix_entity_discovery_associations_entity_type'), table_name='entity_discovery_associations') - op.drop_index(op.f('ix_entity_discovery_associations_entity_id'), table_name='entity_discovery_associations') - op.drop_index(op.f('ix_entity_discovery_associations_discovery_chain_id'), table_name='entity_discovery_associations') - 
op.drop_table('entity_discovery_associations') - op.drop_index('ix_works_type', table_name='works') - op.drop_index(op.f('ix_works_publication_year'), table_name='works') - op.drop_index(op.f('ix_works_openalex_id'), table_name='works') - op.drop_index(op.f('ix_works_id'), table_name='works') - op.drop_index(op.f('ix_works_doi'), table_name='works') - op.drop_table('works') - op.drop_index(op.f('ix_owners_type'), table_name='owners') - op.drop_index(op.f('ix_owners_login'), table_name='owners') - op.drop_index(op.f('ix_owners_id'), table_name='owners') - op.drop_index(op.f('ix_owners_github_id'), table_name='owners') - op.drop_table('owners') - op.drop_index(op.f('ix_discovery_chains_status'), table_name='discovery_chains') - op.drop_index('ix_discovery_chains_root_id', table_name='discovery_chains') - op.drop_index(op.f('ix_discovery_chains_root_chain_id'), table_name='discovery_chains') - op.drop_table('discovery_chains') - op.drop_index(op.f('ix_contributors_type'), table_name='contributors') - op.drop_index(op.f('ix_contributors_login'), table_name='contributors') - op.drop_index(op.f('ix_contributors_id'), table_name='contributors') - op.drop_index(op.f('ix_contributors_github_id'), table_name='contributors') - op.drop_table('contributors') + op.drop_table("repository_contributors") + op.drop_index(op.f("ix_doi_references_work_id"), table_name="doi_references") + op.drop_index(op.f("ix_doi_references_repository_id"), table_name="doi_references") + op.drop_index(op.f("ix_doi_references_id"), table_name="doi_references") + op.drop_index("ix_doi_references_doi", table_name="doi_references") + op.drop_table("doi_references") + op.drop_index(op.f("ix_repositories_owner_id"), table_name="repositories") + op.drop_index(op.f("ix_repositories_language"), table_name="repositories") + op.drop_index(op.f("ix_repositories_id"), table_name="repositories") + op.drop_index(op.f("ix_repositories_github_id"), table_name="repositories") + 
op.drop_index(op.f("ix_repositories_full_name"), table_name="repositories") + op.drop_table("repositories") + op.drop_index( + "ix_entity_discovery_entity", table_name="entity_discovery_associations" + ) + op.drop_index( + "ix_entity_discovery_chain_id", table_name="entity_discovery_associations" + ) + op.drop_index( + op.f("ix_entity_discovery_associations_id"), + table_name="entity_discovery_associations", + ) + op.drop_index( + op.f("ix_entity_discovery_associations_entity_type"), + table_name="entity_discovery_associations", + ) + op.drop_index( + op.f("ix_entity_discovery_associations_entity_id"), + table_name="entity_discovery_associations", + ) + op.drop_index( + op.f("ix_entity_discovery_associations_discovery_chain_id"), + table_name="entity_discovery_associations", + ) + op.drop_table("entity_discovery_associations") + op.drop_index("ix_works_type", table_name="works") + op.drop_index(op.f("ix_works_publication_year"), table_name="works") + op.drop_index(op.f("ix_works_openalex_id"), table_name="works") + op.drop_index(op.f("ix_works_id"), table_name="works") + op.drop_index(op.f("ix_works_doi"), table_name="works") + op.drop_table("works") + op.drop_index(op.f("ix_owners_type"), table_name="owners") + op.drop_index(op.f("ix_owners_login"), table_name="owners") + op.drop_index(op.f("ix_owners_id"), table_name="owners") + op.drop_index(op.f("ix_owners_github_id"), table_name="owners") + op.drop_table("owners") + op.drop_index(op.f("ix_discovery_chains_status"), table_name="discovery_chains") + op.drop_index("ix_discovery_chains_root_id", table_name="discovery_chains") + op.drop_index( + op.f("ix_discovery_chains_root_chain_id"), table_name="discovery_chains" + ) + op.drop_table("discovery_chains") + op.drop_index(op.f("ix_contributors_type"), table_name="contributors") + op.drop_index(op.f("ix_contributors_login"), table_name="contributors") + op.drop_index(op.f("ix_contributors_id"), table_name="contributors") + 
op.drop_index(op.f("ix_contributors_github_id"), table_name="contributors") + op.drop_table("contributors") # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/c9b46f9c64e5_phase_6_add_topics_and_license_to_.py b/backend/data/migrations/versions/c9b46f9c64e5_phase_6_add_topics_and_license_to_.py index 391c107..5195985 100644 --- a/backend/data/migrations/versions/c9b46f9c64e5_phase_6_add_topics_and_license_to_.py +++ b/backend/data/migrations/versions/c9b46f9c64e5_phase_6_add_topics_and_license_to_.py @@ -5,6 +5,7 @@ Create Date: 2025-04-08 17:36:44.473362 """ + from typing import Sequence, Union from alembic import op @@ -12,8 +13,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision: str = 'c9b46f9c64e5' -down_revision: Union[str, None] = '3ab81a4cf052' +revision: str = "c9b46f9c64e5" +down_revision: Union[str, None] = "3ab81a4cf052" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -27,9 +28,15 @@ def upgrade() -> None: # --- END REMOVAL --- # Correct: Adds topics column - op.add_column('repositories', sa.Column('topics', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + op.add_column( + "repositories", + sa.Column("topics", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + ) # Correct: Adds license column - op.add_column('repositories', sa.Column('license', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + op.add_column( + "repositories", + sa.Column("license", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + ) # ### end Alembic commands ### @@ -37,12 +44,12 @@ def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - START ### # Correct: Drops license column - op.drop_column('repositories', 'license') + op.drop_column("repositories", "license") # Correct: Drops topics column - op.drop_column('repositories', 'topics') + op.drop_column("repositories", "topics") # --- 
REMOVED ORCID INDEX CHANGES --- # op.drop_index('ix_persons_orcid', table_name='persons') # op.create_index('ix_persons_orcid', 'persons', ['orcid'], unique=True) # --- END REMOVAL --- - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/d19968da140c_phase_19_add_concept_and_workconcept_.py b/backend/data/migrations/versions/d19968da140c_phase_19_add_concept_and_workconcept_.py index 8d3693a..ea5e462 100644 --- a/backend/data/migrations/versions/d19968da140c_phase_19_add_concept_and_workconcept_.py +++ b/backend/data/migrations/versions/d19968da140c_phase_19_add_concept_and_workconcept_.py @@ -5,15 +5,15 @@ Create Date: 2025-04-16 10:43:42.466277 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa # revision identifiers, used by Alembic. -revision: str = 'd19968da140c' -down_revision: Union[str, None] = '4c5ec8e48a9c' +revision: str = "d19968da140c" +down_revision: Union[str, None] = "4c5ec8e48a9c" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,18 +21,18 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! ### - op.drop_index('ix_domains_openalex_id', table_name='domains') - op.create_index('ix_domains_openalex_id', 'domains', ['openalex_id'], unique=False) - op.drop_index('ix_institutions_ror', table_name='institutions') - op.create_index('ix_institutions_ror', 'institutions', ['ror'], unique=False) + op.drop_index("ix_domains_openalex_id", table_name="domains") + op.create_index("ix_domains_openalex_id", "domains", ["openalex_id"], unique=False) + op.drop_index("ix_institutions_ror", table_name="institutions") + op.create_index("ix_institutions_ror", "institutions", ["ror"], unique=False) # ### end Alembic commands ### def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index('ix_institutions_ror', table_name='institutions') - op.create_index('ix_institutions_ror', 'institutions', ['ror'], unique=True) - op.drop_index('ix_domains_openalex_id', table_name='domains') - op.create_index('ix_domains_openalex_id', 'domains', ['openalex_id'], unique=True) + op.drop_index("ix_institutions_ror", table_name="institutions") + op.create_index("ix_institutions_ror", "institutions", ["ror"], unique=True) + op.drop_index("ix_domains_openalex_id", table_name="domains") + op.create_index("ix_domains_openalex_id", "domains", ["openalex_id"], unique=True) # ### end Alembic commands ### diff --git a/backend/data/migrations/versions/dd1449ba853a_phase_18_add_software_dependency_table.py b/backend/data/migrations/versions/dd1449ba853a_phase_18_add_software_dependency_table.py index 3d9c6fa..610d064 100644 --- a/backend/data/migrations/versions/dd1449ba853a_phase_18_add_software_dependency_table.py +++ b/backend/data/migrations/versions/dd1449ba853a_phase_18_add_software_dependency_table.py @@ -5,6 +5,7 @@ Create Date: 2025-04-12 21:52:53.470471 """ + from typing import Sequence, Union from alembic import op @@ -12,8 +13,8 @@ # revision identifiers, used by Alembic. -revision: str = 'dd1449ba853a' -down_revision: Union[str, None] = 'ed4cc55634bf' +revision: str = "dd1449ba853a" +down_revision: Union[str, None] = "ed4cc55634bf" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,42 +22,116 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('software_dependencies', - sa.Column('repository_id', sa.Integer(), nullable=False), - sa.Column('dependency_name', sa.String(), nullable=False), - sa.Column('version_constraint', sa.String(), nullable=True), - sa.Column('source_file', sa.String(), nullable=False), - sa.Column('dependency_type', sa.String(), nullable=False), - sa.Column('is_dev_dependency', sa.Boolean(), nullable=True), - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.ForeignKeyConstraint(['repository_id'], ['repositories.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_software_dependencies_dependency_name'), 'software_dependencies', ['dependency_name'], unique=False) - op.create_index(op.f('ix_software_dependencies_dependency_type'), 'software_dependencies', ['dependency_type'], unique=False) - op.create_index(op.f('ix_software_dependencies_id'), 'software_dependencies', ['id'], unique=False) - op.create_index('ix_software_dependencies_is_dev', 'software_dependencies', ['is_dev_dependency'], unique=False) - op.create_index(op.f('ix_software_dependencies_is_dev_dependency'), 'software_dependencies', ['is_dev_dependency'], unique=False) - op.create_index('ix_software_dependencies_name', 'software_dependencies', ['dependency_name'], unique=False) - op.create_index('ix_software_dependencies_repo_id', 'software_dependencies', ['repository_id'], unique=False) - op.create_index(op.f('ix_software_dependencies_repository_id'), 'software_dependencies', ['repository_id'], unique=False) - op.create_index('ix_software_dependencies_type', 'software_dependencies', ['dependency_type'], unique=False) + op.create_table( + "software_dependencies", + sa.Column("repository_id", sa.Integer(), nullable=False), + 
sa.Column("dependency_name", sa.String(), nullable=False), + sa.Column("version_constraint", sa.String(), nullable=True), + sa.Column("source_file", sa.String(), nullable=False), + sa.Column("dependency_type", sa.String(), nullable=False), + sa.Column("is_dev_dependency", sa.Boolean(), nullable=True), + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["repository_id"], ["repositories.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_software_dependencies_dependency_name"), + "software_dependencies", + ["dependency_name"], + unique=False, + ) + op.create_index( + op.f("ix_software_dependencies_dependency_type"), + "software_dependencies", + ["dependency_type"], + unique=False, + ) + op.create_index( + op.f("ix_software_dependencies_id"), + "software_dependencies", + ["id"], + unique=False, + ) + op.create_index( + "ix_software_dependencies_is_dev", + "software_dependencies", + ["is_dev_dependency"], + unique=False, + ) + op.create_index( + op.f("ix_software_dependencies_is_dev_dependency"), + "software_dependencies", + ["is_dev_dependency"], + unique=False, + ) + op.create_index( + "ix_software_dependencies_name", + "software_dependencies", + ["dependency_name"], + unique=False, + ) + op.create_index( + "ix_software_dependencies_repo_id", + "software_dependencies", + ["repository_id"], + unique=False, + ) + op.create_index( + op.f("ix_software_dependencies_repository_id"), + "software_dependencies", + ["repository_id"], + unique=False, + ) + op.create_index( + "ix_software_dependencies_type", + "software_dependencies", + ["dependency_type"], + unique=False, + ) # ### end Alembic commands ### def downgrade() -> None: """Downgrade schema.""" # 
### commands auto generated by Alembic - please adjust! ### - op.drop_index('ix_software_dependencies_type', table_name='software_dependencies') - op.drop_index(op.f('ix_software_dependencies_repository_id'), table_name='software_dependencies') - op.drop_index('ix_software_dependencies_repo_id', table_name='software_dependencies') - op.drop_index('ix_software_dependencies_name', table_name='software_dependencies') - op.drop_index(op.f('ix_software_dependencies_is_dev_dependency'), table_name='software_dependencies') - op.drop_index('ix_software_dependencies_is_dev', table_name='software_dependencies') - op.drop_index(op.f('ix_software_dependencies_id'), table_name='software_dependencies') - op.drop_index(op.f('ix_software_dependencies_dependency_type'), table_name='software_dependencies') - op.drop_index(op.f('ix_software_dependencies_dependency_name'), table_name='software_dependencies') - op.drop_table('software_dependencies') + op.drop_index("ix_software_dependencies_type", table_name="software_dependencies") + op.drop_index( + op.f("ix_software_dependencies_repository_id"), + table_name="software_dependencies", + ) + op.drop_index( + "ix_software_dependencies_repo_id", table_name="software_dependencies" + ) + op.drop_index("ix_software_dependencies_name", table_name="software_dependencies") + op.drop_index( + op.f("ix_software_dependencies_is_dev_dependency"), + table_name="software_dependencies", + ) + op.drop_index("ix_software_dependencies_is_dev", table_name="software_dependencies") + op.drop_index( + op.f("ix_software_dependencies_id"), table_name="software_dependencies" + ) + op.drop_index( + op.f("ix_software_dependencies_dependency_type"), + table_name="software_dependencies", + ) + op.drop_index( + op.f("ix_software_dependencies_dependency_name"), + table_name="software_dependencies", + ) + op.drop_table("software_dependencies") # ### end Alembic commands ### diff --git 
a/backend/data/migrations/versions/ed4cc55634bf_phase_10_3_add_github_organization_.py b/backend/data/migrations/versions/ed4cc55634bf_phase_10_3_add_github_organization_.py index 39dca37..435a918 100644 --- a/backend/data/migrations/versions/ed4cc55634bf_phase_10_3_add_github_organization_.py +++ b/backend/data/migrations/versions/ed4cc55634bf_phase_10_3_add_github_organization_.py @@ -5,6 +5,7 @@ Create Date: 2025-04-11 08:18:38.324876 """ + from typing import Sequence, Union from alembic import op @@ -12,8 +13,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision: str = 'ed4cc55634bf' -down_revision: Union[str, None] = '' +revision: str = "ed4cc55634bf" +down_revision: Union[str, None] = "" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,12 +22,19 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! ### - op.add_column('institutions', sa.Column('github_organization_logins', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + op.add_column( + "institutions", + sa.Column( + "github_organization_logins", + postgresql.JSONB(astext_type=sa.Text()), + nullable=True, + ), + ) # ### end Alembic commands ### def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.drop_column('institutions', 'github_organization_logins') + op.drop_column("institutions", "github_organization_logins") # ### end Alembic commands ### diff --git a/backend/data/models/__init__.py b/backend/data/models/__init__.py index 500f8aa..15e1e6f 100644 --- a/backend/data/models/__init__.py +++ b/backend/data/models/__init__.py @@ -2,7 +2,7 @@ # Import base first if other models rely on it implicitly from .base import BaseModel -from .types import * # Import custom types +from .types import * # Import custom types # Import all the models to make them visible to SQLAlchemy and Alembic from .owner import Owner @@ -29,8 +29,8 @@ from .work_topic import WorkTopic from .pull_request import PullRequest from .issue import Issue -from .issue_comment import IssueComment # <<< Added -from .pr_review_comment import PRReviewComment # <<< Added +from .issue_comment import IssueComment # <<< Added +from .pr_review_comment import PRReviewComment # <<< Added # Optionally define __all__ to control `from backend.data.models import *` behavior @@ -60,6 +60,6 @@ "WorkTopic", "PullRequest", "Issue", - "IssueComment", # <<< Added - "PRReviewComment", # <<< Added -] \ No newline at end of file + "IssueComment", # <<< Added + "PRReviewComment", # <<< Added +] diff --git a/backend/data/models/affiliation.py b/backend/data/models/affiliation.py index 524061e..8c9a232 100644 --- a/backend/data/models/affiliation.py +++ b/backend/data/models/affiliation.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) + class Affiliation(Base): """ Represents the association between an Authorship (Work+Person) and an Institution. @@ -39,6 +40,7 @@ class Affiliation(Base): authorship: Relationship back to the specific Authorship record. institution: Relationship back to the specific Institution record. 
""" + __tablename__ = "affiliations" # --- Composite Primary Key Definition --- @@ -51,7 +53,7 @@ class Affiliation(Base): institution_id: Mapped[int] = mapped_column( # Define the foreign key constraint directly here ForeignKey("institutions.id", ondelete="CASCADE"), - primary_key=True # This column is also part of the composite primary key + primary_key=True, # This column is also part of the composite primary key ) # --- Relationships --- @@ -73,20 +75,22 @@ class Affiliation(Base): # 'ondelete="CASCADE"' ensures that if an Authorship record is deleted, # all corresponding Affiliation records are also automatically deleted. ForeignKeyConstraint( - ['authorship_work_id', 'authorship_person_id'], - ['authorships.work_id', 'authorships.person_id'], + ["authorship_work_id", "authorship_person_id"], + ["authorships.work_id", "authorships.person_id"], ondelete="CASCADE", - name='fk_affiliation_authorship' # Optional: Provides a specific name for the constraint + name="fk_affiliation_authorship", # Optional: Provides a specific name for the constraint ), # Define indexes on individual foreign key columns to speed up lookups # based on institution or parts of the authorship key. - Index('ix_affiliations_institution_id', 'institution_id'), - Index('ix_affiliations_authorship_work_id', 'authorship_work_id'), - Index('ix_affiliations_authorship_person_id', 'authorship_person_id'), + Index("ix_affiliations_institution_id", "institution_id"), + Index("ix_affiliations_authorship_work_id", "authorship_work_id"), + Index("ix_affiliations_authorship_person_id", "authorship_person_id"), # Note: The composite primary key implicitly creates an index on (work_id, person_id, inst_id). 
) def __repr__(self): """Provides a developer-friendly string representation of the Affiliation.""" - return (f"") \ No newline at end of file + return ( + f"" + ) diff --git a/backend/data/models/authorship.py b/backend/data/models/authorship.py index 4b16896..0539f3d 100644 --- a/backend/data/models/authorship.py +++ b/backend/data/models/authorship.py @@ -9,7 +9,7 @@ import logging from typing import List, Optional, TYPE_CHECKING -from sqlalchemy import String, Integer, Boolean, ForeignKey, Index +from sqlalchemy import String, Boolean, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column @@ -22,10 +22,11 @@ if TYPE_CHECKING: from .work import Work from .person import Person - from .affiliation import Affiliation # Required for the 'affiliations' relationship + from .affiliation import Affiliation # Required for the 'affiliations' relationship logger = logging.getLogger(__name__) + class Authorship(Base): """ Represents the association between a Person (author) and a Work. @@ -45,6 +46,7 @@ class Authorship(Base): person: Relationship back to the Person object. affiliations: Relationship to associated Affiliation records for this specific authorship. """ + __tablename__ = "authorships" # --- Composite Primary Key and Foreign Keys --- @@ -53,14 +55,14 @@ class Authorship(Base): work_id: Mapped[int] = mapped_column( # Define the foreign key constraint to the 'works' table. ForeignKey("works.id", ondelete="CASCADE"), - primary_key=True # This column is part of the composite primary key. + primary_key=True, # This column is part of the composite primary key. # 'ondelete="CASCADE"' ensures that if a Work is deleted, all its Authorship # records (and consequently their Affiliations) are also deleted. ) person_id: Mapped[int] = mapped_column( # Define the foreign key constraint to the 'persons' table. ForeignKey("persons.id", ondelete="CASCADE"), - primary_key=True # This column is also part of the composite primary key. 
+ primary_key=True, # This column is also part of the composite primary key. # 'ondelete="CASCADE"' ensures that if a Person is deleted, all their Authorship # records (and consequently their Affiliations) are also deleted. ) @@ -69,10 +71,10 @@ class Authorship(Base): # Optional fields providing more context about the specific authorship role. author_position: Mapped[Optional[str]] = mapped_column( String, nullable=True - ) # E.g., 'first', 'middle', 'last' - useful for author contribution analysis. + ) # E.g., 'first', 'middle', 'last' - useful for author contribution analysis. is_corresponding: Mapped[Optional[bool]] = mapped_column( Boolean, nullable=True - ) # Indicates if this author handled correspondence for the publication. + ) # Indicates if this author handled correspondence for the publication. # --- Relationships --- # Define bidirectional relationships for easier data access and navigation. @@ -93,7 +95,7 @@ class Authorship(Base): # 'cascade="all, delete-orphan"' means that if an Authorship record is deleted, # all Affiliation records associated *only* with this Authorship will also be deleted. # Operations like adding an Affiliation via this Authorship object will be cascaded. - cascade="all, delete-orphan" + cascade="all, delete-orphan", ) # --- Table Arguments --- @@ -101,10 +103,10 @@ class Authorship(Base): # provides an index on (work_id, person_id), separate indexes on each column # can improve performance for queries filtering only by work_id or only by person_id. 
__table_args__ = ( - Index('ix_authorships_work_id', 'work_id'), - Index('ix_authorships_person_id', 'person_id'), + Index("ix_authorships_work_id", "work_id"), + Index("ix_authorships_person_id", "person_id"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" - return f"" \ No newline at end of file + return f"" diff --git a/backend/data/models/base.py b/backend/data/models/base.py index 9c11505..1331e68 100644 --- a/backend/data/models/base.py +++ b/backend/data/models/base.py @@ -9,12 +9,12 @@ # Keep these necessary imports for defining mapped columns and declared attributes from sqlalchemy.orm import Mapped -from sqlalchemy.ext.declarative import declared_attr # Import the custom type definitions from the local 'types.py' file # This promotes consistency and reusability across different models. from .types import intpk, timestamp_created, timestamp_updated + class BaseModel: """ Base mixin class providing common columns for database models. @@ -31,6 +31,7 @@ class BaseModel: Models inheriting from this mixin should also inherit from the SQLAlchemy declarative base (e.g., `Base` from `database.py`). """ + # --- Common Columns --- # Define the primary key column. @@ -47,7 +48,6 @@ class BaseModel: # with a server_default and an onupdate trigger to set the current time. updated_at: Mapped[timestamp_updated] - # --- Optional: Automatic Tablename Generation --- # This commented-out section shows how you could automatically generate # table names based on the class name (e.g., 'MyModel' -> 'mymodels'). 
@@ -55,4 +55,4 @@ class BaseModel: # @declared_attr # def __tablename__(cls): # # Example: Converts 'ModelName' to 'modelnames' - # return cls.__name__.lower() + "s" \ No newline at end of file + # return cls.__name__.lower() + "s" diff --git a/backend/data/models/contributor.py b/backend/data/models/contributor.py index 158836e..b50b416 100644 --- a/backend/data/models/contributor.py +++ b/backend/data/models/contributor.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from .repository import Repository + class Contributor(BaseModel, Base): """ Represents a GitHub User or Bot identified as a contributor. @@ -39,13 +40,16 @@ class Contributor(BaseModel, Base): Repositories they have contributed to, via the 'repository_contributors' association table. """ + __tablename__ = "contributors" # --- GitHub Identifiers and Details --- # Store key information directly retrieved from the GitHub API. # GitHub's unique ID for the user or bot. Indexed for fast lookups. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # GitHub login username. Should be unique and indexed. login: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) @@ -55,8 +59,12 @@ class Contributor(BaseModel, Base): # Optional profile details from GitHub. avatar_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) - html_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) # Link to GitHub profile - api_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) # Link to GitHub API endpoint + html_url: Mapped[Optional[str]] = mapped_column( + String, nullable=True + ) # Link to GitHub profile + api_url: Mapped[Optional[str]] = mapped_column( + String, nullable=True + ) # Link to GitHub API endpoint # --- Relationships --- # Define the many-to-many relationship to Repositories. 
@@ -67,8 +75,8 @@ class Contributor(BaseModel, Base): # `back_populates` establishes the bidirectional link to the 'contributors' # attribute defined in the Repository model. repositories: Mapped[List["Repository"]] = relationship( - secondary="repository_contributors", # Name of the intermediary association table - back_populates="contributors" # Connects to Repository.contributors + secondary="repository_contributors", # Name of the intermediary association table + back_populates="contributors", # Connects to Repository.contributors ) # --- Table Arguments --- @@ -77,11 +85,11 @@ class Contributor(BaseModel, Base): __table_args__ = ( # Explicitly create an index on the 'type' column for faster filtering # queries based on contributor type (e.g., finding all 'User' contributors). - Index('ix_contributors_type', 'type'), + Index("ix_contributors_type", "type"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/discovery_chain.py b/backend/data/models/discovery_chain.py index 245fc9d..b20074c 100644 --- a/backend/data/models/discovery_chain.py +++ b/backend/data/models/discovery_chain.py @@ -10,7 +10,10 @@ import uuid from typing import List, Optional, Any, TYPE_CHECKING from sqlalchemy import ( - Column, String, Integer, DateTime, ForeignKey, Index, func # Keep necessary imports + String, + Integer, + ForeignKey, + Index, # Keep necessary imports ) from sqlalchemy.dialects.postgresql import UUID as PG_UUID from sqlalchemy.dialects.postgresql import JSONB @@ -19,6 +22,7 @@ # Assuming Base is correctly defined elsewhere # Adjust import path as necessary from ..database import Base + # Import custom timestamp types for consistency from .types import timestamp_nullable, timestamp_created, timestamp_updated 
@@ -26,6 +30,7 @@ if TYPE_CHECKING: from .entity_discovery_association import EntityDiscoveryAssociation + class DiscoveryChain(Base): """ Represents a single step or node in the discovery provenance graph. @@ -56,24 +61,27 @@ class DiscoveryChain(Base): children: Relationship to child DiscoveryChain nodes initiated from this one. entity_associations: Relationship to entities discovered during this step. """ + __tablename__ = "discovery_chains" # --- Core Attributes --- # Unique identifier using UUID - more robust for distributed/parallel discovery processes. - id: Mapped[uuid.UUID] = mapped_column(PG_UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + id: Mapped[uuid.UUID] = mapped_column( + PG_UUID(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) # --- Hierarchy Tracking --- # Links to establish the tree/graph structure. parent_chain_id: Mapped[Optional[uuid.UUID]] = mapped_column( - ForeignKey("discovery_chains.id"), # Self-referential foreign key - nullable=True # Root nodes have no parent + ForeignKey("discovery_chains.id"), # Self-referential foreign key + nullable=True, # Root nodes have no parent ) # Storing the root ID allows quick traversal to the origin of any discovery chain. # Indexed for efficient lookup of all nodes belonging to the same root process. root_chain_id: Mapped[uuid.UUID] = mapped_column( - ForeignKey("discovery_chains.id"), # Also self-referential - index=True, # Index this column - nullable=False # Every node must belong to a root + ForeignKey("discovery_chains.id"), # Also self-referential + index=True, # Index this column + nullable=False, # Every node must belong to a root ) # Level indicates the depth in the discovery hierarchy (0 = root). level: Mapped[int] = mapped_column(Integer, nullable=False, default=0) @@ -85,14 +93,18 @@ class DiscoveryChain(Base): # Flexible storage for parameters used, e.g., {'keywords': ['AI', 'HPC'], 'source': 'GitHub'}. 
parameters: Mapped[Optional[dict[str, Any]]] = mapped_column(JSONB, nullable=True) # Tracks the execution state of this discovery step. Indexed for querying active/failed jobs. - status: Mapped[str] = mapped_column(String, index=True, nullable=False, default='PENDING') + status: Mapped[str] = mapped_column( + String, index=True, nullable=False, default="PENDING" + ) # --- Timestamps --- # Use custom timestamp types for consistency. - started_at: Mapped[timestamp_created] # When the task began processing - completed_at: Mapped[timestamp_nullable] # When the task finished (null if pending/running/failed early) - created_at: Mapped[timestamp_created] # Standard record creation timestamp - updated_at: Mapped[timestamp_updated] # Standard record update timestamp + started_at: Mapped[timestamp_created] # When the task began processing + completed_at: Mapped[ + timestamp_nullable + ] # When the task finished (null if pending/running/failed early) + created_at: Mapped[timestamp_created] # Standard record creation timestamp + updated_at: Mapped[timestamp_updated] # Standard record update timestamp # --- Relationships --- # Define relationships for navigating the discovery graph and associated entities. @@ -102,30 +114,32 @@ class DiscoveryChain(Base): # which column on the 'remote' side (the DiscoveryChain table itself) the # foreign key points to. parent: Mapped[Optional["DiscoveryChain"]] = relationship( - foreign_keys=[parent_chain_id], # Specifies the FK column for this relationship - remote_side=[id], # Specifies the PK column on the remote side - back_populates="children" # Links to the 'children' collection below + foreign_keys=[parent_chain_id], # Specifies the FK column for this relationship + remote_side=[id], # Specifies the PK column on the remote side + back_populates="children", # Links to the 'children' collection below ) # Relationship to child nodes spawned from this discovery step. 
children: Mapped[List["DiscoveryChain"]] = relationship( - foreign_keys=[parent_chain_id], # Child nodes point back to this node's ID via parent_chain_id - back_populates="parent", # Links back to the 'parent' relationship above - cascade="all, delete-orphan" # If a parent node is deleted, its children are also deleted + foreign_keys=[ + parent_chain_id + ], # Child nodes point back to this node's ID via parent_chain_id + back_populates="parent", # Links back to the 'parent' relationship above + cascade="all, delete-orphan", # If a parent node is deleted, its children are also deleted ) # Relationship to the entities (e.g., Repositories, Works) found during this step. # Linked via the EntityDiscoveryAssociation table. entity_associations: Mapped[List["EntityDiscoveryAssociation"]] = relationship( - back_populates="discovery_chain", # Links to the 'discovery_chain' attribute in EntityDiscoveryAssociation - cascade="all, delete-orphan" # If a discovery node is deleted, its entity links are removed + back_populates="discovery_chain", # Links to the 'discovery_chain' attribute in EntityDiscoveryAssociation + cascade="all, delete-orphan", # If a discovery node is deleted, its entity links are removed ) # --- Table Arguments --- # Explicitly define indexes for commonly queried columns. __table_args__ = ( # Index on 'status' column for efficient querying of jobs by state. - Index('ix_discovery_chains_status', 'status'), + Index("ix_discovery_chains_status", "status"), # Index on 'root_chain_id' for efficiently finding all nodes in a specific discovery tree. - Index('ix_discovery_chains_root_id', 'root_chain_id'), + Index("ix_discovery_chains_root_id", "root_chain_id"), # Note: The index=True on the root_chain_id column definition above is slightly redundant # but kept for clarity; __table_args__ provides central control over indexes. 
) @@ -133,6 +147,8 @@ class DiscoveryChain(Base): def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Use short UUID representation for brevity - short_id = str(self.id).split('-')[0] if self.id else None - return (f"") \ No newline at end of file + short_id = str(self.id).split("-")[0] if self.id else None + return ( + f"" + ) diff --git a/backend/data/models/doi_reference.py b/backend/data/models/doi_reference.py index de9cf44..5a45a51 100644 --- a/backend/data/models/doi_reference.py +++ b/backend/data/models/doi_reference.py @@ -8,9 +8,7 @@ """ from typing import Optional, TYPE_CHECKING -from sqlalchemy import ( - String, Integer, Text, ForeignKey, Index, UniqueConstraint -) +from sqlalchemy import String, Text, ForeignKey, Index, UniqueConstraint from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -23,6 +21,7 @@ from .repository import Repository from .work import Work + class DOIReference(BaseModel, Base): """ Represents an instance of a DOI found within a repository file. @@ -43,6 +42,7 @@ class DOIReference(BaseModel, Base): repository: Relationship back to the Repository object. work: Relationship back to the resolved Work object (or None). """ + __tablename__ = "doi_references" # --- Core DOI Information --- @@ -58,7 +58,9 @@ class DOIReference(BaseModel, Base): ) # Reference to the Work record if the DOI could be resolved. Nullable. Indexed. work_id: Mapped[Optional[int]] = mapped_column( - ForeignKey("works.id"), index=True, nullable=True + ForeignKey("works.id"), + index=True, + nullable=True, # Nullable=True is crucial, as not all found DOIs might resolve # or correspond to Works currently in the database. ) @@ -89,20 +91,25 @@ class DOIReference(BaseModel, Base): # Ensure that the same DOI isn't recorded multiple times for the exact same file # within the same repository. 
This prevents duplicate entries from reappearing if # a file is scanned multiple times without changes. - UniqueConstraint('repository_id', 'doi', 'source_file', name='uq_repo_doi_source'), - + UniqueConstraint( + "repository_id", "doi", "source_file", name="uq_repo_doi_source" + ), # Explicit indexes on individual columns often used in queries. # While some are already indexed due to FKs or the `index=True` flag, # defining them here provides a central place to manage table-level indexing. - Index('ix_doi_references_doi', 'doi'), - Index('ix_doi_references_repository_id', 'repository_id'), - Index('ix_doi_references_work_id', 'work_id'), # Indexing nullable FK can still be useful. + Index("ix_doi_references_doi", "doi"), + Index("ix_doi_references_repository_id", "repository_id"), + Index( + "ix_doi_references_work_id", "work_id" + ), # Indexing nullable FK can still be useful. ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) + obj_id = getattr(self, "id", None) work_repr = f", work_id={self.work_id}" if self.work_id else ", work_id=None" - return (f"") \ No newline at end of file + return ( + f"" + ) diff --git a/backend/data/models/domain.py b/backend/data/models/domain.py index a5a919b..f938370 100644 --- a/backend/data/models/domain.py +++ b/backend/data/models/domain.py @@ -18,10 +18,11 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: - from .field import Field # For the one-to-many relationship to Fields + from .field import Field # For the one-to-many relationship to Fields logger = logging.getLogger(__name__) + class Domain(BaseModel, Base): """ Represents an OpenAlex Domain, the top tier in the subject hierarchy. @@ -37,13 +38,16 @@ class Domain(BaseModel, Base): description: An optional longer description of the Domain's scope. 
fields: One-to-many relationship linking this Domain to its constituent Fields. """ + __tablename__ = "domains" # --- Identifiers and Details --- # Core attributes defining the Domain based on OpenAlex data. # OpenAlex unique ID for the Domain. Indexed for fast lookups. - openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # Human-readable name. Indexed for searching and display. display_name: Mapped[str] = mapped_column(String, index=True, nullable=False) @@ -60,21 +64,20 @@ class Domain(BaseModel, Base): # `cascade="all, delete-orphan"` ensures that if a Domain is deleted, all its # associated Fields are also removed from the database. fields: Mapped[List["Field"]] = relationship( - back_populates="domain", - cascade="all, delete-orphan" + back_populates="domain", cascade="all, delete-orphan" ) # --- Table Arguments --- # Explicitly define indexes for optimized query performance. __table_args__ = ( # Redundant index on openalex_id (already unique), but explicitly defined for clarity. - Index('ix_domains_openalex_id', 'openalex_id'), + Index("ix_domains_openalex_id", "openalex_id"), # Index on display_name for faster text-based searches or sorting. 
- Index('ix_domains_display_name', 'display_name'), + Index("ix_domains_display_name", "display_name"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/entity_discovery_association.py b/backend/data/models/entity_discovery_association.py index ec252cd..6d3f371 100644 --- a/backend/data/models/entity_discovery_association.py +++ b/backend/data/models/entity_discovery_association.py @@ -9,21 +9,20 @@ import uuid from typing import Optional, TYPE_CHECKING -from sqlalchemy import ( - String, Integer, Boolean, ForeignKey, Index, UniqueConstraint -) +from sqlalchemy import String, Integer, Boolean, ForeignKey, Index, UniqueConstraint from sqlalchemy.dialects.postgresql import UUID as PG_UUID from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere # Adjust import paths as necessary from ..database import Base -from .base import BaseModel # Inherits standard ID/timestamps +from .base import BaseModel # Inherits standard ID/timestamps # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: from .discovery_chain import DiscoveryChain + class EntityDiscoveryAssociation(BaseModel, Base): """ Association table linking a DiscoveryChain node to a discovered entity. @@ -46,15 +45,16 @@ class EntityDiscoveryAssociation(BaseModel, Base): linked discovery step, or indirectly (e.g., associated via a child step). discovery_chain: Relationship back to the DiscoveryChain node. """ + __tablename__ = "entity_discovery_associations" # --- Foreign Key to Discovery Chain --- # Links this association record back to the specific discovery step. Indexed. 
discovery_chain_id: Mapped[uuid.UUID] = mapped_column( - PG_UUID(as_uuid=True), # Match the UUID type of DiscoveryChain.id - ForeignKey("discovery_chains.id"), # Establishes the foreign key relationship - index=True, # Index for efficient lookup of entities associated with a chain - nullable=False + PG_UUID(as_uuid=True), # Match the UUID type of DiscoveryChain.id + ForeignKey("discovery_chains.id"), # Establishes the foreign key relationship + index=True, # Index for efficient lookup of entities associated with a chain + nullable=False, ) # --- Polymorphic Link to Discovered Entity --- @@ -74,7 +74,9 @@ class EntityDiscoveryAssociation(BaseModel, Base): # --- Association Metadata --- # Additional context about the discovery relationship. is_direct_discovery: Mapped[bool] = mapped_column( - Boolean, default=True, nullable=False + Boolean, + default=True, + nullable=False, # True if this entity was a primary result of the discovery_chain_id step. # False if it's associated indirectly (e.g., discovered by a child step but linked # here for aggregation). @@ -91,11 +93,9 @@ class EntityDiscoveryAssociation(BaseModel, Base): # Define indexes and constraints for data integrity and performance. __table_args__ = ( # Index on discovery_chain_id (already indexed via column definition, but explicit). - Index('ix_entity_discovery_chain_id', 'discovery_chain_id'), - + Index("ix_entity_discovery_chain_id", "discovery_chain_id"), # Composite index on the polymorphic entity identifier columns. - Index('ix_entity_discovery_entity', 'entity_type', 'entity_id'), - + Index("ix_entity_discovery_entity", "entity_type", "entity_id"), # Unique constraint: Prevents associating the *same entity* with the *same discovery chain* # multiple times. # Note on NULLs: The behavior of unique constraints with NULL values varies across @@ -104,20 +104,25 @@ class EntityDiscoveryAssociation(BaseModel, Base): # and entity_type if entity_id is NULL. 
This might be acceptable or require # application-level checks depending on exact requirements. UniqueConstraint( - 'discovery_chain_id', - 'entity_type', - 'entity_id', - name='uq_discovery_entity' + "discovery_chain_id", "entity_type", "entity_id", name="uq_discovery_entity" ), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' attribute which comes from BaseModel - assoc_id = getattr(self, 'id', None) + assoc_id = getattr(self, "id", None) # Display entity_id appropriately if it's None - entity_id_repr = self.entity_id if self.entity_id is not None else '[NULL_or_CompositePK]' + entity_id_repr = ( + self.entity_id if self.entity_id is not None else "[NULL_or_CompositePK]" + ) # Use short UUID for chain_id - short_chain_id = str(self.discovery_chain_id).split('-')[0] + '...' if self.discovery_chain_id else None - return (f"") \ No newline at end of file + short_chain_id = ( + str(self.discovery_chain_id).split("-")[0] + "..." 
+ if self.discovery_chain_id + else None + ) + return ( + f"" + ) diff --git a/backend/data/models/field.py b/backend/data/models/field.py index 42d960a..a39481f 100644 --- a/backend/data/models/field.py +++ b/backend/data/models/field.py @@ -8,7 +8,7 @@ import logging from typing import List, Optional, TYPE_CHECKING -from sqlalchemy import String, Text, Integer, ForeignKey, Index +from sqlalchemy import String, Text, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -18,11 +18,12 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: - from .domain import Domain # For the many-to-one relationship to Domain - from .subfield import Subfield # For the one-to-many relationship to Subfields + from .domain import Domain # For the many-to-one relationship to Domain + from .subfield import Subfield # For the one-to-many relationship to Subfields logger = logging.getLogger(__name__) + class Field(BaseModel, Base): """ Represents an OpenAlex Field, the second tier in the subject hierarchy. @@ -41,13 +42,16 @@ class Field(BaseModel, Base): domain: Many-to-one relationship back to the parent Domain object. subfields: One-to-many relationship linking this Field to its constituent Subfields. """ + __tablename__ = "fields" # --- Identifiers and Details --- # Core attributes defining the Field based on OpenAlex data. # OpenAlex unique ID for the Field. Indexed for fast lookups. - openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # Human-readable name. Indexed for searching and display. display_name: Mapped[str] = mapped_column(String, index=True, nullable=False) @@ -58,9 +62,9 @@ class Field(BaseModel, Base): # --- Foreign Key to Parent Domain --- # Establishes the hierarchical link within the subject classification. 
domain_id: Mapped[int] = mapped_column( - ForeignKey("domains.id", ondelete="CASCADE"), # Links to the parent Domain - index=True, # Index for efficient lookup of Fields within a Domain - nullable=False + ForeignKey("domains.id", ondelete="CASCADE"), # Links to the parent Domain + index=True, # Index for efficient lookup of Fields within a Domain + nullable=False, # 'ondelete="CASCADE"' ensures that if a Domain is deleted, all its child # Fields (and consequently their Subfields, etc.) are also deleted. ) @@ -79,23 +83,22 @@ class Field(BaseModel, Base): # `cascade="all, delete-orphan"` ensures that if a Field is deleted, all its # associated Subfields are also removed from the database. subfields: Mapped[List["Subfield"]] = relationship( - back_populates="field", - cascade="all, delete-orphan" + back_populates="field", cascade="all, delete-orphan" ) # --- Table Arguments --- # Explicitly define indexes for optimized query performance. __table_args__ = ( # Redundant index on openalex_id (already unique), but explicit for clarity. - Index('ix_fields_openalex_id', 'openalex_id'), + Index("ix_fields_openalex_id", "openalex_id"), # Index on display_name for faster text-based searches or sorting. - Index('ix_fields_display_name', 'display_name'), + Index("ix_fields_display_name", "display_name"), # Index on domain_id (already indexed via column definition, but explicit). 
- Index('ix_fields_domain_id', 'domain_id'), + Index("ix_fields_domain_id", "domain_id"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/institution.py b/backend/data/models/institution.py index 9524e9e..410bc5b 100644 --- a/backend/data/models/institution.py +++ b/backend/data/models/institution.py @@ -8,7 +8,8 @@ """ import logging -from typing import List, Optional, TYPE_CHECKING, Dict, Any +from typing import List, Optional, TYPE_CHECKING + # Import JSONB type for handling JSON data in PostgreSQL from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy import String, Index @@ -22,10 +23,11 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: - from .affiliation import Affiliation # For the relationship to author affiliations + from .affiliation import Affiliation # For the relationship to author affiliations logger = logging.getLogger(__name__) + class Institution(BaseModel, Base): """ Represents an institution (university, company, hospital, etc.). @@ -48,16 +50,21 @@ class Institution(BaseModel, Base): affiliations: One-to-many relationship linking this institution to Affiliation records (representing author affiliations on works). """ + __tablename__ = "institutions" # --- Identifiers --- # Key identifiers linking this record to external systems. # OpenAlex unique ID. Crucial for linking with OpenAlex publication data. Indexed. - openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # Research Organization Registry ID. A globally unique and persistent identifier. Indexed. 
- ror: Mapped[Optional[str]] = mapped_column(String, unique=True, index=True, nullable=True) + ror: Mapped[Optional[str]] = mapped_column( + String, unique=True, index=True, nullable=True + ) # --- Descriptive Details --- # Core information about the institution. @@ -76,8 +83,8 @@ class Institution(BaseModel, Base): # This facilitates linking repositories or contributors directly via known orgs. # Populated manually or via specific discovery/matching processes. github_organization_logins: Mapped[Optional[List[str]]] = mapped_column( - JSONB, # Use JSONB for efficient storage and querying of list data in PostgreSQL. - nullable=True + JSONB, # Use JSONB for efficient storage and querying of list data in PostgreSQL. + nullable=True, ) # --- Relationships --- @@ -92,8 +99,7 @@ class Institution(BaseModel, Base): # cascade behavior is always desired, as it removes authorship affiliation data. # An alternative might be to set the FK to NULL or prevent deletion if affiliations exist. affiliations: Mapped[List["Affiliation"]] = relationship( - back_populates="institution", - cascade="all, delete-orphan" + back_populates="institution", cascade="all, delete-orphan" ) # --- Table Arguments --- @@ -101,15 +107,17 @@ class Institution(BaseModel, Base): # Indexes on unique columns are often created automatically but defining them here # provides clarity and central management. 
__table_args__ = ( - Index('ix_institutions_openalex_id', 'openalex_id'), # Index on OpenAlex ID - Index('ix_institutions_ror', 'ror'), # Index on ROR ID - Index('ix_institutions_display_name', 'display_name'),# Index on name for searching - Index('ix_institutions_type', 'type'), # Index for filtering by type + Index("ix_institutions_openalex_id", "openalex_id"), # Index on OpenAlex ID + Index("ix_institutions_ror", "ror"), # Index on ROR ID + Index( + "ix_institutions_display_name", "display_name" + ), # Index on name for searching + Index("ix_institutions_type", "type"), # Index for filtering by type ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) + obj_id = getattr(self, "id", None) ror_repr = f", ror={self.ror}" if self.ror else "" - return f"" \ No newline at end of file + return f"" diff --git a/backend/data/models/issue.py b/backend/data/models/issue.py index c86ed8b..71a7768 100644 --- a/backend/data/models/issue.py +++ b/backend/data/models/issue.py @@ -7,11 +7,9 @@ import logging from typing import Optional, TYPE_CHECKING -from datetime import datetime # Required for DateTime type hints +from datetime import datetime # Required for DateTime type hints -from sqlalchemy import ( - String, Integer, Text, Boolean, DateTime, BigInteger, ForeignKey, Index -) +from sqlalchemy import String, Integer, Text, DateTime, BigInteger, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -22,10 +20,13 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: from .repository import Repository - from .contributor import Contributor # Assumes the issue author is stored as a Contributor + from .contributor import ( + Contributor, + ) # Assumes the issue author is stored as a Contributor logger = 
logging.getLogger(__name__) + class Issue(BaseModel, Base): """ Represents a GitHub Issue linked to a Repository. @@ -49,13 +50,16 @@ class Issue(BaseModel, Base): repository: Relationship back to the parent Repository object. user: Relationship back to the Contributor (author) object. """ + __tablename__ = "issues" # --- GitHub Identifiers --- # Unique IDs connecting this record to the source GitHub data. # GitHub's unique ID for this specific issue. Indexed for efficient lookup. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # --- Foreign Keys --- # Links to related entities (Repository, Contributor). @@ -86,11 +90,17 @@ class Issue(BaseModel, Base): # Stores the original timestamps from GitHub, preserving timezone information. # When the issue was created on GitHub. - gh_created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_created_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the issue was last updated on GitHub. - gh_updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_updated_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the issue was closed on GitHub (NULL if still open). - gh_closed_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_closed_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # --- Relationships --- # Define relationships for navigating from an Issue instance. @@ -107,18 +117,20 @@ class Issue(BaseModel, Base): # Define indexes to optimize common query patterns. __table_args__ = ( # Individual indexes on foreign keys and state/number for common filtering/sorting. 
- Index('ix_issues_repo_id', 'repository_id'), - Index('ix_issues_user_id', 'user_id'), - Index('ix_issues_state', 'state'), - Index('ix_issues_number', 'number'), + Index("ix_issues_repo_id", "repository_id"), + Index("ix_issues_user_id", "user_id"), + Index("ix_issues_state", "state"), + Index("ix_issues_number", "number"), # Composite index for efficiently finding a specific issue number within a specific repo. - Index('ix_issues_repo_number', 'repository_id', 'number'), + Index("ix_issues_repo_number", "repository_id", "number"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) - return (f"") \ No newline at end of file + obj_id = getattr(self, "id", None) + return ( + f"" + ) diff --git a/backend/data/models/issue_comment.py b/backend/data/models/issue_comment.py index 33b70a0..fd63079 100644 --- a/backend/data/models/issue_comment.py +++ b/backend/data/models/issue_comment.py @@ -7,11 +7,9 @@ import logging from typing import Optional, TYPE_CHECKING -from datetime import datetime # Required for DateTime type hints +from datetime import datetime # Required for DateTime type hints -from sqlalchemy import ( - String, Integer, Text, Boolean, DateTime, BigInteger, ForeignKey, Index -) +from sqlalchemy import Text, DateTime, BigInteger, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -22,10 +20,13 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: from .issue import Issue - from .contributor import Contributor # Assumes the comment author is stored as a Contributor + from .contributor import ( + Contributor, + ) # Assumes the comment author is stored as a Contributor logger = logging.getLogger(__name__) + class IssueComment(BaseModel, Base): """ Represents a comment on a GitHub Issue. 
@@ -45,13 +46,16 @@ class IssueComment(BaseModel, Base): issue: Relationship back to the parent Issue object. user: Relationship back to the Contributor (author) object. """ + __tablename__ = "issue_comments" # --- GitHub Identifier --- # Unique ID connecting this record to the source GitHub data. # GitHub's unique ID for this specific comment. Indexed for efficient lookup. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # --- Foreign Keys --- # Links to related entities (Issue, Contributor). @@ -69,15 +73,21 @@ class IssueComment(BaseModel, Base): # --- Comment Content --- # The main textual body of the comment. - body: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Stored as Text for potentially long comments. + body: Mapped[Optional[str]] = mapped_column( + Text, nullable=True + ) # Stored as Text for potentially long comments. # --- GitHub Timestamps --- # Stores the original timestamps from GitHub, preserving timezone information. # When the comment was created on GitHub. - gh_created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_created_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the comment was last updated on GitHub. - gh_updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_updated_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # --- Relationships --- # Define relationships for navigating from an IssueComment instance. @@ -93,13 +103,19 @@ class IssueComment(BaseModel, Base): # --- Table Arguments --- # Define indexes to optimize common query patterns, especially filtering by issue or user. 
__table_args__ = ( - Index('ix_issue_comments_issue_id', 'issue_id'), # Index for finding comments by issue - Index('ix_issue_comments_user_id', 'user_id'), # Index for finding comments by user + Index( + "ix_issue_comments_issue_id", "issue_id" + ), # Index for finding comments by issue + Index( + "ix_issue_comments_user_id", "user_id" + ), # Index for finding comments by user ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Uses getattr for id in case the instance isn't flushed yet - obj_id = getattr(self, 'id', None) - return (f"") \ No newline at end of file + obj_id = getattr(self, "id", None) + return ( + f"" + ) diff --git a/backend/data/models/keyword_repository_association.py b/backend/data/models/keyword_repository_association.py index 5d4ee2d..9af96e8 100644 --- a/backend/data/models/keyword_repository_association.py +++ b/backend/data/models/keyword_repository_association.py @@ -9,10 +9,10 @@ import logging from typing import Optional, Dict, Any, TYPE_CHECKING from sqlalchemy import ( - ForeignKey, Index # Index might be used if specific indexing beyond PK/FK is needed + ForeignKey, # Index might be used if specific indexing beyond PK/FK is needed ) from sqlalchemy.dialects.postgresql import JSONB -from sqlalchemy.orm import relationship, Mapped, mapped_column +from sqlalchemy.orm import Mapped, mapped_column # Assuming Base is correctly defined elsewhere # Adjust import path as necessary @@ -23,8 +23,8 @@ # Use TYPE_CHECKING to prevent circular imports for type hints, # although direct relationships are commented out in this version. if TYPE_CHECKING: - from .keyword_search_session import KeywordSearchSession - from .repository import Repository + pass + class KeywordRepositoryAssociation(Base): """ @@ -44,6 +44,7 @@ class KeywordRepositoryAssociation(Base): repository_id: Foreign key linking to the Repository. Part of the composite PK. 
match_details: Optional JSON field to store data about the match, like relevance score or matched terms. """ + __tablename__ = "keyword_repository_associations" # --- Composite Primary Key and Foreign Keys --- @@ -55,21 +56,22 @@ class KeywordRepositoryAssociation(Base): keyword_search_session_id: Mapped[int] = mapped_column( ForeignKey("keyword_search_sessions.id", ondelete="CASCADE"), primary_key=True, - index=True # Index this foreign key + index=True, # Index this foreign key ) # Foreign key to the Repositories table. Part of the composite PK. # Indexed to optimize queries finding all sessions that discovered a given repository. repository_id: Mapped[int] = mapped_column( ForeignKey("repositories.id", ondelete="CASCADE"), primary_key=True, - index=True # Index this foreign key + index=True, # Index this foreign key ) # --- Optional Match Metadata --- # Store additional details about why this repository was considered a match # during the search process. This is flexible using JSONB. 
match_details: Mapped[Optional[Dict[str, Any]]] = mapped_column( - JSONB, nullable=True + JSONB, + nullable=True, # Example: {'score': 0.85, 'matched_in': ['description', 'readme'], 'terms': ['quantum computing']} ) @@ -95,5 +97,7 @@ class KeywordRepositoryAssociation(Base): def __repr__(self): """Provides a concise string representation for debugging and logging.""" - return (f"") \ No newline at end of file + return ( + f"" + ) diff --git a/backend/data/models/keyword_search_session.py b/backend/data/models/keyword_search_session.py index ed4e5a7..5385062 100644 --- a/backend/data/models/keyword_search_session.py +++ b/backend/data/models/keyword_search_session.py @@ -6,18 +6,23 @@ """ import logging -from datetime import datetime # Required for DateTime type hints -from typing import Optional, TYPE_CHECKING # TYPE_CHECKING if relationships are used +from datetime import datetime # Required for DateTime type hints +from typing import Optional # TYPE_CHECKING if relationships are used from sqlalchemy import ( - String, Integer, Text, Index, DateTime, func # func needed for server_default + String, + Integer, + Text, + Index, + DateTime, + func, # func needed for server_default ) -from sqlalchemy.orm import relationship, Mapped, mapped_column +from sqlalchemy.orm import Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere # Adjust import paths as necessary from ..database import Base -from .base import BaseModel # Inherits id, created_at, updated_at -from .types import timestamp_nullable # Import custom type for nullable timestamp +from .base import BaseModel # Inherits id, created_at, updated_at +from .types import timestamp_nullable # Import custom type for nullable timestamp logger = logging.getLogger(__name__) @@ -25,6 +30,7 @@ # if TYPE_CHECKING: # from .keyword_repository_association import KeywordRepositoryAssociation + class KeywordSearchSession(BaseModel, Base): """ Represents a single execution of a keyword search task. 
@@ -44,6 +50,7 @@ class KeywordSearchSession(BaseModel, Base): completed_at: Timestamp when the search task finished (successfully or failed). # repository_associations: Optional relationship to link to the actual results. """ + __tablename__ = "keyword_search_sessions" # --- Search Parameters --- @@ -56,7 +63,7 @@ class KeywordSearchSession(BaseModel, Base): # Current status, e.g., 'PENDING', 'RUNNING', 'COMPLETED', 'FAILED'. Indexed for easy querying of task states. status: Mapped[str] = mapped_column( - String, index=True, nullable=False, default='PENDING' + String, index=True, nullable=False, default="PENDING" ) # Stores the number of results found upon successful completion. results_count: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) @@ -89,14 +96,20 @@ class KeywordSearchSession(BaseModel, Base): __table_args__ = ( # Index on the 'status' column is crucial for efficiently finding sessions # that are pending, running, failed, etc., for monitoring or retries. - Index('ix_keyword_search_sessions_status', 'status'), + Index("ix_keyword_search_sessions_status", "status"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - repr_id = getattr(self, 'id', None) + repr_id = getattr(self, "id", None) # Truncate long keyword strings for readability - keywords_repr = (self.keywords_raw[:50] + '...') if len(self.keywords_raw) > 50 else self.keywords_raw - return (f"") \ No newline at end of file + keywords_repr = ( + (self.keywords_raw[:50] + "...") + if len(self.keywords_raw) > 50 + else self.keywords_raw + ) + return ( + f"" + ) diff --git a/backend/data/models/owner.py b/backend/data/models/owner.py index 4889d88..1e7eadb 100644 --- a/backend/data/models/owner.py +++ b/backend/data/models/owner.py @@ -5,18 +5,23 @@ User or an Organization) that can own repositories. 
""" -from typing import List, TYPE_CHECKING # TYPE_CHECKING needed for relationship hint -from sqlalchemy import String, BigInteger, Index, ForeignKey # ForeignKey needed if relationships defined on this side +from typing import List, TYPE_CHECKING # TYPE_CHECKING needed for relationship hint +from sqlalchemy import ( + String, + BigInteger, + Index, +) # ForeignKey needed if relationships defined on this side from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere # Adjust import paths as necessary from ..database import Base -from .base import BaseModel # Inherits id, created_at, updated_at +from .base import BaseModel # Inherits id, created_at, updated_at # Use TYPE_CHECKING to prevent circular imports when type hinting the relationship if TYPE_CHECKING: - from .repository import Repository # For the one-to-many relationship + from .repository import Repository # For the one-to-many relationship + class Owner(BaseModel, Base): """ @@ -37,24 +42,35 @@ class Owner(BaseModel, Base): api_url: URL to the owner's data endpoint in the GitHub API. repositories: One-to-many relationship linking this owner to the Repositories they own. """ + __tablename__ = "owners" # --- GitHub Identifiers and Details --- # Core information identifying the GitHub owner account. # GitHub's unique numerical ID for the User or Organization. Indexed for fast lookups. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # GitHub login name (username or organization name). Must be unique and indexed. login: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) # Type distinguishes between individual users and organizations. Indexed for filtering. 
- type: Mapped[str] = mapped_column(String, index=True, nullable=False) # Typically 'User' or 'Organization' + type: Mapped[str] = mapped_column( + String, index=True, nullable=False + ) # Typically 'User' or 'Organization' # Optional profile details retrieved from GitHub. - avatar_url: Mapped[str | None] = mapped_column(String, nullable=True) # Accepts str or None - html_url: Mapped[str | None] = mapped_column(String, nullable=True) # Link to GitHub profile page - api_url: Mapped[str | None] = mapped_column(String, nullable=True) # Link to GitHub API data for this owner + avatar_url: Mapped[str | None] = mapped_column( + String, nullable=True + ) # Accepts str or None + html_url: Mapped[str | None] = mapped_column( + String, nullable=True + ) # Link to GitHub profile page + api_url: Mapped[str | None] = mapped_column( + String, nullable=True + ) # Link to GitHub API data for this owner # --- Relationships --- # Defines the connection to the repositories owned by this entity. @@ -67,8 +83,7 @@ class Owner(BaseModel, Base): # carefully considered. Alternatives might include preventing deletion if # repositories exist or setting the repository's owner_id to NULL (if allowed). repositories: Mapped[List["Repository"]] = relationship( - back_populates="owner", - cascade="all, delete-orphan" + back_populates="owner", cascade="all, delete-orphan" ) # --- Table Arguments --- @@ -76,11 +91,11 @@ class Owner(BaseModel, Base): __table_args__ = ( # Explicitly create an index on the 'type' column. This is useful for queries # that specifically target only users or only organizations. 
- Index('ix_owners_type', 'type'), + Index("ix_owners_type", "type"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/person.py b/backend/data/models/person.py index c3ad300..0ac029b 100644 --- a/backend/data/models/person.py +++ b/backend/data/models/person.py @@ -8,8 +8,9 @@ """ import logging -from typing import List, Optional, Dict, Any, TYPE_CHECKING +from typing import List, Optional, TYPE_CHECKING from sqlalchemy import String, Index + # Import JSONB type for handling JSON data in PostgreSQL, specifically for alternative names. from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import relationship, Mapped, mapped_column @@ -21,10 +22,13 @@ # Use TYPE_CHECKING to prevent circular imports for type hints, especially for relationships. if TYPE_CHECKING: - from .authorship import Authorship # For the one-to-many relationship to Authorship records + from .authorship import ( + Authorship, + ) # For the one-to-many relationship to Authorship records logger = logging.getLogger(__name__) + class Person(BaseModel, Base): """ Represents a person, typically identified via scholarly metadata sources. @@ -45,16 +49,21 @@ class Person(BaseModel, Base): authorships: One-to-many relationship linking this person to their Authorship records (representing their role on specific Works). """ + __tablename__ = "persons" # --- Identifiers --- # Key unique identifiers linking this person to external scholarly systems. # OpenAlex unique ID. Essential for linking to OpenAlex data. Indexed. 
- openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # ORCID iD provides a persistent digital identifier for researchers. Unique and indexed. - orcid: Mapped[Optional[str]] = mapped_column(String, unique=True, index=True, nullable=True) + orcid: Mapped[Optional[str]] = mapped_column( + String, unique=True, index=True, nullable=True + ) # --- Name Information --- # Stores the person's name and known variations. @@ -64,7 +73,9 @@ class Person(BaseModel, Base): # Stores a list of alternative names (e.g., ["J. Smith", "Johnathan Smith"]) # using JSONB for flexibility and efficient querying within the list in PostgreSQL. - display_name_alternatives: Mapped[Optional[List[str]]] = mapped_column(JSONB, nullable=True) + display_name_alternatives: Mapped[Optional[List[str]]] = mapped_column( + JSONB, nullable=True + ) # --- Relationships --- # Defines how Persons connect to their contributions (Works via Authorships). @@ -76,22 +87,23 @@ class Person(BaseModel, Base): # Authorship records (and consequently their Affiliations) are also deleted. # This implies that removing a person removes all their recorded publication links. authorships: Mapped[List["Authorship"]] = relationship( - back_populates="person", - cascade="all, delete-orphan" + back_populates="person", cascade="all, delete-orphan" ) # --- Table Arguments --- # Explicitly define indexes for optimized query performance, particularly on identifiers. # While unique=True implies an index, defining them here ensures clarity. 
__table_args__ = ( - Index('ix_persons_openalex_id', 'openalex_id'), # Index on OpenAlex ID - Index('ix_persons_orcid', 'orcid'), # Index on ORCID - Index('ix_persons_display_name', 'display_name'), # Index on primary name for searching + Index("ix_persons_openalex_id", "openalex_id"), # Index on OpenAlex ID + Index("ix_persons_orcid", "orcid"), # Index on ORCID + Index( + "ix_persons_display_name", "display_name" + ), # Index on primary name for searching ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) + obj_id = getattr(self, "id", None) orcid_repr = f", orcid={self.orcid}" if self.orcid else "" - return f"" \ No newline at end of file + return f"" diff --git a/backend/data/models/pr_review_comment.py b/backend/data/models/pr_review_comment.py index c74d26e..7886508 100644 --- a/backend/data/models/pr_review_comment.py +++ b/backend/data/models/pr_review_comment.py @@ -7,11 +7,9 @@ import logging from typing import Optional, TYPE_CHECKING -from datetime import datetime # Required for DateTime type hints +from datetime import datetime # Required for DateTime type hints -from sqlalchemy import ( - String, Integer, Text, Boolean, DateTime, BigInteger, ForeignKey, Index -) +from sqlalchemy import Text, DateTime, BigInteger, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -22,10 +20,13 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: from .pull_request import PullRequest - from .contributor import Contributor # Assumes the comment author is stored as a Contributor + from .contributor import ( + Contributor, + ) # Assumes the comment author is stored as a Contributor logger = logging.getLogger(__name__) + class PRReviewComment(BaseModel, Base): """ Represents a comment made during a GitHub Pull Request code 
review. @@ -48,17 +49,22 @@ class PRReviewComment(BaseModel, Base): pull_request: Relationship back to the parent PullRequest object. user: Relationship back to the Contributor (author) object. """ + __tablename__ = "pr_review_comments" # --- GitHub Identifiers --- # Unique IDs connecting this record to the source GitHub data. # GitHub's unique ID for this specific review comment. Indexed. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # The ID of the overarching review summary/submission this comment belongs to. # Can be nullable as some comments might exist outside a formal review submission. Indexed. - pull_request_review_id: Mapped[Optional[int]] = mapped_column(BigInteger, nullable=True, index=True) + pull_request_review_id: Mapped[Optional[int]] = mapped_column( + BigInteger, nullable=True, index=True + ) # --- Foreign Keys --- # Links to the parent Pull Request and the authoring Contributor. @@ -76,15 +82,21 @@ class PRReviewComment(BaseModel, Base): # --- Comment Content --- # The actual text of the review comment. - body: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Use Text for potentially long comments. + body: Mapped[Optional[str]] = mapped_column( + Text, nullable=True + ) # Use Text for potentially long comments. # --- GitHub Timestamps --- # Stores the original timestamps from GitHub, preserving timezone information. # When the comment was created on GitHub. - gh_created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_created_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the comment was last updated on GitHub. 
- gh_updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_updated_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # --- Relationships --- # Define relationships for navigating from a PRReviewComment instance. @@ -101,17 +113,19 @@ class PRReviewComment(BaseModel, Base): # Define indexes to optimize common query patterns. __table_args__ = ( # Index on the foreign key to Pull Request. - Index('ix_pr_review_comments_pr_id', 'pr_id'), + Index("ix_pr_review_comments_pr_id", "pr_id"), # Index on the foreign key to the user (author). - Index('ix_pr_review_comments_user_id', 'user_id'), + Index("ix_pr_review_comments_user_id", "user_id"), # Index on the GitHub review ID (pull_request_review_id). Useful if querying comments by review. # This index was already present via `index=True` on the column, but explicit definition is fine. - Index('ix_pr_review_comments_review_id', 'pull_request_review_id'), + Index("ix_pr_review_comments_review_id", "pull_request_review_id"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) - return (f"") \ No newline at end of file + obj_id = getattr(self, "id", None) + return ( + f"" + ) diff --git a/backend/data/models/pull_request.py b/backend/data/models/pull_request.py index 7f0c45f..5f216ab 100644 --- a/backend/data/models/pull_request.py +++ b/backend/data/models/pull_request.py @@ -7,11 +7,9 @@ import logging from typing import Optional, TYPE_CHECKING -from datetime import datetime # Required for DateTime type hints +from datetime import datetime # Required for DateTime type hints -from sqlalchemy import ( - String, Integer, Text, Boolean, DateTime, BigInteger, ForeignKey, Index -) +from sqlalchemy import String, Integer, Text, DateTime, BigInteger, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, 
mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -22,10 +20,13 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: from .repository import Repository - from .contributor import Contributor # Assumes the PR author is stored as a Contributor + from .contributor import ( + Contributor, + ) # Assumes the PR author is stored as a Contributor logger = logging.getLogger(__name__) + class PullRequest(BaseModel, Base): """ Represents a GitHub Pull Request linked to a Repository. @@ -51,13 +52,16 @@ class PullRequest(BaseModel, Base): repository: Relationship back to the parent Repository object. user: Relationship back to the Contributor (author) object. """ + __tablename__ = "pull_requests" # --- GitHub Identifier --- # Unique ID connecting this record to the source GitHub data. # GitHub's unique ID for this specific pull request. Indexed. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # --- Foreign Keys --- # Links to related entities (Repository, Contributor). @@ -87,13 +91,21 @@ class PullRequest(BaseModel, Base): # Stores key lifecycle timestamps from GitHub, preserving timezone information. # When the PR was created on GitHub. - gh_created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_created_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the PR was last updated on GitHub. - gh_updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_updated_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the PR was closed on GitHub (whether merged or not). NULL if still open. 
- gh_closed_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_closed_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # When the PR was merged on GitHub. NULL if not merged (either open or closed without merge). - gh_merged_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_merged_at: Mapped[Optional[datetime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # --- Relationships --- # Define relationships for navigating from a PullRequest instance. @@ -110,20 +122,22 @@ class PullRequest(BaseModel, Base): # Define indexes to optimize common query patterns. __table_args__ = ( # Individual indexes on foreign keys, state, and number. - Index('ix_pull_requests_repo_id', 'repository_id'), - Index('ix_pull_requests_user_id', 'user_id'), - Index('ix_pull_requests_state', 'state'), - Index('ix_pull_requests_number', 'number'), + Index("ix_pull_requests_repo_id", "repository_id"), + Index("ix_pull_requests_user_id", "user_id"), + Index("ix_pull_requests_state", "state"), + Index("ix_pull_requests_number", "number"), # Composite index for efficiently finding a specific PR number within a specific repo. - Index('ix_pull_requests_repo_number', 'repository_id', 'number'), + Index("ix_pull_requests_repo_number", "repository_id", "number"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) + obj_id = getattr(self, "id", None) # Display 'merged' status explicitly if applicable, otherwise show 'open'/'closed'. 
merged_status = "merged" if self.gh_merged_at else self.state - return (f"") \ No newline at end of file + return ( + f"" + ) diff --git a/backend/data/models/repository.py b/backend/data/models/repository.py index ec38dc2..004ad85 100644 --- a/backend/data/models/repository.py +++ b/backend/data/models/repository.py @@ -9,7 +9,14 @@ from sqlalchemy.dialects.postgresql import JSONB from typing import List, Optional, TYPE_CHECKING, Dict, Any from sqlalchemy import ( - String, Integer, Text, Boolean, DateTime, BigInteger, ForeignKey, Index + String, + Integer, + Text, + Boolean, + DateTime, + BigInteger, + ForeignKey, + Index, ) from sqlalchemy.orm import relationship, Mapped, mapped_column @@ -26,6 +33,7 @@ from .doi_reference import DOIReference # If relationships to Issues, PullRequests, etc., are added here, import them too. + class Repository(BaseModel, Base): """ Represents a code repository, typically sourced from platforms like GitHub. @@ -62,17 +70,22 @@ class Repository(BaseModel, Base): contributors: Many-to-many relationship linking to Contributors via the association table. doi_references: One-to-many relationship linking to DOIReference records found within this repository. """ + __tablename__ = "repositories" # --- GitHub Identifiers and Core Metadata --- # Essential information retrieved directly from the source platform (e.g., GitHub). # GitHub's unique numerical ID. Indexed for fast lookups. - github_id: Mapped[int] = mapped_column(BigInteger, unique=True, index=True, nullable=False) + github_id: Mapped[int] = mapped_column( + BigInteger, unique=True, index=True, nullable=False + ) # Repository name (e.g., 'my-project'). name: Mapped[str] = mapped_column(String, nullable=False) # Full name including owner (e.g., 'my-org/my-project'). Unique and indexed. 
- full_name: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + full_name: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # User-provided description. Text allows for longer content. description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Link to an external project website. @@ -92,7 +105,9 @@ class Repository(BaseModel, Base): # Basic engagement metrics from GitHub. Defaults ensure non-null integer values. stargazers_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) - watchers_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) # GitHub API: 'subscribers_count' + watchers_count: Mapped[int] = mapped_column( + Integer, default=0, nullable=False + ) # GitHub API: 'subscribers_count' forks_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) open_issues_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) # Flag indicating if the repository is a direct copy (fork) of another. @@ -100,9 +115,15 @@ class Repository(BaseModel, Base): # --- GitHub Timestamps --- # Stores key lifecycle timestamps from GitHub, preserving timezone information. - gh_created_at: Mapped[Optional[DateTime]] = mapped_column(DateTime(timezone=True), nullable=True) - gh_updated_at: Mapped[Optional[DateTime]] = mapped_column(DateTime(timezone=True), nullable=True) - gh_pushed_at: Mapped[Optional[DateTime]] = mapped_column(DateTime(timezone=True), nullable=True) + gh_created_at: Mapped[Optional[DateTime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) + gh_updated_at: Mapped[Optional[DateTime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) + gh_pushed_at: Mapped[Optional[DateTime]] = mapped_column( + DateTime(timezone=True), nullable=True + ) # --- Enriched Metadata (Added Fields) --- # Storing structured data like topics and license info. 
@@ -116,7 +137,9 @@ class Repository(BaseModel, Base): # --- Foreign Key to Owner --- # Links the repository to its owning User or Organization. Indexed. - owner_id: Mapped[int] = mapped_column(ForeignKey("owners.id"), index=True, nullable=False) + owner_id: Mapped[int] = mapped_column( + ForeignKey("owners.id"), index=True, nullable=False + ) # --- Relationships --- # Defines connections to other related entities. @@ -129,8 +152,7 @@ class Repository(BaseModel, Base): # `secondary` specifies the association table ('repository_contributors'). # `back_populates` links to the 'repositories' collection on the Contributor model. contributors: Mapped[List["Contributor"]] = relationship( - secondary="repository_contributors", - back_populates="repositories" + secondary="repository_contributors", back_populates="repositories" ) # One-to-Many relationship to discovered DOI references within this repository. @@ -138,8 +160,7 @@ class Repository(BaseModel, Base): # `cascade="all, delete-orphan"` ensures that if a Repository is deleted, all # associated DOIReference records are also deleted. doi_references: Mapped[List["DOIReference"]] = relationship( - back_populates="repository", - cascade="all, delete-orphan" + back_populates="repository", cascade="all, delete-orphan" ) # --- Table Arguments --- @@ -147,11 +168,11 @@ class Repository(BaseModel, Base): __table_args__ = ( # Index on the primary language for efficient filtering or grouping by language. # Note: index=True on the column definition above achieves the same. 
- Index('ix_repositories_language', 'language'), + Index("ix_repositories_language", "language"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/repository_contributor.py b/backend/data/models/repository_contributor.py index 638dc80..f4f876c 100644 --- a/backend/data/models/repository_contributor.py +++ b/backend/data/models/repository_contributor.py @@ -7,13 +7,14 @@ """ from typing import Optional -from sqlalchemy import Integer, ForeignKey, UniqueConstraint # UniqueConstraint might be needed elsewhere -from sqlalchemy.orm import Mapped, mapped_column, relationship +from sqlalchemy import Integer, ForeignKey # UniqueConstraint might be needed elsewhere +from sqlalchemy.orm import Mapped, mapped_column # Assuming Base is correctly defined elsewhere # Adjust import path as necessary from ..database import Base + class RepositoryContributorAssociation(Base): """ Association table linking Repositories and Contributors (Many-to-Many). @@ -32,6 +33,7 @@ class RepositoryContributorAssociation(Base): contributions_count: Optional field storing the number of contributions made by the contributor to the repository (e.g., from GitHub API). """ + __tablename__ = "repository_contributors" # --- Composite Primary Key / Foreign Keys --- @@ -39,10 +41,14 @@ class RepositoryContributorAssociation(Base): # specific repository and one specific contributor. # Foreign key referencing the Repository table. Part of the composite PK. - repository_id: Mapped[int] = mapped_column(ForeignKey("repositories.id"), primary_key=True) + repository_id: Mapped[int] = mapped_column( + ForeignKey("repositories.id"), primary_key=True + ) # Foreign key referencing the Contributor table. Part of the composite PK. 
- contributor_id: Mapped[int] = mapped_column(ForeignKey("contributors.id"), primary_key=True) + contributor_id: Mapped[int] = mapped_column( + ForeignKey("contributors.id"), primary_key=True + ) # --- Optional Association Metadata --- # Additional information about the specific contribution relationship. @@ -67,6 +73,12 @@ class RepositoryContributorAssociation(Base): def __repr__(self): """Provides a concise string representation for debugging and logging.""" - count_repr = f", count={self.contributions_count}" if self.contributions_count is not None else "" - return (f"") \ No newline at end of file + count_repr = ( + f", count={self.contributions_count}" + if self.contributions_count is not None + else "" + ) + return ( + f"" + ) diff --git a/backend/data/models/repository_institution_affiliation.py b/backend/data/models/repository_institution_affiliation.py index 5cccc69..ee42bd6 100644 --- a/backend/data/models/repository_institution_affiliation.py +++ b/backend/data/models/repository_institution_affiliation.py @@ -7,12 +7,10 @@ """ import logging -from datetime import datetime from typing import Dict, Any, Optional, TYPE_CHECKING -from sqlalchemy import ( - String, Integer, Float, DateTime, ForeignKey, Index, PrimaryKeyConstraint, func -) +from sqlalchemy import String, Float, ForeignKey, Index, PrimaryKeyConstraint + # Import JSONB type for storing structured evidence/parameters in PostgreSQL from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import relationship, Mapped, mapped_column @@ -20,8 +18,11 @@ # Assuming Base is correctly defined elsewhere # Adjust import path as necessary from ..database import Base + # Import custom timestamp types for consistency -from .types import timestamp_created # Using created timestamp logic for calculation time +from .types import ( + timestamp_created, +) # Using created timestamp logic for calculation time # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: @@ -30,6 +31,7 
@@ logger = logging.getLogger(__name__) + class RepositoryInstitutionAffiliation(Base): """ Represents a potential affiliation link between a Repository and an Institution. @@ -55,6 +57,7 @@ class RepositoryInstitutionAffiliation(Base): repository: Relationship back to the Repository object. institution: Relationship back to the Institution object. """ + __tablename__ = "repository_institution_affiliations" # --- Composite Primary Key Components --- @@ -62,12 +65,14 @@ class RepositoryInstitutionAffiliation(Base): # Foreign key to the Repository being linked. repository_id: Mapped[int] = mapped_column( - ForeignKey("repositories.id", ondelete="CASCADE"), primary_key=True + ForeignKey("repositories.id", ondelete="CASCADE"), + primary_key=True, # `ondelete="CASCADE"`: If the repository is deleted, associated affiliation results are removed. ) # Foreign key to the Institution being linked. institution_id: Mapped[int] = mapped_column( - ForeignKey("institutions.id", ondelete="CASCADE"), primary_key=True + ForeignKey("institutions.id", ondelete="CASCADE"), + primary_key=True, # `ondelete="CASCADE"`: If the institution is deleted, associated affiliation results are removed. ) # Identifier for the affiliation prediction algorithm used. @@ -85,7 +90,9 @@ class RepositoryInstitutionAffiliation(Base): evidence: Mapped[Optional[Dict[str, Any]]] = mapped_column(JSONB, nullable=True) # Records the parameters used by the algorithm for this specific run. # Example: {'threshold': 0.7, 'use_email_heuristics': True} - parameters_used: Mapped[Optional[Dict[str, Any]]] = mapped_column(JSONB, nullable=True) + parameters_used: Mapped[Optional[Dict[str, Any]]] = mapped_column( + JSONB, nullable=True + ) # --- Timestamping --- # Indicates when this specific affiliation record was created/calculated. @@ -106,16 +113,20 @@ class RepositoryInstitutionAffiliation(Base): # Define the composite primary key constraint explicitly and add indexes. 
__table_args__ = ( # Explicit definition of the composite primary key constraint. - PrimaryKeyConstraint('repository_id', 'institution_id', 'algorithm_name', 'algorithm_version'), + PrimaryKeyConstraint( + "repository_id", "institution_id", "algorithm_name", "algorithm_version" + ), # Indexes on individual foreign key columns and algorithm name facilitate efficient # lookups, e.g., finding all affiliations for a repo, or all results from a specific algorithm. - Index('ix_repo_inst_affil_repo_id', 'repository_id'), - Index('ix_repo_inst_affil_inst_id', 'institution_id'), - Index('ix_repo_inst_affil_algo_name', 'algorithm_name'), + Index("ix_repo_inst_affil_repo_id", "repository_id"), + Index("ix_repo_inst_affil_inst_id", "institution_id"), + Index("ix_repo_inst_affil_algo_name", "algorithm_name"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" - return (f"") # Format score for readability \ No newline at end of file + return ( + f"" + ) # Format score for readability diff --git a/backend/data/models/software_dependency.py b/backend/data/models/software_dependency.py index 2be1bea..8669273 100644 --- a/backend/data/models/software_dependency.py +++ b/backend/data/models/software_dependency.py @@ -9,13 +9,14 @@ import logging from typing import Optional, TYPE_CHECKING -from sqlalchemy import String, Integer, ForeignKey, Index, Boolean +from sqlalchemy import String, ForeignKey, Index, Boolean from sqlalchemy.orm import relationship, Mapped, mapped_column # Adjust the import path according to your project structure # Assuming Base is defined in backend.data.database from backend.data.database import Base + # Assuming BaseModel provides id, created_at, updated_at from .base import BaseModel @@ -27,6 +28,7 @@ logger = logging.getLogger(__name__) + class SoftwareDependency(BaseModel, Base): """ Represents a software dependency found within a repository file. 
@@ -46,14 +48,17 @@ class SoftwareDependency(BaseModel, Base): is_dev_dependency: Flag indicating if this is marked as a development dependency. repository: Relationship back to the parent Repository object. """ + __tablename__ = "software_dependencies" # --- Foreign Key --- # Links this dependency record back to the repository it was found in. repository_id: Mapped[int] = mapped_column( - ForeignKey("repositories.id", ondelete="CASCADE"), # Cascade delete if repo is removed - index=True, # Index for efficient lookup of dependencies by repository - nullable=False + ForeignKey( + "repositories.id", ondelete="CASCADE" + ), # Cascade delete if repo is removed + index=True, # Index for efficient lookup of dependencies by repository + nullable=False, ) # --- Dependency Details --- @@ -81,7 +86,9 @@ class SoftwareDependency(BaseModel, Base): # Flag indicating if the dependency is designated for development purposes only # (e.g., in 'devDependencies' in package.json). Indexed for filtering. # Nullable if the concept doesn't apply or wasn't determined. - is_dev_dependency: Mapped[Optional[bool]] = mapped_column(Boolean, index=True, nullable=True) + is_dev_dependency: Mapped[Optional[bool]] = mapped_column( + Boolean, index=True, nullable=True + ) # --- Relationships --- # Define relationship(s) for navigation. @@ -95,20 +102,24 @@ class SoftwareDependency(BaseModel, Base): # Define explicit indexes to optimize common query patterns. __table_args__ = ( # Index on repository_id (already indexed via column def, but explicit). - Index('ix_software_dependencies_repo_id', 'repository_id'), + Index("ix_software_dependencies_repo_id", "repository_id"), # Index on dependency_name for finding usage of specific packages across repos. - Index('ix_software_dependencies_name', 'dependency_name'), + Index("ix_software_dependencies_name", "dependency_name"), # Index on dependency_type for filtering by ecosystem. 
- Index('ix_software_dependencies_type', 'dependency_type'), + Index("ix_software_dependencies_type", "dependency_type"), # Index on is_dev_dependency flag for distinguishing runtime vs dev dependencies. - Index('ix_software_dependencies_is_dev', 'is_dev_dependency'), + Index("ix_software_dependencies_is_dev", "is_dev_dependency"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) - version_str = f", version='{self.version_constraint}'" if self.version_constraint else "" + obj_id = getattr(self, "id", None) + version_str = ( + f", version='{self.version_constraint}'" if self.version_constraint else "" + ) dev_flag = ", dev" if self.is_dev_dependency else "" - return (f"") \ No newline at end of file + return ( + f"" + ) diff --git a/backend/data/models/subfield.py b/backend/data/models/subfield.py index d8e45b9..d718b87 100644 --- a/backend/data/models/subfield.py +++ b/backend/data/models/subfield.py @@ -8,7 +8,7 @@ import logging from typing import List, Optional, TYPE_CHECKING -from sqlalchemy import String, Text, Integer, ForeignKey, Index +from sqlalchemy import String, Text, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -18,11 +18,12 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: - from .field import Field # For the many-to-one relationship to Field - from .topic import Topic # For the one-to-many relationship to Topics + from .field import Field # For the many-to-one relationship to Field + from .topic import Topic # For the one-to-many relationship to Topics logger = logging.getLogger(__name__) + class Subfield(BaseModel, Base): """ Represents an OpenAlex Subfield, the third tier in the subject hierarchy. 
@@ -41,13 +42,16 @@ class Subfield(BaseModel, Base): field: Many-to-one relationship back to the parent Field object. topics: One-to-many relationship linking this Subfield to its constituent Topics. """ + __tablename__ = "subfields" # --- Identifiers and Details --- # Core attributes defining the Subfield based on OpenAlex data. # OpenAlex unique ID for the Subfield. Indexed for fast lookups. - openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # Human-readable name. Indexed for searching and display. display_name: Mapped[str] = mapped_column(String, index=True, nullable=False) @@ -58,9 +62,9 @@ class Subfield(BaseModel, Base): # --- Foreign Key to Parent Field --- # Establishes the hierarchical link within the subject classification. field_id: Mapped[int] = mapped_column( - ForeignKey("fields.id", ondelete="CASCADE"), # Links to the parent Field - index=True, # Index for efficient lookup of Subfields within a Field - nullable=False + ForeignKey("fields.id", ondelete="CASCADE"), # Links to the parent Field + index=True, # Index for efficient lookup of Subfields within a Field + nullable=False, # 'ondelete="CASCADE"' ensures that if a Field is deleted, all its child # Subfields (and consequently their Topics) are also deleted. ) @@ -79,23 +83,22 @@ class Subfield(BaseModel, Base): # `cascade="all, delete-orphan"` ensures that if a Subfield is deleted, all its # associated Topics are also removed from the database. topics: Mapped[List["Topic"]] = relationship( - back_populates="subfield", - cascade="all, delete-orphan" + back_populates="subfield", cascade="all, delete-orphan" ) # --- Table Arguments --- # Explicitly define indexes for optimized query performance. __table_args__ = ( # Index on OpenAlex ID (unique already implies index, but explicit). 
- Index('ix_subfields_openalex_id', 'openalex_id'), + Index("ix_subfields_openalex_id", "openalex_id"), # Index on display name for text searches or sorting. - Index('ix_subfields_display_name', 'display_name'), + Index("ix_subfields_display_name", "display_name"), # Index on the foreign key to the parent Field (already indexed via column def, but explicit). - Index('ix_subfields_field_id', 'field_id'), + Index("ix_subfields_field_id", "field_id"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/topic.py b/backend/data/models/topic.py index c37bb44..fa6c27b 100644 --- a/backend/data/models/topic.py +++ b/backend/data/models/topic.py @@ -7,9 +7,9 @@ """ import logging -from typing import List, Optional, TYPE_CHECKING +from typing import Optional, TYPE_CHECKING -from sqlalchemy import String, Text, Integer, ForeignKey, Index +from sqlalchemy import String, Text, ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -19,11 +19,12 @@ # Use TYPE_CHECKING to prevent circular imports for type hints if TYPE_CHECKING: - from .subfield import Subfield # For the many-to-one relationship to Subfield + from .subfield import Subfield # For the many-to-one relationship to Subfield # The relationship to WorkTopic (and thus Works) is defined in WorkTopic model. logger = logging.getLogger(__name__) + class Topic(BaseModel, Base): """ Represents an OpenAlex Topic, the fourth and often most specific tier @@ -43,13 +44,16 @@ class Topic(BaseModel, Base): subfield: Many-to-one relationship back to the parent Subfield object. # Note: The link to Works is via the WorkTopic association model. 
""" + __tablename__ = "topics" # --- Identifiers and Details --- # Core attributes defining the Topic based on OpenAlex data. # OpenAlex unique ID for the Topic. Indexed for fast lookups. - openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # Human-readable name. Indexed for searching and display. display_name: Mapped[str] = mapped_column(String, index=True, nullable=False) @@ -60,9 +64,9 @@ class Topic(BaseModel, Base): # --- Foreign Key to Parent Subfield --- # Establishes the hierarchical link within the subject classification. subfield_id: Mapped[int] = mapped_column( - ForeignKey("subfields.id", ondelete="CASCADE"), # Links to the parent Subfield - index=True, # Index for efficient lookup of Topics within a Subfield - nullable=False + ForeignKey("subfields.id", ondelete="CASCADE"), # Links to the parent Subfield + index=True, # Index for efficient lookup of Topics within a Subfield + nullable=False, # 'ondelete="CASCADE"' ensures that if a Subfield is deleted, all its child Topics # are also deleted. This propagates deletions up the hierarchy if a Domain/Field is removed. ) @@ -90,15 +94,15 @@ class Topic(BaseModel, Base): # Explicitly define indexes for optimized query performance. __table_args__ = ( # Index on OpenAlex ID (unique already implies index, but explicit). - Index('ix_topics_openalex_id', 'openalex_id'), + Index("ix_topics_openalex_id", "openalex_id"), # Index on display name for text searches or sorting. - Index('ix_topics_display_name', 'display_name'), + Index("ix_topics_display_name", "display_name"), # Index on the foreign key to the parent Subfield (already indexed via column def, but explicit). 
- Index('ix_topics_subfield_id', 'subfield_id'), + Index("ix_topics_subfield_id", "subfield_id"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) - return f"" \ No newline at end of file + obj_id = getattr(self, "id", None) + return f"" diff --git a/backend/data/models/types.py b/backend/data/models/types.py index 53cd9ad..28abf50 100644 --- a/backend/data/models/types.py +++ b/backend/data/models/types.py @@ -20,42 +20,48 @@ # Define a standard integer primary key column. # Includes auto-incrementing, indexing, and marking as the primary key. intpk = Annotated[ - int, # Python type hint - mapped_column(Integer, primary_key=True, index=True, autoincrement=True) # SQLAlchemy config + int, # Python type hint + mapped_column( + Integer, primary_key=True, index=True, autoincrement=True + ), # SQLAlchemy config ] # Define a standard timestamp column, ensuring timezone awareness. # It expects a Python `datetime` object and maps to a database DateTime type # that stores timezone information (e.g., TIMESTAMPTZ in PostgreSQL). timestamp = Annotated[ - datetime, # Python type hint - mapped_column(DateTime(timezone=True), nullable=False) # SQLAlchemy config: timezone=True, not nullable + datetime, # Python type hint + mapped_column( + DateTime(timezone=True), nullable=False + ), # SQLAlchemy config: timezone=True, not nullable ] # Define a nullable version of the standard timestamp column. # Useful for optional timestamps like 'completed_at' or 'deleted_at'. timestamp_nullable = Annotated[ - datetime, # Python type hint - mapped_column(DateTime(timezone=True), nullable=True) # SQLAlchemy config: timezone=True, nullable + datetime, # Python type hint + mapped_column( + DateTime(timezone=True), nullable=True + ), # SQLAlchemy config: timezone=True, nullable ] # Define a timestamp column specifically for tracking creation time. 
# Automatically sets the timestamp using the database's clock (`func.now()`) # when a record is first inserted (`server_default`). It is not nullable. timestamp_created = Annotated[ - datetime, # Python type hint - mapped_column(DateTime(timezone=True), server_default=func.now(), nullable=False) + datetime, # Python type hint + mapped_column(DateTime(timezone=True), server_default=func.now(), nullable=False), ] # Define a timestamp column specifically for tracking the last update time. # Automatically sets the timestamp on creation (`server_default`) and updates # it whenever the record is modified (`onupdate`). It is not nullable. timestamp_updated = Annotated[ - datetime, # Python type hint + datetime, # Python type hint mapped_column( DateTime(timezone=True), - server_default=func.now(), # Set on creation - onupdate=func.now(), # Update on modification - nullable=False - ) -] \ No newline at end of file + server_default=func.now(), # Set on creation + onupdate=func.now(), # Update on modification + nullable=False, + ), +] diff --git a/backend/data/models/work.py b/backend/data/models/work.py index b00e982..aacc1ff 100644 --- a/backend/data/models/work.py +++ b/backend/data/models/work.py @@ -7,7 +7,7 @@ """ from typing import List, Optional, TYPE_CHECKING -from sqlalchemy import String, Integer, Text, Index, ForeignKey +from sqlalchemy import String, Integer, Text, Index from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base and BaseModel are correctly defined elsewhere @@ -18,10 +18,13 @@ # Use TYPE_CHECKING to prevent circular imports for type hints, # necessary for defining relationships to other models. 
if TYPE_CHECKING: - from .doi_reference import DOIReference # Links DOIs found in repos back to this Work - from .authorship import Authorship # Links Persons (authors) to this Work - from .work_citation import WorkCitation # Links this Work to cited/citing Works - from .work_topic import WorkTopic # Links this Work to classification Topics + from .doi_reference import ( + DOIReference, + ) # Links DOIs found in repos back to this Work + from .authorship import Authorship # Links Persons (authors) to this Work + from .work_citation import WorkCitation # Links this Work to cited/citing Works + from .work_topic import WorkTopic # Links this Work to classification Topics + class Work(BaseModel, Base): """ @@ -50,13 +53,16 @@ class Work(BaseModel, Base): citations: Relationship to WorkCitation records where this Work is the *cited* work. topics: Relationship to WorkTopic records linking this Work to subject Topics. """ + __tablename__ = "works" # --- Identifiers --- # Key unique identifiers for the scholarly work. # OpenAlex unique ID. Essential for linking with OpenAlex data. Indexed. - openalex_id: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) + openalex_id: Mapped[str] = mapped_column( + String, unique=True, index=True, nullable=False + ) # Digital Object Identifier. Should be unique and is crucial for resolution. Indexed. doi: Mapped[str] = mapped_column(String, unique=True, index=True, nullable=False) @@ -66,13 +72,19 @@ class Work(BaseModel, Base): # Title of the publication. Text allows for long titles. title: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Year of publication. Indexed for filtering by year. - publication_year: Mapped[Optional[int]] = mapped_column(Integer, index=True, nullable=True) + publication_year: Mapped[Optional[int]] = mapped_column( + Integer, index=True, nullable=True + ) # Type of publication according to OpenAlex taxonomy. Indexed. 
type: Mapped[Optional[str]] = mapped_column(String, index=True, nullable=True) # Citation count as reported by the data source (e.g., OpenAlex). - cited_by_count: Mapped[Optional[int]] = mapped_column(Integer, default=0, nullable=True) + cited_by_count: Mapped[Optional[int]] = mapped_column( + Integer, default=0, nullable=True + ) # Display name of the host venue (journal, conference proceedings, etc.). - host_venue_display_name: Mapped[Optional[str]] = mapped_column(String, nullable=True) + host_venue_display_name: Mapped[Optional[str]] = mapped_column( + String, nullable=True + ) # URL linking back to the OpenAlex page for this work. openalex_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) @@ -83,16 +95,14 @@ class Work(BaseModel, Base): # `back_populates` links to the 'work' attribute in DOIReference. # `cascade` ensures associated DOIReferences are deleted if the Work is deleted. doi_references: Mapped[List["DOIReference"]] = relationship( - back_populates="work", - cascade="all, delete-orphan" + back_populates="work", cascade="all, delete-orphan" ) # One-to-Many: A Work typically has multiple Authorships (one per author). # `back_populates` links to the 'work' attribute in Authorship. # `cascade` ensures Authorships (and their Affiliations) are deleted if the Work is deleted. authorships: Mapped[List["Authorship"]] = relationship( - back_populates="work", - cascade="all, delete-orphan" + back_populates="work", cascade="all, delete-orphan" ) # One-to-Many (Self-Referential via WorkCitation): Represents works *cited by* this work. @@ -102,7 +112,7 @@ class Work(BaseModel, Base): references: Mapped[List["WorkCitation"]] = relationship( foreign_keys="WorkCitation.citing_work_id", back_populates="citing_work", - cascade="all, delete-orphan" + cascade="all, delete-orphan", ) # One-to-Many (Self-Referential via WorkCitation): Represents works *that cite* this work. 
@@ -112,31 +122,34 @@ class Work(BaseModel, Base): citations: Mapped[List["WorkCitation"]] = relationship( foreign_keys="WorkCitation.cited_work_id", back_populates="cited_work", - cascade="all, delete-orphan" + cascade="all, delete-orphan", ) # One-to-Many: A Work can be associated with multiple Topics via the WorkTopic association table. # `back_populates` links to the 'work' attribute in the WorkTopic model. # `cascade` ensures WorkTopic entries are deleted if the Work is deleted. topics: Mapped[List["WorkTopic"]] = relationship( - back_populates="work", - cascade="all, delete-orphan" + back_populates="work", cascade="all, delete-orphan" ) # --- Table Arguments --- # Define explicit indexes for commonly queried metadata fields. __table_args__ = ( # Index on publication type for filtering. - Index('ix_works_type', 'type'), + Index("ix_works_type", "type"), # Index on publication year for filtering or sorting by year. - Index('ix_works_publication_year', 'publication_year'), + Index("ix_works_publication_year", "publication_year"), # Note: Indexes on openalex_id and doi are created due to unique=True. 
) def __repr__(self): """Provides a concise string representation for debugging and logging.""" # Safely access 'id' which comes from BaseModel - obj_id = getattr(self, 'id', None) + obj_id = getattr(self, "id", None) # Truncate title for brevity - title_repr = (self.title[:50] + '...') if self.title and len(self.title) > 50 else self.title or '[No Title]' - return f"" \ No newline at end of file + title_repr = ( + (self.title[:50] + "...") + if self.title and len(self.title) > 50 + else self.title or "[No Title]" + ) + return f"" diff --git a/backend/data/models/work_citation.py b/backend/data/models/work_citation.py index e27fd68..b3e361c 100644 --- a/backend/data/models/work_citation.py +++ b/backend/data/models/work_citation.py @@ -8,7 +8,7 @@ import logging from typing import TYPE_CHECKING -from sqlalchemy import Integer, ForeignKey, Index +from sqlalchemy import ForeignKey, Index from sqlalchemy.orm import relationship, Mapped, mapped_column @@ -23,6 +23,7 @@ logger = logging.getLogger(__name__) + class WorkCitation(Base): """ Represents a citation link between two Works (citing -> cited). @@ -41,6 +42,7 @@ class WorkCitation(Base): citing_work: Relationship back to the Work object that is citing. cited_work: Relationship back to the Work object that is being cited. """ + __tablename__ = "work_citations" # --- Composite Primary Key and Foreign Keys --- @@ -68,7 +70,7 @@ class WorkCitation(Base): # (representing the list of works *cited by* that Work). citing_work: Mapped["Work"] = relationship( foreign_keys=[citing_work_id], - back_populates="references" # Corresponds to Work.references + back_populates="references", # Corresponds to Work.references ) # Relationship to the Work entity that is being cited (the cited work). @@ -77,7 +79,7 @@ class WorkCitation(Base): # (representing the list of works *that cite* that Work). 
cited_work: Mapped["Work"] = relationship( foreign_keys=[cited_work_id], - back_populates="citations" # Corresponds to Work.citations + back_populates="citations", # Corresponds to Work.citations ) # --- Table Arguments --- @@ -87,11 +89,12 @@ class WorkCitation(Base): # the citing work or only by the cited work (e.g., finding all references for a work, # or finding all citations of a work). __table_args__ = ( - Index('ix_work_citations_citing_work_id', 'citing_work_id'), - Index('ix_work_citations_cited_work_id', 'cited_work_id'), + Index("ix_work_citations_citing_work_id", "citing_work_id"), + Index("ix_work_citations_cited_work_id", "cited_work_id"), ) def __repr__(self): """Provides a concise string representation for debugging and logging.""" - return (f"") \ No newline at end of file + return ( + f"" + ) diff --git a/backend/data/models/work_topic.py b/backend/data/models/work_topic.py index 9fad2df..4d7556e 100644 --- a/backend/data/models/work_topic.py +++ b/backend/data/models/work_topic.py @@ -9,7 +9,7 @@ import logging from typing import Optional, TYPE_CHECKING -from sqlalchemy import Integer, Float, Boolean, ForeignKey, Index, PrimaryKeyConstraint +from sqlalchemy import Float, Boolean, ForeignKey, Index, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped, mapped_column # Assuming Base is correctly defined elsewhere @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) + class WorkTopic(Base): """ Association table linking Works to their assigned OpenAlex Topics. @@ -44,6 +45,7 @@ class WorkTopic(Base): work: Relationship back to the Work object. topic: Relationship back to the Topic object. """ + __tablename__ = "work_topics" # --- Composite Primary Key and Foreign Keys --- @@ -91,11 +93,11 @@ class WorkTopic(Base): # Define the primary key constraint explicitly and add indexes. __table_args__ = ( # Explicit definition of the composite primary key. 
- PrimaryKeyConstraint('work_id', 'topic_id'), + PrimaryKeyConstraint("work_id", "topic_id"), # Indexes on individual foreign keys improve performance when querying for # all topics of a work, or all works associated with a topic. - Index('ix_work_topics_work_id', 'work_id'), - Index('ix_work_topics_topic_id', 'topic_id'), + Index("ix_work_topics_work_id", "work_id"), + Index("ix_work_topics_topic_id", "topic_id"), # Potentially add index on 'score' or 'is_primary' if frequently used for filtering/sorting. # Index('ix_work_topics_is_primary', 'is_primary'), ) @@ -105,4 +107,4 @@ def __repr__(self): primary_flag = ", primary" if self.is_primary else "" # Format score nicely, handling potential None value. score_repr = f", score={self.score:.3f}" if self.score is not None else "" - return f"" \ No newline at end of file + return f"" diff --git a/backend/data/repositories/__init__.py b/backend/data/repositories/__init__.py index 914ef14..7687dfe 100644 --- a/backend/data/repositories/__init__.py +++ b/backend/data/repositories/__init__.py @@ -12,7 +12,9 @@ from .keyword_repository_association_repo import KeywordRepositoryAssociationRepository from .person_repo import PersonRepository from .institution_repo import InstitutionRepository -from .repository_institution_affiliation_repo import RepositoryInstitutionAffiliationRepository +from .repository_institution_affiliation_repo import ( + RepositoryInstitutionAffiliationRepository, +) from .software_dependency_repo import SoftwareDependencyRepository from .domain_repo import DomainRepository from .field_repo import FieldRepository @@ -20,6 +22,7 @@ from .topic_repo import TopicRepository from .pull_request_repo import PullRequestRepository from .issue_repo import IssueRepository + # --- ADDED --- from .issue_comment_repo import IssueCommentRepository from .pr_review_comment_repo import PRReviewCommentRepository @@ -47,6 +50,6 @@ "TopicRepository", "PullRequestRepository", "IssueRepository", - "IssueCommentRepository", # 
<<< Added - "PRReviewCommentRepository", # <<< Added -] \ No newline at end of file + "IssueCommentRepository", # <<< Added + "PRReviewCommentRepository", # <<< Added +] diff --git a/backend/data/repositories/base_repository.py b/backend/data/repositories/base_repository.py index 76a3a30..68cfa92 100644 --- a/backend/data/repositories/base_repository.py +++ b/backend/data/repositories/base_repository.py @@ -26,6 +26,7 @@ # CreateSchemaType = TypeVar("CreateSchemaType", bound=Dict[str, Any]) # UpdateSchemaType = TypeVar("UpdateSchemaType", bound=Dict[str, Any]) + class BaseRepository(Generic[ModelType]): """ Generic base class for data repositories. @@ -72,14 +73,15 @@ def get(self, id: Any) -> Optional[ModelType]: # Recommended way to fetch by PK in SQLAlchemy >= 1.4 return self.db.get(self.model, id) except SQLAlchemyError as e: - logger.error(f"Database error getting {self.model.__name__} id {id}: {e}", exc_info=True) + logger.error( + f"Database error getting {self.model.__name__} id {id}: {e}", + exc_info=True, + ) # Re-raise allows higher-level handlers (e.g., API endpoints) # to manage the error appropriately (e.g., return HTTP 500). raise - def get_multi( - self, *, skip: int = 0, limit: int = 100 - ) -> List[ModelType]: + def get_multi(self, *, skip: int = 0, limit: int = 100) -> List[ModelType]: """ Retrieves multiple objects with optional pagination. @@ -94,12 +96,17 @@ def get_multi( Raises: SQLAlchemyError: If a database-related error occurs during the query. """ - logger.debug(f"Getting multiple {self.model.__name__}s, skip={skip}, limit={limit}") + logger.debug( + f"Getting multiple {self.model.__name__}s, skip={skip}, limit={limit}" + ) try: # Basic query with offset and limit for pagination. 
return self.db.query(self.model).offset(skip).limit(limit).all() except SQLAlchemyError as e: - logger.error(f"Database error getting multiple {self.model.__name__}s: {e}", exc_info=True) + logger.error( + f"Database error getting multiple {self.model.__name__}s: {e}", + exc_info=True, + ) raise def create(self, *, obj_in_data: Dict[str, Any]) -> ModelType: @@ -128,22 +135,24 @@ def create(self, *, obj_in_data: Dict[str, Any]) -> ModelType: db_obj = self.model(**obj_in_data) try: self.db.add(db_obj) # Add the new object to the session. - self.db.commit() # Persist changes to the database. - self.db.refresh(db_obj) # Update the instance with DB defaults (e.g., ID). + self.db.commit() # Persist changes to the database. + self.db.refresh(db_obj) # Update the instance with DB defaults (e.g., ID). # Attempt to log the ID of the created object if it has an 'id' attribute. - obj_id = getattr(db_obj, 'id', '[unknown ID]') + obj_id = getattr(db_obj, "id", "[unknown ID]") logger.info(f"Created {self.model.__name__} with id: {obj_id}") return db_obj except SQLAlchemyError as e: - logger.error(f"Database error creating {self.model.__name__}: {e}", exc_info=True) - self.db.rollback() # Roll back the transaction on error. + logger.error( + f"Database error creating {self.model.__name__}: {e}", exc_info=True + ) + self.db.rollback() # Roll back the transaction on error. raise def update( self, *, db_obj: ModelType, - obj_in_data: Dict[str, Any] + obj_in_data: Dict[str, Any], # Union type for obj_in can be added later if using Pydantic schemas: # obj_in: Union[UpdateSchemaType, Dict[str, Any]] ) -> ModelType: @@ -167,7 +176,7 @@ def update( The session is rolled back before re-raising. """ # Retrieve the object's ID for logging, if available. - obj_id = getattr(db_obj, 'id', '[unknown ID]') + obj_id = getattr(db_obj, "id", "[unknown ID]") logger.debug(f"Updating {self.model.__name__} id: {obj_id}") # Iterate over the provided data and update the model instance. 
@@ -175,21 +184,26 @@ def update( if hasattr(db_obj, field): setattr(db_obj, field, value) else: - # Log a warning if a field in the input data doesn't exist on the model. - logger.warning(f"Field '{field}' not found in model {self.model.__name__} during update for ID {obj_id}.") + # Log a warning if a field in the input data doesn't exist on the model. + logger.warning( + f"Field '{field}' not found in model {self.model.__name__} during update for ID {obj_id}." + ) try: # Add the modified object to the session (marks it as dirty). # If the object was already persistent, add() is usually a no-op # but ensures it's tracked if detached/re-attached. self.db.add(db_obj) - self.db.commit() # Persist the changes. - self.db.refresh(db_obj) # Refresh the instance state from the DB. + self.db.commit() # Persist the changes. + self.db.refresh(db_obj) # Refresh the instance state from the DB. logger.info(f"Updated {self.model.__name__} with id: {obj_id}") return db_obj except SQLAlchemyError as e: - logger.error(f"Database error updating {self.model.__name__} id {obj_id}: {e}", exc_info=True) - self.db.rollback() # Roll back the transaction on error. + logger.error( + f"Database error updating {self.model.__name__} id {obj_id}: {e}", + exc_info=True, + ) + self.db.rollback() # Roll back the transaction on error. raise def remove(self, *, id: Any) -> Optional[ModelType]: @@ -215,17 +229,22 @@ def remove(self, *, id: Any) -> Optional[ModelType]: obj = self.get(id) if obj: try: - self.db.delete(obj) # Mark the object for deletion. - self.db.commit() # Persist the deletion. + self.db.delete(obj) # Mark the object for deletion. + self.db.commit() # Persist the deletion. logger.info(f"Successfully removed {self.model.__name__} with id: {id}") - return obj # Return the deleted object (now detached from session). + return obj # Return the deleted object (now detached from session). except SQLAlchemyError as e: # Log using the ID available on the object, if possible. 
- obj_id = getattr(obj, 'id', id) - logger.error(f"Database error removing {self.model.__name__} id {obj_id}: {e}", exc_info=True) - self.db.rollback() # Roll back the transaction on error. + obj_id = getattr(obj, "id", id) + logger.error( + f"Database error removing {self.model.__name__} id {obj_id}: {e}", + exc_info=True, + ) + self.db.rollback() # Roll back the transaction on error. raise else: # Log a warning if the object to be removed wasn't found. - logger.warning(f"{self.model.__name__} with id: {id} not found for removal.") - return None \ No newline at end of file + logger.warning( + f"{self.model.__name__} with id: {id} not found for removal." + ) + return None diff --git a/backend/data/repositories/contributor_repo.py b/backend/data/repositories/contributor_repo.py index 05c895b..a79431e 100644 --- a/backend/data/repositories/contributor_repo.py +++ b/backend/data/repositories/contributor_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Contributor # The specific SQLAlchemy model +from backend.data.models import Contributor # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class ContributorRepository(BaseRepository[Contributor]): """ Repository dedicated to CRUD and specific query operations for Contributor entities. @@ -52,15 +53,24 @@ def get_by_github_id(self, *, github_id: int) -> Optional[Contributor]: logger.debug(f"Getting Contributor by github_id: {github_id}") # Basic check if the session is active, useful for debugging transaction issues. 
if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for GitHub ID {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for GitHub ID {github_id}" + ) # Depending on application logic, could raise an error or return None. # Returning None might hide issues, raising might be better in strict contexts. return None try: # Query the Contributor model, filtering by the github_id column. - return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for {github_id}: {e}", + exc_info=True, + ) raise def get_by_login(self, *, login: str) -> Optional[Contributor]: @@ -78,14 +88,16 @@ def get_by_login(self, *, login: str) -> Optional[Contributor]: """ logger.debug(f"Getting Contributor by login: {login}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_login for login '{login}'") - return None + logger.warning(f"Session is inactive in get_by_login for login '{login}'") + return None try: # Query the Contributor model, filtering by the login column. return self.db.query(self.model).filter(self.model.login == login).first() except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_login for {login}: {e}", exc_info=True) - raise + logger.error( + f"SQLAlchemyError during get_by_login for {login}: {e}", exc_info=True + ) + raise def get_or_create_by_github_id( self, *, github_id: int, obj_in_data: Dict[str, Any] @@ -127,11 +139,15 @@ def get_or_create_by_github_id( The caller should handle rollback. 
""" if not github_id: - raise ValueError("github_id cannot be empty for Contributor get_or_create") + raise ValueError("github_id cannot be empty for Contributor get_or_create") # Check session state at the beginning. Crucial for transactional integrity. if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_github_id for Contributor.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_github_id for Contributor." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -139,16 +155,25 @@ def get_or_create_by_github_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Contributor GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Contributor GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False - new_login = obj_in_data.get('login') + new_login = obj_in_data.get("login") # Check if login needs update and handle potential uniqueness conflicts. if new_login and db_obj.login != new_login: - if not self.db.is_active: # Re-check session before subsequent query - raise RuntimeError("Session became inactive before login conflict check during update.") + if ( + not self.db.is_active + ): # Re-check session before subsequent query + raise RuntimeError( + "Session became inactive before login conflict check during update." + ) existing_login_contributor = self.get_by_login(login=new_login) - if existing_login_contributor and existing_login_contributor.id != db_obj.id: + if ( + existing_login_contributor + and existing_login_contributor.id != db_obj.id + ): # Log a warning but proceed without changing the login to avoid unique constraint error. # Alternatively, could raise an error here depending on desired behavior. 
logger.warning( @@ -156,51 +181,68 @@ def get_or_create_by_github_id( f"because it's already assigned to Contributor DB ID {existing_login_contributor.id}. Skipping login update." ) else: - logger.info(f"Updating login for Contributor {db_obj.id} from '{db_obj.login}' to '{new_login}'") + logger.info( + f"Updating login for Contributor {db_obj.id} from '{db_obj.login}' to '{new_login}'" + ) db_obj.login = new_login updated = True # Check and update other fields if they differ. - if obj_in_data.get('type') is not None and db_obj.type != obj_in_data.get('type'): - db_obj.type = obj_in_data['type'] + if obj_in_data.get( + "type" + ) is not None and db_obj.type != obj_in_data.get("type"): + db_obj.type = obj_in_data["type"] + updated = True + if obj_in_data.get( + "avatar_url" + ) is not None and db_obj.avatar_url != obj_in_data.get("avatar_url"): + db_obj.avatar_url = obj_in_data["avatar_url"] + updated = True + if obj_in_data.get( + "html_url" + ) is not None and db_obj.html_url != obj_in_data.get("html_url"): + db_obj.html_url = obj_in_data["html_url"] updated = True - if obj_in_data.get('avatar_url') is not None and db_obj.avatar_url != obj_in_data.get('avatar_url'): - db_obj.avatar_url = obj_in_data['avatar_url'] - updated = True - if obj_in_data.get('html_url') is not None and db_obj.html_url != obj_in_data.get('html_url'): - db_obj.html_url = obj_in_data['html_url'] - updated = True # Add checks for other relevant fields here... if updated: # Add the modified object to the session to mark it for update on commit. self.db.add(db_obj) - logger.info(f"Contributor {db_obj.id} marked for update in the current session.") + logger.info( + f"Contributor {db_obj.id} marked for update in the current session." + ) # Optional: Flush here if the caller needs the updated state # reflected in the DB *before* the final commit. # self.db.flush() # self.db.refresh(db_obj) # Refresh if flushed - return db_obj # Return the existing (potentially updated) object. 
+ return db_obj # Return the existing (potentially updated) object. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Contributor with GH ID {github_id} not found. Preparing to create new.") + logger.debug( + f"Contributor with GH ID {github_id} not found. Preparing to create new." + ) # Ensure the github_id is included in the data used for creation. obj_in_data["github_id"] = github_id # Create a new model instance. new_obj = self.model(**obj_in_data) - self.db.add(new_obj) # Add the new object to the session. + self.db.add(new_obj) # Add the new object to the session. # Flush the session to send the INSERT statement to the database. # This assigns the primary key (if auto-generated) and checks constraints. self.db.flush() # Refresh the instance to load any database-generated values (e.g., defaults). self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Contributor GH ID {github_id} (DB ID: {new_obj.id})") - return new_obj # Return the newly created object. + logger.info( + f"Successfully created and flushed new Contributor GH ID {github_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the newly created object. except SQLAlchemyError as e: # Log the error occurred during the get_or_create process. - logger.error(f"SQLAlchemyError during get_or_create for Contributor GH ID {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Contributor GH ID {github_id}: {e}", + exc_info=True, + ) # Critical: Do NOT rollback here. The caller manages the transaction boundary. # self.db.rollback() # <-- DO NOT DO THIS HERE - raise # Re-raise the exception for the caller to handle. \ No newline at end of file + raise # Re-raise the exception for the caller to handle. 
diff --git a/backend/data/repositories/discovery_chain_repo.py b/backend/data/repositories/discovery_chain_repo.py index 847f08f..ca36eb1 100644 --- a/backend/data/repositories/discovery_chain_repo.py +++ b/backend/data/repositories/discovery_chain_repo.py @@ -8,17 +8,18 @@ """ import logging -import uuid # For handling UUID primary keys +import uuid # For handling UUID primary keys from typing import Optional, List from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import DiscoveryChain # The specific SQLAlchemy model +from backend.data.models import DiscoveryChain # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class DiscoveryChainRepository(BaseRepository[DiscoveryChain]): """ Repository for managing DiscoveryChain entities. @@ -85,10 +86,13 @@ def find_by_root_id(self, *, root_chain_id: uuid.UUID) -> List[DiscoveryChain]: .all() ) except SQLAlchemyError as e: - logger.error(f"Database error finding DiscoveryChains for root {root_chain_id}: {e}", exc_info=True) + logger.error( + f"Database error finding DiscoveryChains for root {root_chain_id}: {e}", + exc_info=True, + ) raise # Potential future methods: # - find_children(parent_id: uuid.UUID) -> List[DiscoveryChain]: Get direct children. # - find_by_status(status: str) -> List[DiscoveryChain]: Get chains by status. - # - find_by_entity_association(entity_type: str, entity_id: int): Find chains linked to a specific entity. \ No newline at end of file + # - find_by_entity_association(entity_type: str, entity_id: int): Find chains linked to a specific entity. 
diff --git a/backend/data/repositories/doi_reference_repo.py b/backend/data/repositories/doi_reference_repo.py index c964dd9..a75398b 100644 --- a/backend/data/repositories/doi_reference_repo.py +++ b/backend/data/repositories/doi_reference_repo.py @@ -15,10 +15,11 @@ from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import DOIReference # The specific SQLAlchemy model +from backend.data.models import DOIReference # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class DOIReferenceRepository(BaseRepository[DOIReference]): """ Repository specializing in operations for DOIReference entities. @@ -61,11 +62,16 @@ def find_by_repository_and_doi( try: return ( self.db.query(self.model) - .filter(self.model.repository_id == repository_id, self.model.doi == doi) + .filter( + self.model.repository_id == repository_id, self.model.doi == doi + ) .all() ) except SQLAlchemyError as e: - logger.error(f"DB error finding DOIReferences for repo {repository_id}, DOI {doi}: {e}", exc_info=True) + logger.error( + f"DB error finding DOIReferences for repo {repository_id}, DOI {doi}: {e}", + exc_info=True, + ) raise def find_by_repository_and_doi_and_source( @@ -88,7 +94,9 @@ def find_by_repository_and_doi_and_source( Raises: SQLAlchemyError: If a database error occurs during the query. """ - logger.debug(f"Finding unique DOIReference for repo_id {repository_id}, DOI {doi}, source {source_file}") + logger.debug( + f"Finding unique DOIReference for repo_id {repository_id}, DOI {doi}, source {source_file}" + ) try: # Querying based on the combination of fields that likely form a unique constraint or key. return ( @@ -96,14 +104,14 @@ def find_by_repository_and_doi_and_source( .filter( self.model.repository_id == repository_id, self.model.doi == doi, - self.model.source_file == source_file + self.model.source_file == source_file, ) - .first() # Expecting at most one result due to the specific filters. 
+ .first() # Expecting at most one result due to the specific filters. ) except SQLAlchemyError as e: logger.error( f"Database error finding DOIReference for repo {repository_id}, doi {doi}, source {source_file}: {e}", - exc_info=True + exc_info=True, ) # Re-raise allows the service layer or API endpoint to handle the failure gracefully. raise @@ -130,7 +138,10 @@ def find_by_repository(self, *, repository_id: int) -> List[DOIReference]: .all() ) except SQLAlchemyError as e: - logger.error(f"DB error finding DOIReferences for repo {repository_id}: {e}", exc_info=True) + logger.error( + f"DB error finding DOIReferences for repo {repository_id}: {e}", + exc_info=True, + ) raise def find_by_work_id(self, *, work_id: int) -> List[DOIReference]: @@ -153,15 +164,13 @@ def find_by_work_id(self, *, work_id: int) -> List[DOIReference]: """ logger.debug(f"Finding DOIReferences associated with work_id {work_id}") try: - return ( - self.db.query(self.model) - .filter(self.model.work_id == work_id) - .all() - ) + return self.db.query(self.model).filter(self.model.work_id == work_id).all() except SQLAlchemyError as e: - logger.error(f"DB error finding DOIReferences for work {work_id}: {e}", exc_info=True) - raise + logger.error( + f"DB error finding DOIReferences for work {work_id}: {e}", exc_info=True + ) + raise # Other potential query methods could include: # - find_by_doi(doi: str) -> List[DOIReference]: Find all references to a DOI across all repositories. - # - find_unlinked() -> List[DOIReference]: Find references not yet associated with a Work entity. \ No newline at end of file + # - find_unlinked() -> List[DOIReference]: Find references not yet associated with a Work entity. 
diff --git a/backend/data/repositories/domain_repo.py b/backend/data/repositories/domain_repo.py index 8e770de..b5a70ae 100644 --- a/backend/data/repositories/domain_repo.py +++ b/backend/data/repositories/domain_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Domain # The specific SQLAlchemy model +from backend.data.models import Domain # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class DomainRepository(BaseRepository[Domain]): """ Repository dedicated to CRUD and specific query operations for Domain entities. @@ -53,14 +54,23 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Domain]: logger.debug(f"Getting Domain by openalex_id: {openalex_id}") # Check session state, helpful for debugging transaction issues. if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for OA ID {openalex_id}") - return None + logger.warning( + f"Session is inactive in get_by_openalex_id for OA ID {openalex_id}" + ) + return None try: # Query the Domain model, filtering by the openalex_id column. 
- return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for {openalex_id}: {e}", exc_info=True) - raise + logger.error( + f"SQLAlchemyError during get_by_openalex_id for {openalex_id}: {e}", + exc_info=True, + ) + raise def get_or_create_by_openalex_id( self, *, openalex_id: str, obj_in_data: Dict[str, Any] @@ -97,8 +107,12 @@ def get_or_create_by_openalex_id( raise ValueError("openalex_id cannot be empty for Domain get_or_create") # Ensure the session is active before proceeding. if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Domain.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Domain." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -106,40 +120,57 @@ def get_or_create_by_openalex_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Domain OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Domain OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Compare and update fields if they differ from the input data. 
- if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] updated = True - if obj_in_data.get('description') is not None and db_obj.description != obj_in_data.get('description'): - db_obj.description = obj_in_data['description'] + if obj_in_data.get( + "description" + ) is not None and db_obj.description != obj_in_data.get("description"): + db_obj.description = obj_in_data["description"] updated = True # Add checks for other relevant Domain fields if necessary. if updated: - self.db.add(db_obj) # Mark the object as dirty in the session. - logger.info(f"Domain {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark the object as dirty in the session. + logger.info( + f"Domain {db_obj.id} marked for update in the current session." + ) # Optional: Flush here if needed before commit. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing object. + return db_obj # Return the existing object. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Domain with OA ID {openalex_id} not found. Preparing to create new.") + logger.debug( + f"Domain with OA ID {openalex_id} not found. Preparing to create new." + ) # Ensure the openalex_id is set in the data for the new object. obj_in_data["openalex_id"] = openalex_id - new_obj = self.model(**obj_in_data) # Instantiate the new Domain. - self.db.add(new_obj) # Add to the session. + new_obj = self.model(**obj_in_data) # Instantiate the new Domain. + self.db.add(new_obj) # Add to the session. # Flush to send INSERT to DB, assign PK, check constraints. self.db.flush() # Refresh to get any DB-generated values. 
self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Domain OA ID {openalex_id} (DB ID: {new_obj.id})") - return new_obj # Return the new object. + logger.info( + f"Successfully created and flushed new Domain OA ID {openalex_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new object. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for Domain OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Domain OA ID {openalex_id}: {e}", + exc_info=True, + ) # Rollback is the responsibility of the calling context. - raise # Re-raise the error. \ No newline at end of file + raise # Re-raise the error. diff --git a/backend/data/repositories/entity_discovery_repo.py b/backend/data/repositories/entity_discovery_repo.py index b0862ba..902565b 100644 --- a/backend/data/repositories/entity_discovery_repo.py +++ b/backend/data/repositories/entity_discovery_repo.py @@ -9,17 +9,18 @@ """ import logging -import uuid # For handling UUID foreign keys +import uuid # For handling UUID foreign keys from typing import Optional, List from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import EntityDiscoveryAssociation # The specific model +from backend.data.models import EntityDiscoveryAssociation # The specific model logger = logging.getLogger(__name__) + class EntityDiscoveryAssociationRepository(BaseRepository[EntityDiscoveryAssociation]): """ Repository for managing EntityDiscoveryAssociation records. @@ -73,10 +74,13 @@ def find_by_chain_and_entity( self.model.entity_type == entity_type, self.model.entity_id == entity_id, ) - .first() # Expecting at most one association for this specific combination. + .first() # Expecting at most one association for this specific combination. 
) except SQLAlchemyError as e: - logger.error(f"DB error finding association for chain {discovery_chain_id}, entity {entity_type}:{entity_id}: {e}", exc_info=True) + logger.error( + f"DB error finding association for chain {discovery_chain_id}, entity {entity_type}:{entity_id}: {e}", + exc_info=True, + ) raise def find_by_entity( @@ -100,7 +104,9 @@ def find_by_entity( Raises: SQLAlchemyError: If a database error occurs during the query. """ - logger.debug(f"Finding EntityDiscoveryAssociations linked to entity type '{entity_type}', id {entity_id}") + logger.debug( + f"Finding EntityDiscoveryAssociations linked to entity type '{entity_type}', id {entity_id}" + ) try: return ( self.db.query(self.model) @@ -111,7 +117,10 @@ def find_by_entity( .all() ) except SQLAlchemyError as e: - logger.error(f"DB error finding associations for entity {entity_type}:{entity_id}: {e}", exc_info=True) + logger.error( + f"DB error finding associations for entity {entity_type}:{entity_id}: {e}", + exc_info=True, + ) raise def find_by_chain( @@ -133,7 +142,9 @@ def find_by_chain( Raises: SQLAlchemyError: If a database error occurs during the query. """ - logger.debug(f"Finding all EntityDiscoveryAssociations for chain node {discovery_chain_id}") + logger.debug( + f"Finding all EntityDiscoveryAssociations for chain node {discovery_chain_id}" + ) try: return ( self.db.query(self.model) @@ -141,8 +152,11 @@ def find_by_chain( .all() ) except SQLAlchemyError as e: - logger.error(f"DB error finding associations for chain {discovery_chain_id}: {e}", exc_info=True) - raise + logger.error( + f"DB error finding associations for chain {discovery_chain_id}: {e}", + exc_info=True, + ) + raise # Additional specific query methods can be added as needed, e.g., - # finding associations based on metadata within the association record itself. \ No newline at end of file + # finding associations based on metadata within the association record itself. 
diff --git a/backend/data/repositories/field_repo.py b/backend/data/repositories/field_repo.py index b9f10a0..251f909 100644 --- a/backend/data/repositories/field_repo.py +++ b/backend/data/repositories/field_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Field # The specific SQLAlchemy model +from backend.data.models import Field # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class FieldRepository(BaseRepository[Field]): """ Repository managing CRUD and specific queries for Field entities. @@ -53,14 +54,23 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Field]: logger.debug(f"Getting Field by openalex_id: {openalex_id}") # Pre-check for active session can help diagnose transaction issues. if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for OA ID {openalex_id}") + logger.warning( + f"Session is inactive in get_by_openalex_id for OA ID {openalex_id}" + ) return None try: # Standard query filtering by the unique OpenAlex ID. 
- return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for {openalex_id}: {e}", exc_info=True) - raise + logger.error( + f"SQLAlchemyError during get_by_openalex_id for {openalex_id}: {e}", + exc_info=True, + ) + raise def get_or_create_by_openalex_id( self, *, openalex_id: str, obj_in_data: Dict[str, Any] @@ -102,8 +112,12 @@ def get_or_create_by_openalex_id( raise ValueError("openalex_id cannot be empty for Field get_or_create") # Ensure the session is usable at the start. if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Field.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Field." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -111,52 +125,73 @@ def get_or_create_by_openalex_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Field OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Field OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check and update display name if provided and different. - if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] updated = True # Check and update description if provided and different. 
- if obj_in_data.get('description') is not None and db_obj.description != obj_in_data.get('description'): - db_obj.description = obj_in_data['description'] - updated = True + if obj_in_data.get( + "description" + ) is not None and db_obj.description != obj_in_data.get("description"): + db_obj.description = obj_in_data["description"] + updated = True # Check if the parent domain_id needs updating (less common, but possible). - new_domain_id = obj_in_data.get('domain_id') + new_domain_id = obj_in_data.get("domain_id") if new_domain_id is not None and db_obj.domain_id != new_domain_id: - logger.warning(f"Field OA ID {openalex_id} exists but domain_id mismatch detected. " - f"DB has {db_obj.domain_id}, input data has {new_domain_id}. Updating.") - db_obj.domain_id = new_domain_id - updated = True + logger.warning( + f"Field OA ID {openalex_id} exists but domain_id mismatch detected. " + f"DB has {db_obj.domain_id}, input data has {new_domain_id}. Updating." + ) + db_obj.domain_id = new_domain_id + updated = True # Add other field update checks here if needed... if updated: - self.db.add(db_obj) # Mark as dirty in the session. - logger.info(f"Field {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark as dirty in the session. + logger.info( + f"Field {db_obj.id} marked for update in the current session." + ) # Optional: Flush and refresh if immediate DB state is needed by caller before commit. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing instance. + return db_obj # Return the existing instance. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Field OA ID {openalex_id} not found. Preparing to create new.") + logger.debug( + f"Field OA ID {openalex_id} not found. Preparing to create new." + ) # Crucial check: Ensure the foreign key `domain_id` is provided for creation. 
- if 'domain_id' not in obj_in_data or obj_in_data['domain_id'] is None: - raise ValueError(f"Missing required 'domain_id' in obj_in_data for creating new Field with OA ID {openalex_id}") + if "domain_id" not in obj_in_data or obj_in_data["domain_id"] is None: + raise ValueError( + f"Missing required 'domain_id' in obj_in_data for creating new Field with OA ID {openalex_id}" + ) # Ensure the openalex_id is part of the creation data. obj_in_data["openalex_id"] = openalex_id - new_obj = self.model(**obj_in_data) # Create the instance. - self.db.add(new_obj) # Add to session. + new_obj = self.model(**obj_in_data) # Create the instance. + self.db.add(new_obj) # Add to session. # Flush: Send INSERT, get PK, check constraints. self.db.flush() # Refresh: Update object with DB defaults. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Field OA ID {openalex_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.info( + f"Successfully created and flushed new Field OA ID {openalex_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for Field OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Field OA ID {openalex_id}: {e}", + exc_info=True, + ) # Let the caller handle transaction rollback. - raise # Re-raise the caught exception. \ No newline at end of file + raise # Re-raise the caught exception. 
diff --git a/backend/data/repositories/institution_repo.py b/backend/data/repositories/institution_repo.py index 90d41d9..df00040 100644 --- a/backend/data/repositories/institution_repo.py +++ b/backend/data/repositories/institution_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Institution # The specific SQLAlchemy model +from backend.data.models import Institution # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class InstitutionRepository(BaseRepository[Institution]): """ Repository managing CRUD and specific queries for Institution entities. @@ -52,14 +53,23 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Institution]: """ logger.debug(f"Getting Institution by openalex_id: {openalex_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for OA ID {openalex_id}") + logger.warning( + f"Session is inactive in get_by_openalex_id for OA ID {openalex_id}" + ) return None try: # Query based on the OpenAlex ID. 
- return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for {openalex_id}: {e}", exc_info=True) - raise + logger.error( + f"SQLAlchemyError during get_by_openalex_id for {openalex_id}: {e}", + exc_info=True, + ) + raise def get_by_ror(self, *, ror: str) -> Optional[Institution]: """ @@ -76,14 +86,16 @@ def get_by_ror(self, *, ror: str) -> Optional[Institution]: """ logger.debug(f"Getting Institution by ROR: {ror}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_ror for ROR {ror}") - return None + logger.warning(f"Session is inactive in get_by_ror for ROR {ror}") + return None try: # Query based on the ROR ID. return self.db.query(self.model).filter(self.model.ror == ror).first() except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_ror for {ror}: {e}", exc_info=True) - raise + logger.error( + f"SQLAlchemyError during get_by_ror for {ror}: {e}", exc_info=True + ) + raise def get_or_create_by_openalex_id( self, *, openalex_id: str, obj_in_data: Dict[str, Any] @@ -121,25 +133,35 @@ def get_or_create_by_openalex_id( SQLAlchemyError: If any database operation fails. """ if not openalex_id: - raise ValueError("openalex_id cannot be empty for Institution get_or_create") + raise ValueError( + "openalex_id cannot be empty for Institution get_or_create" + ) if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Institution.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Institution." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." 
+ ) try: # --- Step 1: Query First by OpenAlex ID --- db_obj = self.get_by_openalex_id(openalex_id=openalex_id) if db_obj: - # --- Step 2a: Found by OA ID - Update Check --- - logger.debug(f"Found existing Institution by OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + # --- Step 2a: Found by OA ID - Update Check --- + logger.debug( + f"Found existing Institution by OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False new_ror = obj_in_data.get("ror") # Update ROR if provided and different, checking for conflicts. if new_ror and db_obj.ror != new_ror: - if not self.db.is_active: # Re-check session before dependent query - raise RuntimeError("Session became inactive before ROR conflict check.") + if not self.db.is_active: # Re-check session before dependent query + raise RuntimeError( + "Session became inactive before ROR conflict check." + ) existing_ror_inst = self.get_by_ror(ror=new_ror) if existing_ror_inst and existing_ror_inst.id != db_obj.id: # Log conflict but don't update ROR to avoid unique constraint error. @@ -148,76 +170,110 @@ def get_or_create_by_openalex_id( f"because it is already assigned to Institution DB ID {existing_ror_inst.id}. Skipping ROR update." ) else: - logger.info(f"Updating ROR for Institution {db_obj.id} from '{db_obj.ror}' to '{new_ror}'") + logger.info( + f"Updating ROR for Institution {db_obj.id} from '{db_obj.ror}' to '{new_ror}'" + ) db_obj.ror = new_ror updated = True # Update other fields if provided and different. 
- if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] + updated = True + if obj_in_data.get( + "github_organization_logins" + ) is not None and db_obj.github_organization_logins != obj_in_data.get( + "github_organization_logins" + ): + db_obj.github_organization_logins = obj_in_data[ + "github_organization_logins" + ] updated = True - if obj_in_data.get('github_organization_logins') is not None and db_obj.github_organization_logins != obj_in_data.get('github_organization_logins'): - db_obj.github_organization_logins = obj_in_data['github_organization_logins'] - updated = True # Add other updatable fields... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Institution {db_obj.id} (found by OA ID) marked for update.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Institution {db_obj.id} (found by OA ID) marked for update." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the instance found by OA ID. + return db_obj # Return the instance found by OA ID. else: - # --- Step 2b: Not Found by OA ID - Check ROR --- + # --- Step 2b: Not Found by OA ID - Check ROR --- ror_to_check = obj_in_data.get("ror") if ror_to_check: # --- Step 3: Query by ROR --- db_obj_ror = self.get_by_ror(ror=ror_to_check) if db_obj_ror: # --- Step 4: Found by ROR - Update with OA ID --- - logger.warning(f"Institution not found by OA ID {openalex_id}, but found existing " - f"Institution DB ID {db_obj_ror.id} by ROR {ror_to_check}. Attempting to merge/update.") + logger.warning( + f"Institution not found by OA ID {openalex_id}, but found existing " + f"Institution DB ID {db_obj_ror.id} by ROR {ror_to_check}. Attempting to merge/update." 
+ ) updated = False # Add the OpenAlex ID if it was missing on the record found by ROR. if not db_obj_ror.openalex_id: - logger.info(f"Updating missing OA ID for Institution {db_obj_ror.id} (found by ROR {ror_to_check}) to {openalex_id}") + logger.info( + f"Updating missing OA ID for Institution {db_obj_ror.id} (found by ROR {ror_to_check}) to {openalex_id}" + ) db_obj_ror.openalex_id = openalex_id updated = True # Potentially update other fields if they were missing on the ROR-found record. - if obj_in_data.get('display_name') is not None and db_obj_ror.display_name is None: - db_obj_ror.display_name = obj_in_data['display_name'] + if ( + obj_in_data.get("display_name") is not None + and db_obj_ror.display_name is None + ): + db_obj_ror.display_name = obj_in_data["display_name"] updated = True - if obj_in_data.get('github_organization_logins') is not None and db_obj_ror.github_organization_logins is None: - db_obj_ror.github_organization_logins = obj_in_data['github_organization_logins'] + if ( + obj_in_data.get("github_organization_logins") is not None + and db_obj_ror.github_organization_logins is None + ): + db_obj_ror.github_organization_logins = obj_in_data[ + "github_organization_logins" + ] updated = True # Add other fields... if updated: - self.db.add(db_obj_ror) # Mark for update. - logger.info(f"Institution {db_obj_ror.id} (found by ROR) marked for update with OA ID {openalex_id}.") + self.db.add(db_obj_ror) # Mark for update. + logger.info( + f"Institution {db_obj_ror.id} (found by ROR) marked for update with OA ID {openalex_id}." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj_ror) - return db_obj_ror # Return the instance found by ROR. + return db_obj_ror # Return the instance found by ROR. # --- Step 5: Not Found by OA ID or ROR - Create New --- - logger.debug(f"Institution OA ID {openalex_id} (and ROR {ror_to_check or 'N/A'}) not found. Creating new.") - obj_in_data["openalex_id"] = openalex_id # Ensure OA ID is set. 
- new_obj = self.model(**obj_in_data) # Create instance. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT. - self.db.refresh(new_obj) # Load DB defaults. - logger.info(f"Successfully created and flushed new Institution OA ID {openalex_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.debug( + f"Institution OA ID {openalex_id} (and ROR {ror_to_check or 'N/A'}) not found. Creating new." + ) + obj_in_data["openalex_id"] = openalex_id # Ensure OA ID is set. + new_obj = self.model(**obj_in_data) # Create instance. + self.db.add(new_obj) # Add to session. + self.db.flush() # Send INSERT. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new Institution OA ID {openalex_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create_by_openalex_id for Inst OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create_by_openalex_id for Inst OA ID {openalex_id}: {e}", + exc_info=True, + ) # Caller handles rollback. raise - def get_or_create_by_ror( self, *, ror: str, obj_in_data: Dict[str, Any] ) -> Institution: @@ -254,95 +310,140 @@ def get_or_create_by_ror( SQLAlchemyError: If any database operation fails. """ if not ror: - raise ValueError("ROR must be provided for get_or_create_by_ror") + raise ValueError("ROR must be provided for get_or_create_by_ror") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_ror for Institution.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_ror for Institution." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." 
+ ) try: # --- Step 1: Query First by ROR --- db_obj = self.get_by_ror(ror=ror) if db_obj: - # --- Step 2a: Found by ROR - Update Check --- - logger.debug(f"Found existing Institution by ROR {ror} (DB ID: {db_obj.id}). Checking for updates.") + # --- Step 2a: Found by ROR - Update Check --- + logger.debug( + f"Found existing Institution by ROR {ror} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False new_oa_id = obj_in_data.get("openalex_id") # Update OpenAlex ID if provided and different, checking for conflicts. if new_oa_id and db_obj.openalex_id != new_oa_id: - if not self.db.is_active: # Re-check session - raise RuntimeError("Session inactive before OA ID check during ROR-based update.") + if not self.db.is_active: # Re-check session + raise RuntimeError( + "Session inactive before OA ID check during ROR-based update." + ) existing_oa_inst = self.get_by_openalex_id(openalex_id=new_oa_id) if existing_oa_inst and existing_oa_inst.id != db_obj.id: - # Log conflict, skip OA ID update. - logger.warning(f"Cannot update OA ID for Institution ROR {ror} (DB ID {db_obj.id}) to {new_oa_id} " - f"because it's already assigned to Institution DB ID {existing_oa_inst.id}. Skipping OA ID update.") + # Log conflict, skip OA ID update. + logger.warning( + f"Cannot update OA ID for Institution ROR {ror} (DB ID {db_obj.id}) to {new_oa_id} " + f"because it's already assigned to Institution DB ID {existing_oa_inst.id}. Skipping OA ID update." + ) else: - logger.info(f"Updating OA ID for Institution {db_obj.id} from '{db_obj.openalex_id}' to '{new_oa_id}'") - db_obj.openalex_id = new_oa_id - updated = True + logger.info( + f"Updating OA ID for Institution {db_obj.id} from '{db_obj.openalex_id}' to '{new_oa_id}'" + ) + db_obj.openalex_id = new_oa_id + updated = True # Update other fields if provided and different. 
- if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] + updated = True + if obj_in_data.get( + "github_organization_logins" + ) is not None and db_obj.github_organization_logins != obj_in_data.get( + "github_organization_logins" + ): + db_obj.github_organization_logins = obj_in_data[ + "github_organization_logins" + ] updated = True - if obj_in_data.get('github_organization_logins') is not None and db_obj.github_organization_logins != obj_in_data.get('github_organization_logins'): - db_obj.github_organization_logins = obj_in_data['github_organization_logins'] - updated = True # Add other updatable fields ... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Institution {db_obj.id} (found by ROR) marked for update.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Institution {db_obj.id} (found by ROR) marked for update." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return instance found by ROR. + return db_obj # Return instance found by ROR. else: - # --- Step 2b: Not Found by ROR - Check OpenAlex ID --- + # --- Step 2b: Not Found by ROR - Check OpenAlex ID --- oa_id_to_check = obj_in_data.get("openalex_id") if oa_id_to_check: # --- Step 3: Query by OpenAlex ID --- db_obj_oa = self.get_by_openalex_id(openalex_id=oa_id_to_check) if db_obj_oa: # --- Step 4: Found by OA ID - Update with ROR --- - logger.warning(f"Institution not found by ROR {ror}, but found existing " - f"Institution DB ID {db_obj_oa.id} by OA ID {oa_id_to_check}. Attempting to merge/update.") + logger.warning( + f"Institution not found by ROR {ror}, but found existing " + f"Institution DB ID {db_obj_oa.id} by OA ID {oa_id_to_check}. 
Attempting to merge/update." + ) updated = False # Add the ROR ID if it was missing. if not db_obj_oa.ror: - logger.info(f"Updating missing ROR for Institution {db_obj_oa.id} (found by OA ID {oa_id_to_check}) to {ror}") + logger.info( + f"Updating missing ROR for Institution {db_obj_oa.id} (found by OA ID {oa_id_to_check}) to {ror}" + ) db_obj_oa.ror = ror updated = True # Potentially update other fields if missing. - if obj_in_data.get('display_name') is not None and db_obj_oa.display_name is None: - db_obj_oa.display_name = obj_in_data['display_name'] + if ( + obj_in_data.get("display_name") is not None + and db_obj_oa.display_name is None + ): + db_obj_oa.display_name = obj_in_data["display_name"] updated = True - if obj_in_data.get('github_organization_logins') is not None and db_obj_oa.github_organization_logins is None: - db_obj_oa.github_organization_logins = obj_in_data['github_organization_logins'] + if ( + obj_in_data.get("github_organization_logins") is not None + and db_obj_oa.github_organization_logins is None + ): + db_obj_oa.github_organization_logins = obj_in_data[ + "github_organization_logins" + ] updated = True # Add other fields ... if updated: - self.db.add(db_obj_oa) # Mark for update. - logger.info(f"Institution {db_obj_oa.id} (found by OA ID) marked for update with ROR {ror}.") + self.db.add(db_obj_oa) # Mark for update. + logger.info( + f"Institution {db_obj_oa.id} (found by OA ID) marked for update with ROR {ror}." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj_oa) - return db_obj_oa # Return instance found by OA ID. + return db_obj_oa # Return instance found by OA ID. # --- Step 5: Not Found by ROR or OA ID - Create New --- - logger.debug(f"Institution ROR {ror} (and OA ID {oa_id_to_check or 'N/A'}) not found. Creating new.") - obj_in_data["ror"] = ror # Ensure ROR ID is set. - new_obj = self.model(**obj_in_data) # Create instance. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT. 
- self.db.refresh(new_obj) # Load DB defaults. - logger.info(f"Successfully created and flushed new Institution ROR {ror} (DB ID: {new_obj.id})") - return new_obj # Return new instance. + logger.debug( + f"Institution ROR {ror} (and OA ID {oa_id_to_check or 'N/A'}) not found. Creating new." + ) + obj_in_data["ror"] = ror # Ensure ROR ID is set. + new_obj = self.model(**obj_in_data) # Create instance. + self.db.add(new_obj) # Add to session. + self.db.flush() # Send INSERT. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new Institution ROR {ror} (DB ID: {new_obj.id})" + ) + return new_obj # Return new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create_by_ror for Inst ROR {ror}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create_by_ror for Inst ROR {ror}: {e}", + exc_info=True, + ) # Caller handles rollback. - raise \ No newline at end of file + raise diff --git a/backend/data/repositories/issue_comment_repo.py b/backend/data/repositories/issue_comment_repo.py index 756df06..8bfcd2a 100644 --- a/backend/data/repositories/issue_comment_repo.py +++ b/backend/data/repositories/issue_comment_repo.py @@ -14,10 +14,11 @@ from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import IssueComment # The specific SQLAlchemy model +from backend.data.models import IssueComment # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class IssueCommentRepository(BaseRepository[IssueComment]): """ Repository dedicated to managing IssueComment entities. @@ -51,13 +52,22 @@ def get_by_github_id(self, *, github_id: int) -> Optional[IssueComment]: logger.debug(f"Getting IssueComment by github_id: {github_id}") # Session activity check can aid in diagnosing transaction problems. 
if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for IssueComment {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for IssueComment {github_id}" + ) return None try: # Query the IssueComment model filtering by the unique github_id. - return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for IssueComment {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for IssueComment {github_id}: {e}", + exc_info=True, + ) raise def get_or_create_by_github_id( @@ -95,49 +105,67 @@ def get_or_create_by_github_id( if not github_id: raise ValueError("github_id cannot be empty for IssueComment get_or_create") if not self.db.is_active: - logger.error(f"Session is inactive at start of get_or_create_by_github_id for IssueComment {github_id}.") - raise RuntimeError("Database session is inactive for IssueComment get_or_create.") + logger.error( + f"Session is inactive at start of get_or_create_by_github_id for IssueComment {github_id}." + ) + raise RuntimeError( + "Database session is inactive for IssueComment get_or_create." + ) # --- Step 1: Query First --- db_obj = self.get_by_github_id(github_id=github_id) if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing IssueComment GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing IssueComment GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check if comment body has changed. 
- if obj_in_data.get('body') is not None and db_obj.body != obj_in_data.get('body'): - db_obj.body = obj_in_data['body'] + if obj_in_data.get("body") is not None and db_obj.body != obj_in_data.get( + "body" + ): + db_obj.body = obj_in_data["body"] updated = True # Check if the GitHub update timestamp has changed. - if obj_in_data.get('gh_updated_at') is not None and db_obj.gh_updated_at != obj_in_data.get('gh_updated_at'): - db_obj.gh_updated_at = obj_in_data['gh_updated_at'] + if obj_in_data.get( + "gh_updated_at" + ) is not None and db_obj.gh_updated_at != obj_in_data.get("gh_updated_at"): + db_obj.gh_updated_at = obj_in_data["gh_updated_at"] updated = True # Add checks for other potentially updatable fields if needed. if updated: - self.db.add(db_obj) # Mark the instance as dirty. - logger.info(f"IssueComment {db_obj.id} marked for update in the current session.") - # Optional flush/refresh could go here if caller needs immediate DB state. - return db_obj # Return the existing instance. + self.db.add(db_obj) # Mark the instance as dirty. + logger.info( + f"IssueComment {db_obj.id} marked for update in the current session." + ) + # Optional flush/refresh could go here if caller needs immediate DB state. + return db_obj # Return the existing instance. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"IssueComment GH ID {github_id} not found. Preparing to create new.") + logger.debug( + f"IssueComment GH ID {github_id} not found. Preparing to create new." + ) # Validate required foreign keys for creation. 
- if 'issue_id' not in obj_in_data or 'user_id' not in obj_in_data: - raise ValueError(f"Missing required 'issue_id' or 'user_id' in obj_in_data for creating new IssueComment with GH ID {github_id}") + if "issue_id" not in obj_in_data or "user_id" not in obj_in_data: + raise ValueError( + f"Missing required 'issue_id' or 'user_id' in obj_in_data for creating new IssueComment with GH ID {github_id}" + ) # Ensure the github_id is included in the data for the new object. obj_in_data["github_id"] = github_id - new_obj = self.model(**obj_in_data) # Instantiate the new comment. - self.db.add(new_obj) # Add to the session. + new_obj = self.model(**obj_in_data) # Instantiate the new comment. + self.db.add(new_obj) # Add to the session. # Flush to send INSERT to DB, assign PK, check FK constraints. self.db.flush() # Refresh to load any DB-generated values. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new IssueComment GH ID {github_id} (DB ID: {new_obj.id})") - return new_obj # Return the newly created instance. + logger.info( + f"Successfully created and flushed new IssueComment GH ID {github_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the newly created instance. # Note: SQLAlchemyError handling is implicitly covered by the BaseRepository # structure if the error occurs within self.get_by_github_id, or it will - # propagate from flush/refresh if it occurs there. The caller should handle it. \ No newline at end of file + # propagate from flush/refresh if it occurs there. The caller should handle it. diff --git a/backend/data/repositories/issue_repo.py b/backend/data/repositories/issue_repo.py index 593bca7..3847a95 100644 --- a/backend/data/repositories/issue_repo.py +++ b/backend/data/repositories/issue_repo.py @@ -6,6 +6,7 @@ Provides data access operations for the Issue model, representing GitHub issues tracked within associated repositories. 
""" + import logging from typing import Optional, Dict, Any @@ -13,10 +14,11 @@ from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import Issue # The specific SQLAlchemy model +from backend.data.models import Issue # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class IssueRepository(BaseRepository[Issue]): """ Repository for managing Issue entities, including CRUD and specific queries. @@ -55,13 +57,22 @@ def get_by_github_id(self, *, github_id: int) -> Optional[Issue]: """ logger.debug(f"Getting Issue by github_id: {github_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for Issue {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for Issue {github_id}" + ) return None try: # Query the Issue model filtering by the unique github_id. - return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for Issue {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for Issue {github_id}: {e}", + exc_info=True, + ) raise def get_or_create_by_github_id( @@ -102,8 +113,12 @@ def get_or_create_by_github_id( if not github_id: raise ValueError("github_id cannot be empty for Issue get_or_create") if not self.db.is_active: - logger.error(f"Session is inactive at start of get_or_create_by_github_id for Issue {github_id}.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + f"Session is inactive at start of get_or_create_by_github_id for Issue {github_id}." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." 
+ ) try: # --- Step 1: Query First --- @@ -111,50 +126,75 @@ def get_or_create_by_github_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Issue GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Issue GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check and update common fields that might change. - if obj_in_data.get('title') is not None and db_obj.title != obj_in_data.get('title'): - db_obj.title = obj_in_data['title'] + if obj_in_data.get( + "title" + ) is not None and db_obj.title != obj_in_data.get("title"): + db_obj.title = obj_in_data["title"] updated = True - if obj_in_data.get('state') is not None and db_obj.state != obj_in_data.get('state'): - db_obj.state = obj_in_data['state'] + if obj_in_data.get( + "state" + ) is not None and db_obj.state != obj_in_data.get("state"): + db_obj.state = obj_in_data["state"] updated = True - if obj_in_data.get('gh_updated_at') is not None and db_obj.gh_updated_at != obj_in_data.get('gh_updated_at'): - db_obj.gh_updated_at = obj_in_data['gh_updated_at'] + if obj_in_data.get( + "gh_updated_at" + ) is not None and db_obj.gh_updated_at != obj_in_data.get( + "gh_updated_at" + ): + db_obj.gh_updated_at = obj_in_data["gh_updated_at"] updated = True - if obj_in_data.get('gh_closed_at') is not None and db_obj.gh_closed_at != obj_in_data.get('gh_closed_at'): + if obj_in_data.get( + "gh_closed_at" + ) is not None and db_obj.gh_closed_at != obj_in_data.get( + "gh_closed_at" + ): # Note: Ensure gh_closed_at can be None if the issue is reopened. - db_obj.gh_closed_at = obj_in_data['gh_closed_at'] + db_obj.gh_closed_at = obj_in_data["gh_closed_at"] updated = True # Add other relevant fields like labels, assignees, body if managed here. if updated: - self.db.add(db_obj) # Mark as dirty in the session. 
- logger.info(f"Issue {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark as dirty in the session. + logger.info( + f"Issue {db_obj.id} marked for update in the current session." + ) # Optional: Flush and refresh if immediate state needed by caller. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing instance. + return db_obj # Return the existing instance. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Issue GH ID {github_id} not found. Preparing to create new.") + logger.debug( + f"Issue GH ID {github_id} not found. Preparing to create new." + ) # Validate presence of required foreign keys for creation. - if 'repository_id' not in obj_in_data or 'user_id' not in obj_in_data: - raise ValueError(f"Missing required 'repository_id' or 'user_id' in obj_in_data for creating new Issue with GH ID {github_id}") + if "repository_id" not in obj_in_data or "user_id" not in obj_in_data: + raise ValueError( + f"Missing required 'repository_id' or 'user_id' in obj_in_data for creating new Issue with GH ID {github_id}" + ) # Ensure github_id is set in the creation data. obj_in_data["github_id"] = github_id - new_obj = self.model(**obj_in_data) # Instantiate the new issue. - self.db.add(new_obj) # Add to session. + new_obj = self.model(**obj_in_data) # Instantiate the new issue. + self.db.add(new_obj) # Add to session. # Flush: Send INSERT, get PK, check FK constraints. self.db.flush() # Refresh: Load DB defaults/generated values. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Issue GH ID {github_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.info( + f"Successfully created and flushed new Issue GH ID {github_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. 
except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for Issue GH ID {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Issue GH ID {github_id}: {e}", + exc_info=True, + ) # Rollback is handled by the caller. - raise # Re-raise the error. \ No newline at end of file + raise # Re-raise the error. diff --git a/backend/data/repositories/keyword_repository_association_repo.py b/backend/data/repositories/keyword_repository_association_repo.py index 1903856..4158377 100644 --- a/backend/data/repositories/keyword_repository_association_repo.py +++ b/backend/data/repositories/keyword_repository_association_repo.py @@ -8,7 +8,7 @@ """ import logging -from typing import Optional, Dict, Any, List, Tuple # Import Tuple for composite key get +from typing import Optional, Dict, Any, List # Import Tuple for composite key get from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError @@ -20,6 +20,7 @@ logger = logging.getLogger(__name__) + class KeywordRepositoryAssociationRepository: """ Repository for managing KeywordRepositoryAssociation link records. @@ -29,6 +30,7 @@ class KeywordRepositoryAssociationRepository: primary key (session_id, repository_id), it implements its own methods instead of inheriting directly from BaseRepository. """ + def __init__(self, db: Session): """ Initializes the KeywordRepositoryAssociationRepository. @@ -44,7 +46,7 @@ def create_association( *, session_id: int, repository_id: int, - match_details: Optional[Dict[str, Any]] = None + match_details: Optional[Dict[str, Any]] = None, ) -> KeywordRepositoryAssociation: """ Creates a new association record between a search session and a repository. @@ -71,30 +73,34 @@ def create_association( SQLAlchemyError: If adding or flushing the object to the database fails (e.g., due to constraint violations). 
""" - logger.debug(f"Preparing to create KeywordRepositoryAssociation for session {session_id}, repo {repository_id}") + logger.debug( + f"Preparing to create KeywordRepositoryAssociation for session {session_id}, repo {repository_id}" + ) # Create the association object instance. db_obj = self.model( keyword_search_session_id=session_id, repository_id=repository_id, - match_details=match_details # Store provided JSON details. + match_details=match_details, # Store provided JSON details. ) try: - self.db.add(db_obj) # Add the new association to the session. + self.db.add(db_obj) # Add the new association to the session. # Flush the session to send the INSERT statement. This helps catch # potential integrity errors (like duplicate primary keys) early. self.db.flush() # No refresh needed here typically, as this model likely doesn't have # database-generated defaults beyond the primary key components provided. - logger.info(f"Successfully created and flushed KeywordRepositoryAssociation for session {session_id}, repo {repository_id}") + logger.info( + f"Successfully created and flushed KeywordRepositoryAssociation for session {session_id}, repo {repository_id}" + ) return db_obj except SQLAlchemyError as e: # Log the specific error during creation/flush. logger.error( f"Database error creating KeywordRepositoryAssociation for session {session_id}, repo {repository_id}: {e}", - exc_info=True + exc_info=True, ) # Rollback should be handled by the service layer or API endpoint managing the overall transaction. - raise # Re-raise the error for the caller. + raise # Re-raise the error for the caller. def get_by_session_and_repo_id( self, *, session_id: int, repository_id: int @@ -114,7 +120,9 @@ def get_by_session_and_repo_id( Raises: SQLAlchemyError: If a database error occurs during the lookup. 
""" - logger.debug(f"Getting KeywordRepositoryAssociation by composite key: session {session_id}, repo {repository_id}") + logger.debug( + f"Getting KeywordRepositoryAssociation by composite key: session {session_id}, repo {repository_id}" + ) try: # For composite keys, Session.get requires a tuple of the key values in the correct order. composite_key = (session_id, repository_id) @@ -122,9 +130,9 @@ def get_by_session_and_repo_id( except SQLAlchemyError as e: logger.error( f"Database error getting KeywordRepositoryAssociation for session {session_id}, repo {repository_id}: {e}", - exc_info=True + exc_info=True, ) - raise # Re-raise for higher-level handling. + raise # Re-raise for higher-level handling. def find_by_session_id( self, *, session_id: int @@ -143,7 +151,9 @@ def find_by_session_id( Raises: SQLAlchemyError: If a database error occurs during the query. """ - logger.debug(f"Finding all KeywordRepositoryAssociations for session_id {session_id}") + logger.debug( + f"Finding all KeywordRepositoryAssociations for session_id {session_id}" + ) try: # Query the association model, filtering by the session ID part of the composite key. return ( @@ -154,13 +164,13 @@ def find_by_session_id( except SQLAlchemyError as e: logger.error( f"Database error finding KeywordRepositoryAssociations for session {session_id}: {e}", - exc_info=True + exc_info=True, ) - raise # Re-raise for caller to handle. + raise # Re-raise for caller to handle. # A potential future method: # def find_by_repository_id(self, *, repository_id: int) -> List[KeywordRepositoryAssociation]: # """Find all search sessions that identified a specific repository.""" # logger.debug(f"Finding KeywordRepositoryAssociations for repository_id {repository_id}") # # Implementation would filter by self.model.repository_id - # ... \ No newline at end of file + # ... 
diff --git a/backend/data/repositories/keyword_search_session_repo.py b/backend/data/repositories/keyword_search_session_repo.py index d286fe4..74ba031 100644 --- a/backend/data/repositories/keyword_search_session_repo.py +++ b/backend/data/repositories/keyword_search_session_repo.py @@ -8,15 +8,17 @@ """ import logging + # from typing import Optional, List # Optional/List not currently used, uncomment if needed from sqlalchemy.orm import Session # from sqlalchemy.exc import SQLAlchemyError # Not used directly if only using BaseRepository methods from .base_repository import BaseRepository -from backend.data.models import KeywordSearchSession # The specific model +from backend.data.models import KeywordSearchSession # The specific model logger = logging.getLogger(__name__) + class KeywordSearchSessionRepository(BaseRepository[KeywordSearchSession]): """ Repository for managing KeywordSearchSession entities. @@ -78,4 +80,4 @@ def __init__(self, db: Session): # ) # except SQLAlchemyError as e: # logger.error(f"DB error finding pending KeywordSearchSessions: {e}", exc_info=True) - # raise \ No newline at end of file + # raise diff --git a/backend/data/repositories/owner_repo.py b/backend/data/repositories/owner_repo.py index 2841de0..d4a317d 100644 --- a/backend/data/repositories/owner_repo.py +++ b/backend/data/repositories/owner_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Owner # The specific SQLAlchemy model +from backend.data.models import Owner # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class OwnerRepository(BaseRepository[Owner]): """ Repository dedicated to CRUD and specific query operations for Owner entities. 
@@ -52,13 +53,22 @@ def get_by_github_id(self, *, github_id: int) -> Optional[Owner]: logger.debug(f"Getting Owner by github_id: {github_id}") # Session activity check for debugging transactional issues. if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for Owner GH ID {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for Owner GH ID {github_id}" + ) return None try: # Standard query filtering by the github_id column. - return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for Owner {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for Owner {github_id}: {e}", + exc_info=True, + ) raise def get_by_login(self, *, login: str) -> Optional[Owner]: @@ -76,14 +86,19 @@ def get_by_login(self, *, login: str) -> Optional[Owner]: """ logger.debug(f"Getting Owner by login: {login}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_login for Owner login '{login}'") - return None + logger.warning( + f"Session is inactive in get_by_login for Owner login '{login}'" + ) + return None try: # Query filtering by the login column. return self.db.query(self.model).filter(self.model.login == login).first() except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_login for Owner {login}: {e}", exc_info=True) - raise + logger.error( + f"SQLAlchemyError during get_by_login for Owner {login}: {e}", + exc_info=True, + ) + raise def get_or_create_by_github_id( self, *, github_id: int, obj_in_data: Dict[str, Any] @@ -123,10 +138,14 @@ def get_or_create_by_github_id( The caller should handle rollback. 
""" if not github_id: - raise ValueError("github_id cannot be empty for Owner get_or_create") + raise ValueError("github_id cannot be empty for Owner get_or_create") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_github_id for Owner.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_github_id for Owner." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -134,14 +153,20 @@ def get_or_create_by_github_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Owner GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Owner GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False - new_login = obj_in_data.get('login') + new_login = obj_in_data.get("login") # Check if login needs update and handle potential uniqueness conflicts. if new_login and db_obj.login != new_login: - if not self.db.is_active: # Re-check session state before next query - raise RuntimeError("Session became inactive before login conflict check during owner update.") + if ( + not self.db.is_active + ): # Re-check session state before next query + raise RuntimeError( + "Session became inactive before login conflict check during owner update." + ) existing_login_owner = self.get_by_login(login=new_login) if existing_login_owner and existing_login_owner.id != db_obj.id: # Log the conflict but skip the update to avoid DB error. @@ -151,47 +176,64 @@ def get_or_create_by_github_id( f"because it's already assigned to Owner DB ID {existing_login_owner.id}. Skipping login update." 
) else: - logger.info(f"Updating login for Owner {db_obj.id} from '{db_obj.login}' to '{new_login}'") + logger.info( + f"Updating login for Owner {db_obj.id} from '{db_obj.login}' to '{new_login}'" + ) db_obj.login = new_login updated = True # Check and update other fields if they differ. - if obj_in_data.get('type') is not None and db_obj.type != obj_in_data.get('type'): - db_obj.type = obj_in_data['type'] + if obj_in_data.get( + "type" + ) is not None and db_obj.type != obj_in_data.get("type"): + db_obj.type = obj_in_data["type"] + updated = True + if obj_in_data.get( + "avatar_url" + ) is not None and db_obj.avatar_url != obj_in_data.get("avatar_url"): + db_obj.avatar_url = obj_in_data["avatar_url"] + updated = True + if obj_in_data.get( + "html_url" + ) is not None and db_obj.html_url != obj_in_data.get("html_url"): + db_obj.html_url = obj_in_data["html_url"] updated = True - if obj_in_data.get('avatar_url') is not None and db_obj.avatar_url != obj_in_data.get('avatar_url'): - db_obj.avatar_url = obj_in_data['avatar_url'] - updated = True - if obj_in_data.get('html_url') is not None and db_obj.html_url != obj_in_data.get('html_url'): - db_obj.html_url = obj_in_data['html_url'] - updated = True # Add checks for other relevant fields... if updated: - self.db.add(db_obj) # Add to session to mark dirty for commit. - logger.info(f"Owner {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Add to session to mark dirty for commit. + logger.info( + f"Owner {db_obj.id} marked for update in the current session." + ) # Optional flush/refresh if caller needs immediate DB state. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing (potentially updated) owner. + return db_obj # Return the existing (potentially updated) owner. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Owner with GH ID {github_id} not found. 
Preparing to create new.") + logger.debug( + f"Owner with GH ID {github_id} not found. Preparing to create new." + ) # Ensure the github_id is included in the data used for creation. obj_in_data["github_id"] = github_id - new_obj = self.model(**obj_in_data) # Create a new model instance. - self.db.add(new_obj) # Add the new object to the session. + new_obj = self.model(**obj_in_data) # Create a new model instance. + self.db.add(new_obj) # Add the new object to the session. # Flush the session: sends INSERT, assigns PK, checks constraints. self.db.flush() # Refresh the instance: loads DB-generated values (e.g., defaults). self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Owner GH ID {github_id} (DB ID: {new_obj.id})") - return new_obj # Return the newly created owner. + logger.info( + f"Successfully created and flushed new Owner GH ID {github_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the newly created owner. except SQLAlchemyError as e: # Log the error encountered during the get_or_create process. - logger.error(f"SQLAlchemyError during get_or_create for Owner GH ID {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Owner GH ID {github_id}: {e}", + exc_info=True, + ) # Critical: Do NOT rollback here. The caller manages the transaction. # self.db.rollback() # <-- Avoid rollback in repository methods. - raise # Re-raise the exception for the caller to handle. \ No newline at end of file + raise # Re-raise the exception for the caller to handle. 
diff --git a/backend/data/repositories/person_repo.py b/backend/data/repositories/person_repo.py index 4ec5240..8cecd44 100644 --- a/backend/data/repositories/person_repo.py +++ b/backend/data/repositories/person_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Person # The specific SQLAlchemy model +from backend.data.models import Person # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class PersonRepository(BaseRepository[Person]): """ Repository for managing Person entities, including CRUD and specific queries. @@ -52,13 +53,22 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Person]: """ logger.debug(f"Getting Person by openalex_id: {openalex_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for Person OA ID {openalex_id}") + logger.warning( + f"Session is inactive in get_by_openalex_id for Person OA ID {openalex_id}" + ) return None try: # Query based on the OpenAlex ID. 
- return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for Person {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_openalex_id for Person {openalex_id}: {e}", + exc_info=True, + ) raise def get_by_orcid(self, *, orcid: str) -> Optional[Person]: @@ -76,13 +86,18 @@ def get_by_orcid(self, *, orcid: str) -> Optional[Person]: """ logger.debug(f"Getting Person by orcid: {orcid}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_orcid for Person ORCID {orcid}") + logger.warning( + f"Session is inactive in get_by_orcid for Person ORCID {orcid}" + ) return None try: # Query based on the ORCID. Assumes ORCID is unique or the first match is desired. return self.db.query(self.model).filter(self.model.orcid == orcid).first() except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_orcid for Person {orcid}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_orcid for Person {orcid}: {e}", + exc_info=True, + ) raise def get_or_create_by_openalex_id( @@ -123,8 +138,12 @@ def get_or_create_by_openalex_id( if not openalex_id: raise ValueError("openalex_id cannot be empty for Person get_or_create") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Person.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Person." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." 
+ ) try: # --- Step 1: Query First by OpenAlex ID --- @@ -132,43 +151,61 @@ def get_or_create_by_openalex_id( if db_obj: # --- Step 2a: Found by OA ID - Update Check --- - logger.debug(f"Found existing Person by OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Person by OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False new_orcid = obj_in_data.get("orcid") # Update ORCID if provided and different, checking for conflicts. if new_orcid and db_obj.orcid != new_orcid: - if not self.db.is_active: # Re-check session before dependent query - raise RuntimeError("Session became inactive before ORCID conflict check.") - existing_orcid_person = self.get_by_orcid(orcid=new_orcid) - if existing_orcid_person and existing_orcid_person.id != db_obj.id: - # Log conflict but don't update ORCID to avoid unique constraint error. - logger.warning( - f"Cannot update ORCID for Person OA ID {openalex_id} (DB ID {db_obj.id}) to '{new_orcid}' " - f"because it is already assigned to Person DB ID {existing_orcid_person.id}. Skipping ORCID update." - ) - else: - logger.info(f"Updating ORCID for Person {db_obj.id} from '{db_obj.orcid}' to '{new_orcid}'") - db_obj.orcid = new_orcid - updated = True + if not self.db.is_active: # Re-check session before dependent query + raise RuntimeError( + "Session became inactive before ORCID conflict check." + ) + existing_orcid_person = self.get_by_orcid(orcid=new_orcid) + if existing_orcid_person and existing_orcid_person.id != db_obj.id: + # Log conflict but don't update ORCID to avoid unique constraint error. + logger.warning( + f"Cannot update ORCID for Person OA ID {openalex_id} (DB ID {db_obj.id}) to '{new_orcid}' " + f"because it is already assigned to Person DB ID {existing_orcid_person.id}. Skipping ORCID update." 
+ ) + else: + logger.info( + f"Updating ORCID for Person {db_obj.id} from '{db_obj.orcid}' to '{new_orcid}'" + ) + db_obj.orcid = new_orcid + updated = True # Update other fields if provided and different. - if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] updated = True # Note: Comparing JSON fields requires careful handling depending on DB backend and exact structure. - if obj_in_data.get('display_name_alternatives') is not None and db_obj.display_name_alternatives != obj_in_data.get('display_name_alternatives'): - db_obj.display_name_alternatives = obj_in_data['display_name_alternatives'] - updated = True + if obj_in_data.get( + "display_name_alternatives" + ) is not None and db_obj.display_name_alternatives != obj_in_data.get( + "display_name_alternatives" + ): + db_obj.display_name_alternatives = obj_in_data[ + "display_name_alternatives" + ] + updated = True # Add other updatable fields... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Person {db_obj.id} (found by OA ID) marked for update.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Person {db_obj.id} (found by OA ID) marked for update." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the instance found by OA ID. + return db_obj # Return the instance found by OA ID. else: # --- Step 2b: Not Found by OA ID - Check ORCID --- @@ -178,43 +215,64 @@ def get_or_create_by_openalex_id( db_obj_orcid = self.get_by_orcid(orcid=orcid_to_check) if db_obj_orcid: # --- Step 4: Found by ORCID - Update with OA ID --- - logger.warning(f"Person not found by OA ID {openalex_id}, but found existing " - f"Person DB ID {db_obj_orcid.id} by ORCID {orcid_to_check}. 
Attempting to merge/update.") + logger.warning( + f"Person not found by OA ID {openalex_id}, but found existing " + f"Person DB ID {db_obj_orcid.id} by ORCID {orcid_to_check}. Attempting to merge/update." + ) updated = False # Add the OpenAlex ID if it was missing on the record found by ORCID. if not db_obj_orcid.openalex_id: - logger.info(f"Updating missing OA ID for Person {db_obj_orcid.id} (found by ORCID {orcid_to_check}) to {openalex_id}") + logger.info( + f"Updating missing OA ID for Person {db_obj_orcid.id} (found by ORCID {orcid_to_check}) to {openalex_id}" + ) db_obj_orcid.openalex_id = openalex_id updated = True # Potentially update other fields if they were missing on the ORCID-found record. - if obj_in_data.get('display_name') is not None and db_obj_orcid.display_name is None: - db_obj_orcid.display_name = obj_in_data['display_name'] + if ( + obj_in_data.get("display_name") is not None + and db_obj_orcid.display_name is None + ): + db_obj_orcid.display_name = obj_in_data["display_name"] + updated = True + if ( + obj_in_data.get("display_name_alternatives") is not None + and db_obj_orcid.display_name_alternatives is None + ): + db_obj_orcid.display_name_alternatives = obj_in_data[ + "display_name_alternatives" + ] updated = True - if obj_in_data.get('display_name_alternatives') is not None and db_obj_orcid.display_name_alternatives is None: - db_obj_orcid.display_name_alternatives = obj_in_data['display_name_alternatives'] - updated = True # Add other fields... if updated: - self.db.add(db_obj_orcid) # Mark for update. - logger.info(f"Person {db_obj_orcid.id} (found by ORCID) marked for update with OA ID {openalex_id}.") + self.db.add(db_obj_orcid) # Mark for update. + logger.info( + f"Person {db_obj_orcid.id} (found by ORCID) marked for update with OA ID {openalex_id}." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj_orcid) - return db_obj_orcid # Return the instance found by ORCID. 
+ return db_obj_orcid # Return the instance found by ORCID. # --- Step 5: Not Found by OA ID or ORCID - Create New --- - logger.debug(f"Person OA ID {openalex_id} (and ORCID {orcid_to_check or 'N/A'}) not found. Creating new.") - obj_in_data["openalex_id"] = openalex_id # Ensure OA ID is set. - new_obj = self.model(**obj_in_data) # Create instance. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT. - self.db.refresh(new_obj) # Load DB defaults. - logger.info(f"Successfully created and flushed new Person OA ID {openalex_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.debug( + f"Person OA ID {openalex_id} (and ORCID {orcid_to_check or 'N/A'}) not found. Creating new." + ) + obj_in_data["openalex_id"] = openalex_id # Ensure OA ID is set. + new_obj = self.model(**obj_in_data) # Create instance. + self.db.add(new_obj) # Add to session. + self.db.flush() # Send INSERT. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new Person OA ID {openalex_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create_by_openalex_id for Person OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create_by_openalex_id for Person OA ID {openalex_id}: {e}", + exc_info=True, + ) # Caller handles rollback. raise @@ -254,10 +312,14 @@ def get_or_create_by_orcid( SQLAlchemyError: If any database operation fails. """ if not orcid: - raise ValueError("ORCID must be provided for get_or_create_by_orcid") + raise ValueError("ORCID must be provided for get_or_create_by_orcid") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_orcid for Person.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_orcid for Person." 
+ ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First by ORCID --- @@ -265,84 +327,125 @@ def get_or_create_by_orcid( if db_obj: # --- Step 2a: Found by ORCID - Update Check --- - logger.debug(f"Found existing Person by ORCID {orcid} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Person by ORCID {orcid} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False new_oa_id = obj_in_data.get("openalex_id") # Update OpenAlex ID if provided and different, checking for conflicts. if new_oa_id and db_obj.openalex_id != new_oa_id: - if not self.db.is_active: # Re-check session - raise RuntimeError("Session inactive before OA ID check during ORCID-based update.") + if not self.db.is_active: # Re-check session + raise RuntimeError( + "Session inactive before OA ID check during ORCID-based update." + ) existing_oa_person = self.get_by_openalex_id(openalex_id=new_oa_id) if existing_oa_person and existing_oa_person.id != db_obj.id: # Log conflict, skip OA ID update. - logger.warning(f"Cannot update OA ID for Person ORCID {orcid} (DB ID {db_obj.id}) to {new_oa_id} " - f"because it's already assigned to Person DB ID {existing_oa_person.id}. Skipping OA ID update.") + logger.warning( + f"Cannot update OA ID for Person ORCID {orcid} (DB ID {db_obj.id}) to {new_oa_id} " + f"because it's already assigned to Person DB ID {existing_oa_person.id}. Skipping OA ID update." + ) else: - logger.info(f"Updating OA ID for Person {db_obj.id} from '{db_obj.openalex_id}' to '{new_oa_id}'") + logger.info( + f"Updating OA ID for Person {db_obj.id} from '{db_obj.openalex_id}' to '{new_oa_id}'" + ) db_obj.openalex_id = new_oa_id updated = True # Update other fields if provided and different. 
- if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] + updated = True + if obj_in_data.get( + "display_name_alternatives" + ) is not None and db_obj.display_name_alternatives != obj_in_data.get( + "display_name_alternatives" + ): + db_obj.display_name_alternatives = obj_in_data[ + "display_name_alternatives" + ] updated = True - if obj_in_data.get('display_name_alternatives') is not None and db_obj.display_name_alternatives != obj_in_data.get('display_name_alternatives'): - db_obj.display_name_alternatives = obj_in_data['display_name_alternatives'] - updated = True # Add other updatable fields ... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Person {db_obj.id} (found by ORCID) marked for update.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Person {db_obj.id} (found by ORCID) marked for update." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return instance found by ORCID. + return db_obj # Return instance found by ORCID. else: - # --- Step 2b: Not Found by ORCID - Check OpenAlex ID --- + # --- Step 2b: Not Found by ORCID - Check OpenAlex ID --- oa_id_to_check = obj_in_data.get("openalex_id") if oa_id_to_check: # --- Step 3: Query by OpenAlex ID --- db_obj_oa = self.get_by_openalex_id(openalex_id=oa_id_to_check) if db_obj_oa: # --- Step 4: Found by OA ID - Update with ORCID --- - logger.warning(f"Person not found by ORCID {orcid}, but found existing " - f"Person DB ID {db_obj_oa.id} by OA ID {oa_id_to_check}. Attempting to merge/update.") + logger.warning( + f"Person not found by ORCID {orcid}, but found existing " + f"Person DB ID {db_obj_oa.id} by OA ID {oa_id_to_check}. Attempting to merge/update." 
+ ) updated = False # Add the ORCID if it was missing. if not db_obj_oa.orcid: - logger.info(f"Updating missing ORCID for Person {db_obj_oa.id} (found by OA ID {oa_id_to_check}) to {orcid}") + logger.info( + f"Updating missing ORCID for Person {db_obj_oa.id} (found by OA ID {oa_id_to_check}) to {orcid}" + ) db_obj_oa.orcid = orcid updated = True # Potentially update other fields if missing. - if obj_in_data.get('display_name') is not None and db_obj_oa.display_name is None: - db_obj_oa.display_name = obj_in_data['display_name'] + if ( + obj_in_data.get("display_name") is not None + and db_obj_oa.display_name is None + ): + db_obj_oa.display_name = obj_in_data["display_name"] + updated = True + if ( + obj_in_data.get("display_name_alternatives") is not None + and db_obj_oa.display_name_alternatives is None + ): + db_obj_oa.display_name_alternatives = obj_in_data[ + "display_name_alternatives" + ] updated = True - if obj_in_data.get('display_name_alternatives') is not None and db_obj_oa.display_name_alternatives is None: - db_obj_oa.display_name_alternatives = obj_in_data['display_name_alternatives'] - updated = True # Add other fields ... if updated: - self.db.add(db_obj_oa) # Mark for update. - logger.info(f"Person {db_obj_oa.id} (found by OA ID) marked for update with ORCID {orcid}.") + self.db.add(db_obj_oa) # Mark for update. + logger.info( + f"Person {db_obj_oa.id} (found by OA ID) marked for update with ORCID {orcid}." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj_oa) - return db_obj_oa # Return instance found by OA ID. + return db_obj_oa # Return instance found by OA ID. # --- Step 5: Not Found by ORCID or OA ID - Create New --- - logger.debug(f"Person ORCID {orcid} (and OA ID {oa_id_to_check or 'N/A'}) not found. Creating new.") - obj_in_data["orcid"] = orcid # Ensure ORCID is set. - new_obj = self.model(**obj_in_data) # Create instance. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT. 
- self.db.refresh(new_obj) # Load DB defaults. - logger.info(f"Successfully created and flushed new Person ORCID {orcid} (DB ID: {new_obj.id})") - return new_obj # Return new instance. + logger.debug( + f"Person ORCID {orcid} (and OA ID {oa_id_to_check or 'N/A'}) not found. Creating new." + ) + obj_in_data["orcid"] = orcid # Ensure ORCID is set. + new_obj = self.model(**obj_in_data) # Create instance. + self.db.add(new_obj) # Add to session. + self.db.flush() # Send INSERT. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new Person ORCID {orcid} (DB ID: {new_obj.id})" + ) + return new_obj # Return new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create_by_orcid for Person ORCID {orcid}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create_by_orcid for Person ORCID {orcid}: {e}", + exc_info=True, + ) # Caller handles rollback. - raise \ No newline at end of file + raise diff --git a/backend/data/repositories/pr_review_comment_repo.py b/backend/data/repositories/pr_review_comment_repo.py index 83dde0c..a68da8b 100644 --- a/backend/data/repositories/pr_review_comment_repo.py +++ b/backend/data/repositories/pr_review_comment_repo.py @@ -6,6 +6,7 @@ Provides data access operations for the PRReviewComment model, representing comments made as part of a GitHub Pull Request review. """ + import logging from typing import Optional, Dict, Any @@ -13,10 +14,11 @@ from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import PRReviewComment # The specific SQLAlchemy model +from backend.data.models import PRReviewComment # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class PRReviewCommentRepository(BaseRepository[PRReviewComment]): """ Repository dedicated to managing Pull Request Review Comment entities. 
@@ -52,13 +54,22 @@ def get_by_github_id(self, *, github_id: int) -> Optional[PRReviewComment]: logger.debug(f"Getting PRReviewComment by github_id: {github_id}") # Check for active session to help debug potential transaction issues. if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for PRReviewComment {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for PRReviewComment {github_id}" + ) return None try: # Query the PRReviewComment model filtering by the unique github_id. - return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for PRReviewComment {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for PRReviewComment {github_id}: {e}", + exc_info=True, + ) raise def get_or_create_by_github_id( @@ -97,55 +108,79 @@ def get_or_create_by_github_id( SQLAlchemyError: If any database interaction (query, add, flush, refresh) fails. """ if not github_id: - raise ValueError("github_id cannot be empty for PRReviewComment get_or_create") + raise ValueError( + "github_id cannot be empty for PRReviewComment get_or_create" + ) if not self.db.is_active: - logger.error(f"Session is inactive at start of get_or_create_by_github_id for PRReviewComment {github_id}.") - raise RuntimeError("Database session is inactive for PRReviewComment get_or_create.") + logger.error( + f"Session is inactive at start of get_or_create_by_github_id for PRReviewComment {github_id}." + ) + raise RuntimeError( + "Database session is inactive for PRReviewComment get_or_create." 
+ ) # --- Step 1: Query First --- db_obj = self.get_by_github_id(github_id=github_id) if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing PRReviewComment GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing PRReviewComment GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check if comment body has changed. - if obj_in_data.get('body') is not None and db_obj.body != obj_in_data.get('body'): - db_obj.body = obj_in_data['body'] + if obj_in_data.get("body") is not None and db_obj.body != obj_in_data.get( + "body" + ): + db_obj.body = obj_in_data["body"] updated = True # Check if the GitHub update timestamp has changed. - if obj_in_data.get('gh_updated_at') is not None and db_obj.gh_updated_at != obj_in_data.get('gh_updated_at'): - db_obj.gh_updated_at = obj_in_data['gh_updated_at'] + if obj_in_data.get( + "gh_updated_at" + ) is not None and db_obj.gh_updated_at != obj_in_data.get("gh_updated_at"): + db_obj.gh_updated_at = obj_in_data["gh_updated_at"] updated = True # Check if the associated review ID has changed (less likely, but possible). - if obj_in_data.get('pull_request_review_id') is not None and db_obj.pull_request_review_id != obj_in_data.get('pull_request_review_id'): - db_obj.pull_request_review_id = obj_in_data['pull_request_review_id'] - updated = True + if obj_in_data.get( + "pull_request_review_id" + ) is not None and db_obj.pull_request_review_id != obj_in_data.get( + "pull_request_review_id" + ): + db_obj.pull_request_review_id = obj_in_data["pull_request_review_id"] + updated = True # Add checks for other potentially updatable fields if needed. if updated: - self.db.add(db_obj) # Mark the instance as dirty. - logger.info(f"PRReviewComment {db_obj.id} marked for update in the current session.") - # Optional flush/refresh could go here if caller needs immediate DB state. - return db_obj # Return the existing instance. 
+ self.db.add(db_obj) # Mark the instance as dirty. + logger.info( + f"PRReviewComment {db_obj.id} marked for update in the current session." + ) + # Optional flush/refresh could go here if caller needs immediate DB state. + return db_obj # Return the existing instance. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"PRReviewComment GH ID {github_id} not found. Preparing to create new.") + logger.debug( + f"PRReviewComment GH ID {github_id} not found. Preparing to create new." + ) # Validate required foreign keys for creation. - if 'pr_id' not in obj_in_data or 'user_id' not in obj_in_data: - raise ValueError(f"Missing required 'pr_id' or 'user_id' in obj_in_data for creating new PRReviewComment with GH ID {github_id}") + if "pr_id" not in obj_in_data or "user_id" not in obj_in_data: + raise ValueError( + f"Missing required 'pr_id' or 'user_id' in obj_in_data for creating new PRReviewComment with GH ID {github_id}" + ) # Ensure the github_id is included in the data for the new object. obj_in_data["github_id"] = github_id - new_obj = self.model(**obj_in_data) # Instantiate the new comment. - self.db.add(new_obj) # Add to the session. + new_obj = self.model(**obj_in_data) # Instantiate the new comment. + self.db.add(new_obj) # Add to the session. # Flush to send INSERT to DB, assign PK, check FK constraints. self.db.flush() # Refresh to load any DB-generated values. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new PRReviewComment GH ID {github_id} (DB ID: {new_obj.id})") - return new_obj # Return the newly created instance. + logger.info( + f"Successfully created and flushed new PRReviewComment GH ID {github_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the newly created instance. # Note: SQLAlchemyError handling from underlying operations like # get_by_github_id, flush, refresh will propagate up. The caller - # is responsible for handling these and managing the transaction. 
\ No newline at end of file + # is responsible for handling these and managing the transaction. diff --git a/backend/data/repositories/pull_request_repo.py b/backend/data/repositories/pull_request_repo.py index 8102f04..aa6b95b 100644 --- a/backend/data/repositories/pull_request_repo.py +++ b/backend/data/repositories/pull_request_repo.py @@ -6,6 +6,7 @@ Provides data access operations for the PullRequest model, representing GitHub Pull Requests associated with tracked repositories. """ + import logging from typing import Optional, Dict, Any @@ -13,10 +14,11 @@ from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import PullRequest # Import the specific model +from backend.data.models import PullRequest # Import the specific model logger = logging.getLogger(__name__) + class PullRequestRepository(BaseRepository[PullRequest]): """ Repository for managing PullRequest entities, including CRUD and specific queries. @@ -53,13 +55,22 @@ def get_by_github_id(self, *, github_id: int) -> Optional[PullRequest]: """ logger.debug(f"Getting PullRequest by github_id: {github_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for PullRequest {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for PullRequest {github_id}" + ) return None try: # Use self.model (set to PullRequest in __init__) for the query. 
- return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for PullRequest {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for PullRequest {github_id}: {e}", + exc_info=True, + ) raise def get_or_create_by_github_id( @@ -100,8 +111,12 @@ def get_or_create_by_github_id( if not github_id: raise ValueError("github_id cannot be empty for PullRequest get_or_create") if not self.db.is_active: - logger.error(f"Session is inactive at start of get_or_create_by_github_id for PullRequest {github_id}.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + f"Session is inactive at start of get_or_create_by_github_id for PullRequest {github_id}." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -109,53 +124,82 @@ def get_or_create_by_github_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing PullRequest GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing PullRequest GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check and update common fields that might change. 
- if obj_in_data.get('title') is not None and db_obj.title != obj_in_data.get('title'): - db_obj.title = obj_in_data['title'] + if obj_in_data.get( + "title" + ) is not None and db_obj.title != obj_in_data.get("title"): + db_obj.title = obj_in_data["title"] updated = True - if obj_in_data.get('state') is not None and db_obj.state != obj_in_data.get('state'): - db_obj.state = obj_in_data['state'] + if obj_in_data.get( + "state" + ) is not None and db_obj.state != obj_in_data.get("state"): + db_obj.state = obj_in_data["state"] updated = True - if obj_in_data.get('gh_updated_at') is not None and db_obj.gh_updated_at != obj_in_data.get('gh_updated_at'): - db_obj.gh_updated_at = obj_in_data['gh_updated_at'] + if obj_in_data.get( + "gh_updated_at" + ) is not None and db_obj.gh_updated_at != obj_in_data.get( + "gh_updated_at" + ): + db_obj.gh_updated_at = obj_in_data["gh_updated_at"] updated = True # Ensure timestamps that can be nullified (like closed/merged) are handled correctly. - if obj_in_data.get('gh_closed_at') is not None and db_obj.gh_closed_at != obj_in_data.get('gh_closed_at'): - db_obj.gh_closed_at = obj_in_data['gh_closed_at'] + if obj_in_data.get( + "gh_closed_at" + ) is not None and db_obj.gh_closed_at != obj_in_data.get( + "gh_closed_at" + ): + db_obj.gh_closed_at = obj_in_data["gh_closed_at"] updated = True - if obj_in_data.get('gh_merged_at') is not None and db_obj.gh_merged_at != obj_in_data.get('gh_merged_at'): - db_obj.gh_merged_at = obj_in_data['gh_merged_at'] + if obj_in_data.get( + "gh_merged_at" + ) is not None and db_obj.gh_merged_at != obj_in_data.get( + "gh_merged_at" + ): + db_obj.gh_merged_at = obj_in_data["gh_merged_at"] updated = True # Add other relevant fields like labels, assignees, body, merge commit SHA etc. if updated: - self.db.add(db_obj) # Mark as dirty in the session. - logger.info(f"PullRequest {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark as dirty in the session. 
+ logger.info( + f"PullRequest {db_obj.id} marked for update in the current session." + ) # Optional: Flush and refresh if immediate state needed by caller. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing instance. + return db_obj # Return the existing instance. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"PullRequest GH ID {github_id} not found. Preparing to create new.") + logger.debug( + f"PullRequest GH ID {github_id} not found. Preparing to create new." + ) # Validate presence of required foreign keys for creation. - if 'repository_id' not in obj_in_data or 'user_id' not in obj_in_data: - raise ValueError(f"Missing required 'repository_id' or 'user_id' in obj_in_data for creating new PullRequest with GH ID {github_id}") + if "repository_id" not in obj_in_data or "user_id" not in obj_in_data: + raise ValueError( + f"Missing required 'repository_id' or 'user_id' in obj_in_data for creating new PullRequest with GH ID {github_id}" + ) # Ensure github_id is set in the creation data. obj_in_data["github_id"] = github_id - new_obj = self.model(**obj_in_data) # Instantiate the new PR. - self.db.add(new_obj) # Add to session. + new_obj = self.model(**obj_in_data) # Instantiate the new PR. + self.db.add(new_obj) # Add to session. # Flush: Send INSERT, get PK, check FK constraints. self.db.flush() # Refresh: Load DB defaults/generated values. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new PullRequest GH ID {github_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.info( + f"Successfully created and flushed new PullRequest GH ID {github_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. 
except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for PullRequest GH ID {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for PullRequest GH ID {github_id}: {e}", + exc_info=True, + ) # Rollback is handled by the caller. - raise # Re-raise the error. \ No newline at end of file + raise # Re-raise the error. diff --git a/backend/data/repositories/repository_institution_affiliation_repo.py b/backend/data/repositories/repository_institution_affiliation_repo.py index 09e2769..ad8d6f7 100644 --- a/backend/data/repositories/repository_institution_affiliation_repo.py +++ b/backend/data/repositories/repository_institution_affiliation_repo.py @@ -13,13 +13,15 @@ from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError + # from sqlalchemy import func # Uncomment if using func.now() as server_default -from datetime import datetime, timezone # Used for manually setting timestamps +from datetime import datetime, timezone # Used for manually setting timestamps -from backend.data.models import RepositoryInstitutionAffiliation # The specific model +from backend.data.models import RepositoryInstitutionAffiliation # The specific model logger = logging.getLogger(__name__) + class RepositoryInstitutionAffiliationRepository: """ Repository for managing RepositoryInstitutionAffiliation records. @@ -30,6 +32,7 @@ class RepositoryInstitutionAffiliationRepository: primary key (repository_id, institution_id, algorithm_name, algorithm_version) and implements a specific create-or-update logic. """ + def __init__(self, db: Session): """ Initializes the RepositoryInstitutionAffiliationRepository. @@ -46,7 +49,7 @@ def get_affiliation( repository_id: int, institution_id: int, algorithm_name: str, - algorithm_version: str + algorithm_version: str, ) -> Optional[RepositoryInstitutionAffiliation]: """ Retrieves a specific affiliation record using its composite primary key. 
@@ -69,8 +72,10 @@ def get_affiliation( # Session.get is efficient for primary key lookups, including composite keys (passed as a tuple). return self.db.get(self.model, pk_tuple) except SQLAlchemyError as e: - logger.error(f"DB error getting affiliation for key {pk_tuple}: {e}", exc_info=True) - raise # Propagate the error for handling by the caller. + logger.error( + f"DB error getting affiliation for key {pk_tuple}: {e}", exc_info=True + ) + raise # Propagate the error for handling by the caller. def create_or_update_affiliation( self, @@ -81,7 +86,7 @@ def create_or_update_affiliation( algorithm_version: str, confidence_score: float, evidence: Optional[Dict[str, Any]] = None, - parameters_used: Optional[Dict[str, Any]] = None + parameters_used: Optional[Dict[str, Any]] = None, ) -> Tuple[RepositoryInstitutionAffiliation, bool]: """ Creates a new affiliation record or updates an existing one based on the composite PK. @@ -127,13 +132,18 @@ def create_or_update_affiliation( repository_id=repository_id, institution_id=institution_id, algorithm_name=algorithm_name, - algorithm_version=algorithm_version + algorithm_version=algorithm_version, ) - created = False # Flag to indicate if a new record was created. + created = False # Flag to indicate if a new record was created. # Get the current UTC time for the calculated_at timestamp. current_time = datetime.now(timezone.utc) - pk_tuple = (repository_id, institution_id, algorithm_name, algorithm_version) # For logging + pk_tuple = ( + repository_id, + institution_id, + algorithm_name, + algorithm_version, + ) # For logging if existing_affiliation: # --- Update Existing Record --- @@ -158,21 +168,29 @@ def create_or_update_affiliation( confidence_score=confidence_score, evidence=evidence, parameters_used=parameters_used, - calculated_at=current_time # Set timestamp on creation as well. + calculated_at=current_time, # Set timestamp on creation as well. ) # Mark created as True since we are inserting. 
created = True try: - self.db.add(db_obj) # Add the new or updated object to the session. + self.db.add(db_obj) # Add the new or updated object to the session. # Flush to send SQL (INSERT or UPDATE) to the database and check constraints. self.db.flush() # Refresh the object state to ensure it reflects any DB-side changes # (though less likely for this model unless triggers are used). self.db.refresh(db_obj) - logger.info(f"Successfully {'created' if created else 'updated'} and flushed affiliation for key: {pk_tuple}") - return db_obj, created # Return the object and the created/updated status flag. + logger.info( + f"Successfully {'created' if created else 'updated'} and flushed affiliation for key: {pk_tuple}" + ) + return ( + db_obj, + created, + ) # Return the object and the created/updated status flag. except SQLAlchemyError as e: - logger.error(f"DB error {'creating' if created else 'updating'} affiliation for key {pk_tuple}: {e}", exc_info=True) + logger.error( + f"DB error {'creating' if created else 'updating'} affiliation for key {pk_tuple}: {e}", + exc_info=True, + ) # Rollback should occur in the calling service layer / API endpoint. - raise # Re-raise the error. \ No newline at end of file + raise # Re-raise the error. 
diff --git a/backend/data/repositories/repository_repo.py b/backend/data/repositories/repository_repo.py index 5be456d..18c0d72 100644 --- a/backend/data/repositories/repository_repo.py +++ b/backend/data/repositories/repository_repo.py @@ -11,13 +11,17 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Repository, Owner # Import Owner for relationship handling +from backend.data.models import ( + Repository, + Owner, +) # Import Owner for relationship handling logger = logging.getLogger(__name__) + class RepositoryRepository(BaseRepository[Repository]): """ Repository for managing Repository entities, including CRUD and specific queries. @@ -51,13 +55,22 @@ def get_by_github_id(self, *, github_id: int) -> Optional[Repository]: """ logger.debug(f"Getting Repository by github_id: {github_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_github_id for Repository GH ID {github_id}") + logger.warning( + f"Session is inactive in get_by_github_id for Repository GH ID {github_id}" + ) return None try: # Standard query filtering by the unique github_id. 
- return self.db.query(self.model).filter(self.model.github_id == github_id).first() + return ( + self.db.query(self.model) + .filter(self.model.github_id == github_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_github_id for Repository {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_github_id for Repository {github_id}: {e}", + exc_info=True, + ) raise def get_by_full_name(self, *, full_name: str) -> Optional[Repository]: @@ -75,17 +88,30 @@ def get_by_full_name(self, *, full_name: str) -> Optional[Repository]: """ logger.debug(f"Getting Repository by full_name: {full_name}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_full_name for Repository '{full_name}'") + logger.warning( + f"Session is inactive in get_by_full_name for Repository '{full_name}'" + ) return None try: # Query filtering by the full_name, which should ideally be unique. - return self.db.query(self.model).filter(self.model.full_name == full_name).first() + return ( + self.db.query(self.model) + .filter(self.model.full_name == full_name) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_full_name for Repository '{full_name}': {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_full_name for Repository '{full_name}': {e}", + exc_info=True, + ) raise def get_or_create_by_github_id( - self, *, github_id: int, obj_in_data: Dict[str, Any], owner_obj: Optional[Owner] = None + self, + *, + github_id: int, + obj_in_data: Dict[str, Any], + owner_obj: Optional[Owner] = None, ) -> Repository: """ Retrieves a repository by GitHub ID or creates a new one if not found. 
@@ -125,8 +151,12 @@ def get_or_create_by_github_id( if not github_id: raise ValueError("github_id cannot be empty for Repository get_or_create") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_github_id for Repository.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_github_id for Repository." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -134,14 +164,18 @@ def get_or_create_by_github_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Repository GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Repository GH ID {github_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False - new_full_name = obj_in_data.get('full_name') + new_full_name = obj_in_data.get("full_name") # Check if full_name needs update and handle potential uniqueness conflicts. if new_full_name and db_obj.full_name != new_full_name: - if not self.db.is_active: # Re-check session before dependent query - raise RuntimeError("Session inactive before full_name conflict check.") + if not self.db.is_active: # Re-check session before dependent query + raise RuntimeError( + "Session inactive before full_name conflict check." + ) existing_fn_repo = self.get_by_full_name(full_name=new_full_name) if existing_fn_repo and existing_fn_repo.id != db_obj.id: # Log conflict, skip full_name update to avoid potential unique constraint error. @@ -150,14 +184,22 @@ def get_or_create_by_github_id( f"because it's already assigned to Repository DB ID {existing_fn_repo.id}. Skipping full_name update." 
) else: - logger.info(f"Updating full_name for Repository {db_obj.id} from '{db_obj.full_name}' to '{new_full_name}'") + logger.info( + f"Updating full_name for Repository {db_obj.id} from '{db_obj.full_name}' to '{new_full_name}'" + ) db_obj.full_name = new_full_name updated = True # Update owner relationship if a valid owner object is provided and different. # Assumes owner_obj is already flushed and has an ID. - if owner_obj and owner_obj.id is not None and db_obj.owner_id != owner_obj.id: - logger.info(f"Updating owner for Repository {db_obj.id} from owner_id {db_obj.owner_id} to owner_id {owner_obj.id}") + if ( + owner_obj + and owner_obj.id is not None + and db_obj.owner_id != owner_obj.id + ): + logger.info( + f"Updating owner for Repository {db_obj.id} from owner_id {db_obj.owner_id} to owner_id {owner_obj.id}" + ) db_obj.owner_id = owner_obj.id # Optionally update the relationship attribute directly if needed before commit, # although changing owner_id is often sufficient for SQLAlchemy. @@ -165,51 +207,74 @@ def get_or_create_by_github_id( updated = True # Update other repository attributes if provided and different. - if obj_in_data.get('description') is not None and db_obj.description != obj_in_data.get('description'): - db_obj.description = obj_in_data['description'] + if obj_in_data.get( + "description" + ) is not None and db_obj.description != obj_in_data.get("description"): + db_obj.description = obj_in_data["description"] updated = True - if obj_in_data.get('stargazers_count') is not None and db_obj.stargazers_count != obj_in_data.get('stargazers_count'): - db_obj.stargazers_count = obj_in_data['stargazers_count'] + if obj_in_data.get( + "stargazers_count" + ) is not None and db_obj.stargazers_count != obj_in_data.get( + "stargazers_count" + ): + db_obj.stargazers_count = obj_in_data["stargazers_count"] updated = True # Note: Comparison for JSON/Array fields like topics might need adjustment based on data type/DB. 
- if obj_in_data.get('topics') is not None and db_obj.topics != obj_in_data.get('topics'): - db_obj.topics = obj_in_data['topics'] + if obj_in_data.get( + "topics" + ) is not None and db_obj.topics != obj_in_data.get("topics"): + db_obj.topics = obj_in_data["topics"] + updated = True + if obj_in_data.get( + "license" + ) is not None and db_obj.license != obj_in_data.get("license"): + db_obj.license = obj_in_data["license"] updated = True - if obj_in_data.get('license') is not None and db_obj.license != obj_in_data.get('license'): - db_obj.license = obj_in_data['license'] - updated = True # Add other updatable fields (e.g., fork, archived, language, homepage)... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Repository {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Repository {db_obj.id} marked for update in the current session." + ) # Optional: Flush and refresh if needed. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing instance. + return db_obj # Return the existing instance. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Repository GH ID {github_id} not found. Preparing to create new.") + logger.debug( + f"Repository GH ID {github_id} not found. Preparing to create new." + ) # CRITICAL: Ensure a valid, flushed Owner object is provided for creation. if not owner_obj or owner_obj.id is None: - logger.error(f"Cannot create Repository GH ID {github_id}: Owner object is missing or not flushed (Owner ID: {getattr(owner_obj, 'id', 'None')}).") - raise ValueError("A flushed Owner object (with an assigned ID) must be provided via 'owner_obj' when creating a Repository.") + logger.error( + f"Cannot create Repository GH ID {github_id}: Owner object is missing or not flushed (Owner ID: {getattr(owner_obj, 'id', 'None')})." 
+ ) + raise ValueError( + "A flushed Owner object (with an assigned ID) must be provided via 'owner_obj' when creating a Repository." + ) # Ensure github_id is set in the creation data. obj_in_data["github_id"] = github_id - new_obj = self.model(**obj_in_data) # Create the Repository instance. + new_obj = self.model(**obj_in_data) # Create the Repository instance. # Assign the owner relationship. SQLAlchemy handles setting the owner_id FK based on this. new_obj.owner = owner_obj - self.db.add(new_obj) # Add to session. + self.db.add(new_obj) # Add to session. # Flush: Send INSERT, assign PK, check constraints (including FK to owner). self.db.flush() # Refresh: Load DB defaults. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Repository GH ID {github_id} (DB ID: {new_obj.id}) with owner_id {new_obj.owner_id}") - return new_obj # Return the new instance. + logger.info( + f"Successfully created and flushed new Repository GH ID {github_id} (DB ID: {new_obj.id}) with owner_id {new_obj.owner_id}" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for Repository GH ID {github_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Repository GH ID {github_id}: {e}", + exc_info=True, + ) # Caller handles rollback. - raise # Re-raise the error. \ No newline at end of file + raise # Re-raise the error. diff --git a/backend/data/repositories/software_dependency_repo.py b/backend/data/repositories/software_dependency_repo.py index 1dea183..bf03f07 100644 --- a/backend/data/repositories/software_dependency_repo.py +++ b/backend/data/repositories/software_dependency_repo.py @@ -6,6 +6,7 @@ Provides data access operations for the SoftwareDependency model, representing dependencies listed in project files (e.g., requirements.txt, package.json). 
""" + import logging from typing import Optional, List, Dict, Any @@ -13,10 +14,11 @@ from sqlalchemy.exc import SQLAlchemyError from .base_repository import BaseRepository -from backend.data.models import SoftwareDependency # The specific model +from backend.data.models import SoftwareDependency # The specific model logger = logging.getLogger(__name__) + class SoftwareDependencyRepository(BaseRepository[SoftwareDependency]): """ Repository for managing SoftwareDependency entities. @@ -56,24 +58,27 @@ def find_by_repository_and_name( Raises: SQLAlchemyError: If a database error occurs during the query. """ - logger.debug(f"Finding dependency '{dependency_name}' from source '{source_file}' in repository {repository_id}") + logger.debug( + f"Finding dependency '{dependency_name}' from source '{source_file}' in repository {repository_id}" + ) try: return ( self.db.query(self.model) .filter( self.model.repository_id == repository_id, self.model.dependency_name == dependency_name, - self.model.source_file == source_file + self.model.source_file == source_file, ) - .first() # Expecting one or zero matches based on these fields. + .first() # Expecting one or zero matches based on these fields. ) except SQLAlchemyError as e: - logger.error(f"DB error finding dependency {dependency_name} in {source_file} for repo {repository_id}: {e}", exc_info=True) + logger.error( + f"DB error finding dependency {dependency_name} in {source_file} for repo {repository_id}: {e}", + exc_info=True, + ) raise - def get_or_create( - self, *, obj_in_data: Dict[str, Any] - ) -> SoftwareDependency: + def get_or_create(self, *, obj_in_data: Dict[str, Any]) -> SoftwareDependency: """ Retrieves a software dependency record or creates a new one if not found. @@ -108,8 +113,12 @@ def get_or_create( src_file = obj_in_data.get("source_file") # Validate required fields for lookup/creation. - if not all([repo_id, dep_name, src_file is not None]): # Allow empty string for source_file? Check constraints. 
- raise ValueError("repository_id, dependency_name, and source_file must be provided in obj_in_data for SoftwareDependency get_or_create") + if not all( + [repo_id, dep_name, src_file is not None] + ): # Allow empty string for source_file? Check constraints. + raise ValueError( + "repository_id, dependency_name, and source_file must be provided in obj_in_data for SoftwareDependency get_or_create" + ) # --- Step 1: Query First --- db_obj = self.find_by_repository_and_name( @@ -118,7 +127,9 @@ def get_or_create( if db_obj: # --- Step 2a: Record Found --- - logger.debug(f"Found existing dependency record: {dep_name} in {src_file} for repo {repo_id} (ID: {db_obj.id})") + logger.debug( + f"Found existing dependency record: {dep_name} in {src_file} for repo {repo_id} (ID: {db_obj.id})" + ) # --- Optional Update Logic --- # Example: Update version constraint if it has changed. # new_version = obj_in_data.get("version_constraint") @@ -127,24 +138,29 @@ def get_or_create( # db_obj.version_constraint = new_version # self.db.add(db_obj) # Mark as dirty if updated. # # Consider flushing/refreshing if updates are made. - return db_obj # Return existing object. + return db_obj # Return existing object. else: # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Creating new dependency record: {dep_name} in {src_file} for repo {repo_id}") + logger.debug( + f"Creating new dependency record: {dep_name} in {src_file} for repo {repo_id}" + ) try: - new_obj = self.model(**obj_in_data) # Instantiate new object. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT, get PK, check constraints. - self.db.refresh(new_obj) # Load DB defaults. - logger.info(f"Successfully created and flushed new dependency {new_obj.id} ({dep_name} in {src_file} for repo {repo_id})") - return new_obj # Return new object. + new_obj = self.model(**obj_in_data) # Instantiate new object. + self.db.add(new_obj) # Add to session. 
+ self.db.flush() # Send INSERT, get PK, check constraints. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new dependency {new_obj.id} ({dep_name} in {src_file} for repo {repo_id})" + ) + return new_obj # Return new object. except SQLAlchemyError as e: - logger.error(f"DB error creating dependency {dep_name} in {src_file} for repo {repo_id}: {e}", exc_info=True) - raise # Re-raise for caller to handle (and rollback). + logger.error( + f"DB error creating dependency {dep_name} in {src_file} for repo {repo_id}: {e}", + exc_info=True, + ) + raise # Re-raise for caller to handle (and rollback). - def find_by_repository( - self, *, repository_id: int - ) -> List[SoftwareDependency]: + def find_by_repository(self, *, repository_id: int) -> List[SoftwareDependency]: """ Finds all software dependencies declared within a specific repository. @@ -164,9 +180,14 @@ def find_by_repository( return ( self.db.query(self.model) .filter(self.model.repository_id == repository_id) - .order_by(self.model.source_file, self.model.dependency_name) # Order for consistent results. + .order_by( + self.model.source_file, self.model.dependency_name + ) # Order for consistent results. 
.all() ) except SQLAlchemyError as e: - logger.error(f"DB error finding dependencies for repo {repository_id}: {e}", exc_info=True) - raise \ No newline at end of file + logger.error( + f"DB error finding dependencies for repo {repository_id}: {e}", + exc_info=True, + ) + raise diff --git a/backend/data/repositories/subfield_repo.py b/backend/data/repositories/subfield_repo.py index c0f8ccd..2897e35 100644 --- a/backend/data/repositories/subfield_repo.py +++ b/backend/data/repositories/subfield_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Subfield # The specific SQLAlchemy model +from backend.data.models import Subfield # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class SubfieldRepository(BaseRepository[Subfield]): """ Repository managing CRUD and specific queries for Subfield entities. @@ -53,13 +54,22 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Subfield]: """ logger.debug(f"Getting Subfield by openalex_id: {openalex_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for Subfield OA ID {openalex_id}") + logger.warning( + f"Session is inactive in get_by_openalex_id for Subfield OA ID {openalex_id}" + ) return None try: # Standard query filtering by the unique OpenAlex ID. 
- return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for Subfield {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_openalex_id for Subfield {openalex_id}: {e}", + exc_info=True, + ) raise def get_or_create_by_openalex_id( @@ -98,8 +108,12 @@ def get_or_create_by_openalex_id( if not openalex_id: raise ValueError("openalex_id cannot be empty for Subfield get_or_create") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Subfield.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Subfield." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -107,52 +121,73 @@ def get_or_create_by_openalex_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Subfield OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Subfield OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check and update display name if provided and different. - if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] updated = True # Check and update description if provided and different. 
- if obj_in_data.get('description') is not None and db_obj.description != obj_in_data.get('description'): - db_obj.description = obj_in_data['description'] - updated = True + if obj_in_data.get( + "description" + ) is not None and db_obj.description != obj_in_data.get("description"): + db_obj.description = obj_in_data["description"] + updated = True # Check if the parent field_id needs updating. - new_field_id = obj_in_data.get('field_id') + new_field_id = obj_in_data.get("field_id") if new_field_id is not None and db_obj.field_id != new_field_id: - logger.warning(f"Subfield OA ID {openalex_id} exists but field_id mismatch detected. " - f"DB has {db_obj.field_id}, input data has {new_field_id}. Updating.") - db_obj.field_id = new_field_id - updated = True + logger.warning( + f"Subfield OA ID {openalex_id} exists but field_id mismatch detected. " + f"DB has {db_obj.field_id}, input data has {new_field_id}. Updating." + ) + db_obj.field_id = new_field_id + updated = True # Add other field update checks here if needed... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Subfield {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Subfield {db_obj.id} marked for update in the current session." + ) # Optional: Flush and refresh if immediate state needed by caller. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing instance. + return db_obj # Return the existing instance. else: - # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Subfield OA ID {openalex_id} not found. Preparing to create new.") + # --- Step 2b: Record Not Found - Create New --- + logger.debug( + f"Subfield OA ID {openalex_id} not found. Preparing to create new." + ) # CRITICAL: Ensure the foreign key `field_id` is present for creation. 
- if 'field_id' not in obj_in_data or obj_in_data['field_id'] is None: - raise ValueError(f"Missing required 'field_id' in obj_in_data for creating new Subfield with OA ID {openalex_id}") + if "field_id" not in obj_in_data or obj_in_data["field_id"] is None: + raise ValueError( + f"Missing required 'field_id' in obj_in_data for creating new Subfield with OA ID {openalex_id}" + ) # Ensure openalex_id is part of the creation data. obj_in_data["openalex_id"] = openalex_id - new_obj = self.model(**obj_in_data) # Create the instance. - self.db.add(new_obj) # Add to session. + new_obj = self.model(**obj_in_data) # Create the instance. + self.db.add(new_obj) # Add to session. # Flush: Send INSERT, get PK, check constraints (including FK to field). self.db.flush() # Refresh: Update object with DB defaults. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Subfield OA ID {openalex_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.info( + f"Successfully created and flushed new Subfield OA ID {openalex_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for Subfield OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Subfield OA ID {openalex_id}: {e}", + exc_info=True, + ) # Caller handles rollback. - raise # Re-raise the caught exception. \ No newline at end of file + raise # Re-raise the caught exception. 
diff --git a/backend/data/repositories/topic_repo.py b/backend/data/repositories/topic_repo.py index 0f3ba68..e2e3dfb 100644 --- a/backend/data/repositories/topic_repo.py +++ b/backend/data/repositories/topic_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Topic # The specific SQLAlchemy model +from backend.data.models import Topic # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class TopicRepository(BaseRepository[Topic]): """ Repository managing CRUD and specific queries for Topic entities. @@ -53,13 +54,22 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Topic]: """ logger.debug(f"Getting Topic by openalex_id: {openalex_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for Topic OA ID {openalex_id}") + logger.warning( + f"Session is inactive in get_by_openalex_id for Topic OA ID {openalex_id}" + ) return None try: # Standard query filtering by the unique OpenAlex ID. 
- return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for Topic {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_openalex_id for Topic {openalex_id}: {e}", + exc_info=True, + ) raise def get_or_create_by_openalex_id( @@ -98,8 +108,12 @@ def get_or_create_by_openalex_id( if not openalex_id: raise ValueError("openalex_id cannot be empty for Topic get_or_create") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Topic.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Topic." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First --- @@ -107,51 +121,78 @@ def get_or_create_by_openalex_id( if db_obj: # --- Step 2a: Record Found - Check for Updates --- - logger.debug(f"Found existing Topic OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Topic OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False # Check and update display name if provided and different. - if obj_in_data.get('display_name') is not None and db_obj.display_name != obj_in_data.get('display_name'): - db_obj.display_name = obj_in_data['display_name'] + if obj_in_data.get( + "display_name" + ) is not None and db_obj.display_name != obj_in_data.get( + "display_name" + ): + db_obj.display_name = obj_in_data["display_name"] updated = True # Check and update description if provided and different. 
- if obj_in_data.get('description') is not None and db_obj.description != obj_in_data.get('description'): - db_obj.description = obj_in_data['description'] - updated = True + if obj_in_data.get( + "description" + ) is not None and db_obj.description != obj_in_data.get("description"): + db_obj.description = obj_in_data["description"] + updated = True # Check if the parent subfield_id needs updating. - new_subfield_id = obj_in_data.get('subfield_id') - if new_subfield_id is not None and db_obj.subfield_id != new_subfield_id: - logger.warning(f"Topic OA ID {openalex_id} exists but subfield_id mismatch detected. " - f"DB has {db_obj.subfield_id}, input data has {new_subfield_id}. Updating.") - db_obj.subfield_id = new_subfield_id - updated = True + new_subfield_id = obj_in_data.get("subfield_id") + if ( + new_subfield_id is not None + and db_obj.subfield_id != new_subfield_id + ): + logger.warning( + f"Topic OA ID {openalex_id} exists but subfield_id mismatch detected. " + f"DB has {db_obj.subfield_id}, input data has {new_subfield_id}. Updating." + ) + db_obj.subfield_id = new_subfield_id + updated = True # Add other field update checks here if needed... if updated: - self.db.add(db_obj) # Mark as dirty. - logger.info(f"Topic {db_obj.id} marked for update in the current session.") + self.db.add(db_obj) # Mark as dirty. + logger.info( + f"Topic {db_obj.id} marked for update in the current session." + ) # Optional: Flush and refresh if immediate state needed by caller. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the existing instance. + return db_obj # Return the existing instance. else: - # --- Step 2b: Record Not Found - Create New --- - logger.debug(f"Topic OA ID {openalex_id} not found. Preparing to create new.") + # --- Step 2b: Record Not Found - Create New --- + logger.debug( + f"Topic OA ID {openalex_id} not found. Preparing to create new." + ) # CRITICAL: Ensure the foreign key `subfield_id` is present for creation. 
- if 'subfield_id' not in obj_in_data or obj_in_data['subfield_id'] is None: - raise ValueError(f"Missing required 'subfield_id' in obj_in_data for creating new Topic with OA ID {openalex_id}") + if ( + "subfield_id" not in obj_in_data + or obj_in_data["subfield_id"] is None + ): + raise ValueError( + f"Missing required 'subfield_id' in obj_in_data for creating new Topic with OA ID {openalex_id}" + ) # Ensure openalex_id is part of the creation data. obj_in_data["openalex_id"] = openalex_id - new_obj = self.model(**obj_in_data) # Create the instance. - self.db.add(new_obj) # Add to session. + new_obj = self.model(**obj_in_data) # Create the instance. + self.db.add(new_obj) # Add to session. # Flush: Send INSERT, get PK, check constraints (including FK to subfield). self.db.flush() # Refresh: Update object with DB defaults. self.db.refresh(new_obj) - logger.info(f"Successfully created and flushed new Topic OA ID {openalex_id} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.info( + f"Successfully created and flushed new Topic OA ID {openalex_id} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create for Topic OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create for Topic OA ID {openalex_id}: {e}", + exc_info=True, + ) # Caller handles rollback. - raise # Re-raise the caught exception. \ No newline at end of file + raise # Re-raise the caught exception. 
diff --git a/backend/data/repositories/work_repo.py b/backend/data/repositories/work_repo.py index df3fc6e..79dd0ff 100644 --- a/backend/data/repositories/work_repo.py +++ b/backend/data/repositories/work_repo.py @@ -11,13 +11,14 @@ from typing import Optional, Dict, Any from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception +from sqlalchemy.exc import SQLAlchemyError # General SQLAlchemy exception from .base_repository import BaseRepository -from backend.data.models import Work # The specific SQLAlchemy model +from backend.data.models import Work # The specific SQLAlchemy model logger = logging.getLogger(__name__) + class WorkRepository(BaseRepository[Work]): """ Repository managing CRUD and specific queries for Work entities. @@ -54,7 +55,8 @@ def get_by_doi(self, *, doi: str) -> Optional[Work]: Raises: SQLAlchemyError: If a database error occurs during the query. """ - if not doi: return None # Avoid querying with empty DOI. + if not doi: + return None # Avoid querying with empty DOI. logger.debug(f"Getting Work by DOI: {doi}") if not self.db.is_active: logger.warning(f"Session is inactive in get_by_doi for Work DOI {doi}") @@ -65,7 +67,9 @@ def get_by_doi(self, *, doi: str) -> Optional[Work]: # Consider `noload('*')` or `load_only()` if only the ID is needed frequently. return self.db.query(self.model).filter(self.model.doi == doi).first() except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_doi for Work {doi}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_doi for Work {doi}: {e}", exc_info=True + ) raise def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Work]: @@ -82,21 +86,29 @@ def get_by_openalex_id(self, *, openalex_id: str) -> Optional[Work]: Raises: SQLAlchemyError: If a database error occurs during the query. """ - if not openalex_id: return None # Avoid querying with empty ID. 
+ if not openalex_id: + return None # Avoid querying with empty ID. logger.debug(f"Getting Work by OpenAlex ID: {openalex_id}") if not self.db.is_active: - logger.warning(f"Session is inactive in get_by_openalex_id for Work OA ID {openalex_id}") + logger.warning( + f"Session is inactive in get_by_openalex_id for Work OA ID {openalex_id}" + ) return None try: # Query based on the OpenAlex ID. Indexing is essential here too. - return self.db.query(self.model).filter(self.model.openalex_id == openalex_id).first() + return ( + self.db.query(self.model) + .filter(self.model.openalex_id == openalex_id) + .first() + ) except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_by_openalex_id for Work {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_by_openalex_id for Work {openalex_id}: {e}", + exc_info=True, + ) raise - def get_or_create_by_doi( - self, *, doi: str, obj_in_data: Dict[str, Any] - ) -> Work: + def get_or_create_by_doi(self, *, doi: str, obj_in_data: Dict[str, Any]) -> Work: """ Retrieves or creates a Work, prioritizing the DOI. @@ -131,10 +143,14 @@ def get_or_create_by_doi( SQLAlchemyError: If any database operation fails. """ if not doi: - raise ValueError("DOI cannot be empty for Work get_or_create_by_doi") + raise ValueError("DOI cannot be empty for Work get_or_create_by_doi") if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_doi for Work.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_doi for Work." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First by DOI --- @@ -142,42 +158,54 @@ def get_or_create_by_doi( if db_obj: # --- Step 2a: Found by DOI - Update Check --- - logger.debug(f"Found existing Work by DOI {doi} (DB ID: {db_obj.id}). 
Checking for updates.") + logger.debug( + f"Found existing Work by DOI {doi} (DB ID: {db_obj.id}). Checking for updates." + ) updated = False new_oa_id = obj_in_data.get("openalex_id") # Update OpenAlex ID if provided and different, checking for conflicts. if new_oa_id and db_obj.openalex_id != new_oa_id: - if not self.db.is_active: # Re-check session - raise RuntimeError("Session inactive before OA ID conflict check.") - existing_oa_work = self.get_by_openalex_id(openalex_id=new_oa_id) - if existing_oa_work and existing_oa_work.id != db_obj.id: - # Log conflict, skip OA ID update. - logger.warning( - f"Cannot update OA ID for Work DOI {doi} (DB ID {db_obj.id}) to {new_oa_id} " - f"because it's already assigned to Work DB ID {existing_oa_work.id}. Skipping OA ID update." - ) - else: - logger.info(f"Updating OA ID for Work {db_obj.id} from '{db_obj.openalex_id}' to '{new_oa_id}'") - db_obj.openalex_id = new_oa_id - updated = True + if not self.db.is_active: # Re-check session + raise RuntimeError( + "Session inactive before OA ID conflict check." + ) + existing_oa_work = self.get_by_openalex_id(openalex_id=new_oa_id) + if existing_oa_work and existing_oa_work.id != db_obj.id: + # Log conflict, skip OA ID update. + logger.warning( + f"Cannot update OA ID for Work DOI {doi} (DB ID {db_obj.id}) to {new_oa_id} " + f"because it's already assigned to Work DB ID {existing_oa_work.id}. Skipping OA ID update." + ) + else: + logger.info( + f"Updating OA ID for Work {db_obj.id} from '{db_obj.openalex_id}' to '{new_oa_id}'" + ) + db_obj.openalex_id = new_oa_id + updated = True # Update other fields if provided and different. 
- if obj_in_data.get('title') is not None and db_obj.title != obj_in_data.get('title'): - db_obj.title = obj_in_data['title'] + if obj_in_data.get( + "title" + ) is not None and db_obj.title != obj_in_data.get("title"): + db_obj.title = obj_in_data["title"] updated = True - if obj_in_data.get('cited_by_count') is not None and db_obj.cited_by_count != obj_in_data.get('cited_by_count'): - db_obj.cited_by_count = obj_in_data['cited_by_count'] + if obj_in_data.get( + "cited_by_count" + ) is not None and db_obj.cited_by_count != obj_in_data.get( + "cited_by_count" + ): + db_obj.cited_by_count = obj_in_data["cited_by_count"] updated = True # Add other updatable fields (publication_year, type, etc.)... if updated: - self.db.add(db_obj) # Mark as dirty. + self.db.add(db_obj) # Mark as dirty. logger.info(f"Work {db_obj.id} (found by DOI) marked for update.") # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return the instance found by DOI. + return db_obj # Return the instance found by DOI. else: # --- Step 2b: Not Found by DOI - Check OpenAlex ID --- @@ -187,44 +215,65 @@ def get_or_create_by_doi( db_obj_oa = self.get_by_openalex_id(openalex_id=openalex_id) if db_obj_oa: # --- Step 4: Found by OA ID - Update with DOI --- - logger.warning(f"Work not found by DOI {doi}, but found existing " - f"Work DB ID {db_obj_oa.id} by OA ID {openalex_id}. Attempting to merge/update.") + logger.warning( + f"Work not found by DOI {doi}, but found existing " + f"Work DB ID {db_obj_oa.id} by OA ID {openalex_id}. Attempting to merge/update." + ) updated = False # Update DOI if it was missing or a placeholder. # Assumes placeholders start with 'placeholder/'. Adapt if needed. 
- if db_obj_oa.doi is None or db_obj_oa.doi.startswith('placeholder/'): - logger.info(f"Updating placeholder/missing DOI for Work {db_obj_oa.id} (found by OA ID {openalex_id}) to {doi}") + if db_obj_oa.doi is None or db_obj_oa.doi.startswith( + "placeholder/" + ): + logger.info( + f"Updating placeholder/missing DOI for Work {db_obj_oa.id} (found by OA ID {openalex_id}) to {doi}" + ) db_obj_oa.doi = doi updated = True # Potentially update other fields if they were missing on the OA-found record. - if obj_in_data.get('title') is not None and db_obj_oa.title is None: - db_obj_oa.title = obj_in_data['title'] + if ( + obj_in_data.get("title") is not None + and db_obj_oa.title is None + ): + db_obj_oa.title = obj_in_data["title"] updated = True - if obj_in_data.get('cited_by_count') is not None and db_obj_oa.cited_by_count is None: - db_obj_oa.cited_by_count = obj_in_data['cited_by_count'] + if ( + obj_in_data.get("cited_by_count") is not None + and db_obj_oa.cited_by_count is None + ): + db_obj_oa.cited_by_count = obj_in_data["cited_by_count"] updated = True # Add other fields... if updated: - self.db.add(db_obj_oa) # Mark for update. - logger.info(f"Work {db_obj_oa.id} (found by OA ID) marked for update with DOI {doi}.") + self.db.add(db_obj_oa) # Mark for update. + logger.info( + f"Work {db_obj_oa.id} (found by OA ID) marked for update with DOI {doi}." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj_oa) - return db_obj_oa # Return the instance found by OA ID. + return db_obj_oa # Return the instance found by OA ID. # --- Step 5: Not Found by DOI or OA ID - Create New --- - logger.debug(f"Work DOI {doi} (and OA ID {openalex_id or 'N/A'}) not found. Creating new.") - obj_in_data["doi"] = doi # Ensure DOI is set. - new_obj = self.model(**obj_in_data) # Create instance. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT. - self.db.refresh(new_obj) # Load DB defaults. 
- logger.info(f"Successfully created and flushed new Work DOI {doi} (DB ID: {new_obj.id})") - return new_obj # Return the new instance. + logger.debug( + f"Work DOI {doi} (and OA ID {openalex_id or 'N/A'}) not found. Creating new." + ) + obj_in_data["doi"] = doi # Ensure DOI is set. + new_obj = self.model(**obj_in_data) # Create instance. + self.db.add(new_obj) # Add to session. + self.db.flush() # Send INSERT. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new Work DOI {doi} (DB ID: {new_obj.id})" + ) + return new_obj # Return the new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create_by_doi for Work DOI {doi}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create_by_doi for Work DOI {doi}: {e}", + exc_info=True, + ) # Caller handles rollback. raise @@ -265,10 +314,16 @@ def get_or_create_by_openalex_id( SQLAlchemyError: If any database operation fails. """ if not openalex_id: - raise ValueError("OpenAlex ID cannot be empty for Work get_or_create_by_openalex_id") + raise ValueError( + "OpenAlex ID cannot be empty for Work get_or_create_by_openalex_id" + ) if not self.db.is_active: - logger.error("Session is inactive at start of get_or_create_by_openalex_id for Work.") - raise RuntimeError("Database session is inactive, cannot perform get_or_create.") + logger.error( + "Session is inactive at start of get_or_create_by_openalex_id for Work." + ) + raise RuntimeError( + "Database session is inactive, cannot perform get_or_create." + ) try: # --- Step 1: Query First by OpenAlex ID --- @@ -276,17 +331,27 @@ def get_or_create_by_openalex_id( if db_obj: # --- Step 2a: Found by OA ID - Update Check --- - logger.debug(f"Found existing Work by OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates.") + logger.debug( + f"Found existing Work by OA ID {openalex_id} (DB ID: {db_obj.id}). Checking for updates." 
+ ) updated = False new_doi = obj_in_data.get("doi") # Update DOI if provided and different (or if current is placeholder). # Also checks for conflicts if the new DOI exists elsewhere. - needs_doi_update = new_doi and (db_obj.doi is None or db_obj.doi.startswith('placeholder/') or db_obj.doi != new_doi) + needs_doi_update = new_doi and ( + db_obj.doi is None + or db_obj.doi.startswith("placeholder/") + or db_obj.doi != new_doi + ) if needs_doi_update: - if not self.db.is_active: # Re-check session - raise RuntimeError("Session inactive before DOI conflict check.") - existing_doi_work = self.get_by_doi(doi=new_doi) if new_doi else None # Check only if new_doi is not None + if not self.db.is_active: # Re-check session + raise RuntimeError( + "Session inactive before DOI conflict check." + ) + existing_doi_work = ( + self.get_by_doi(doi=new_doi) if new_doi else None + ) # Check only if new_doi is not None if existing_doi_work and existing_doi_work.id != db_obj.id: # Log conflict, skip DOI update. logger.warning( @@ -294,78 +359,107 @@ def get_or_create_by_openalex_id( f"because it's already assigned to Work DB ID {existing_doi_work.id}. Skipping DOI update." ) else: - logger.info(f"Updating DOI for Work {db_obj.id} from '{db_obj.doi}' to '{new_doi}'") - db_obj.doi = new_doi - updated = True + logger.info( + f"Updating DOI for Work {db_obj.id} from '{db_obj.doi}' to '{new_doi}'" + ) + db_obj.doi = new_doi + updated = True # Update other fields if provided and different. 
- if obj_in_data.get('title') is not None and db_obj.title != obj_in_data.get('title'): - db_obj.title = obj_in_data['title'] + if obj_in_data.get( + "title" + ) is not None and db_obj.title != obj_in_data.get("title"): + db_obj.title = obj_in_data["title"] updated = True - if obj_in_data.get('cited_by_count') is not None and db_obj.cited_by_count != obj_in_data.get('cited_by_count'): - db_obj.cited_by_count = obj_in_data['cited_by_count'] + if obj_in_data.get( + "cited_by_count" + ) is not None and db_obj.cited_by_count != obj_in_data.get( + "cited_by_count" + ): + db_obj.cited_by_count = obj_in_data["cited_by_count"] updated = True # Add other updatable fields ... if updated: - self.db.add(db_obj) # Mark as dirty. + self.db.add(db_obj) # Mark as dirty. logger.info(f"Work {db_obj.id} (found by OA ID) marked for update.") # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj) - return db_obj # Return instance found by OA ID. + return db_obj # Return instance found by OA ID. else: # --- Step 2b: Not Found by OA ID - Check DOI --- doi_to_check = obj_in_data.get("doi") # Only check by DOI if it's provided and isn't a placeholder itself. - if doi_to_check and not doi_to_check.startswith('placeholder/'): + if doi_to_check and not doi_to_check.startswith("placeholder/"): # --- Step 3: Query by DOI --- db_obj_doi = self.get_by_doi(doi=doi_to_check) if db_obj_doi: # --- Step 4: Found by DOI - Update with OA ID --- - logger.warning(f"Work not found by OA ID {openalex_id}, but found existing " - f"Work DB ID {db_obj_doi.id} by DOI {doi_to_check}. Attempting to merge/update.") + logger.warning( + f"Work not found by OA ID {openalex_id}, but found existing " + f"Work DB ID {db_obj_doi.id} by DOI {doi_to_check}. Attempting to merge/update." + ) updated = False # Add the OpenAlex ID if it was missing. 
if not db_obj_doi.openalex_id: - logger.info(f"Updating missing OA ID for Work {db_obj_doi.id} (found by DOI {doi_to_check}) to {openalex_id}") + logger.info( + f"Updating missing OA ID for Work {db_obj_doi.id} (found by DOI {doi_to_check}) to {openalex_id}" + ) db_obj_doi.openalex_id = openalex_id updated = True # Potentially update other fields if missing. - if obj_in_data.get('title') is not None and db_obj_doi.title is None: - db_obj_doi.title = obj_in_data['title'] + if ( + obj_in_data.get("title") is not None + and db_obj_doi.title is None + ): + db_obj_doi.title = obj_in_data["title"] updated = True - if obj_in_data.get('cited_by_count') is not None and db_obj_doi.cited_by_count is None: - db_obj_doi.cited_by_count = obj_in_data['cited_by_count'] + if ( + obj_in_data.get("cited_by_count") is not None + and db_obj_doi.cited_by_count is None + ): + db_obj_doi.cited_by_count = obj_in_data["cited_by_count"] updated = True # Add other fields ... if updated: - self.db.add(db_obj_doi) # Mark for update. - logger.info(f"Work {db_obj_doi.id} (found by DOI) marked for update with OA ID {openalex_id}.") + self.db.add(db_obj_doi) # Mark for update. + logger.info( + f"Work {db_obj_doi.id} (found by DOI) marked for update with OA ID {openalex_id}." + ) # Optional: Flush and refresh. # self.db.flush() # self.db.refresh(db_obj_doi) - return db_obj_doi # Return instance found by DOI. + return db_obj_doi # Return instance found by DOI. # --- Step 5: Not Found by OA ID or valid DOI - Create New --- - logger.debug(f"Work OA ID {openalex_id} (and DOI {doi_to_check or 'N/A'}) not found. Creating new.") - obj_in_data["openalex_id"] = openalex_id # Ensure OA ID is set. + logger.debug( + f"Work OA ID {openalex_id} (and DOI {doi_to_check or 'N/A'}) not found. Creating new." + ) + obj_in_data["openalex_id"] = openalex_id # Ensure OA ID is set. # Assign a placeholder DOI if a real DOI wasn't provided in the input data. 
if "doi" not in obj_in_data or not obj_in_data["doi"]: # Generate a predictable placeholder based on the OpenAlex ID. placeholder_doi = f"placeholder/oa_{openalex_id}" obj_in_data["doi"] = placeholder_doi - logger.info(f"Assigning placeholder DOI '{placeholder_doi}' for new Work OA ID {openalex_id}") - - new_obj = self.model(**obj_in_data) # Create instance. - self.db.add(new_obj) # Add to session. - self.db.flush() # Send INSERT. - self.db.refresh(new_obj) # Load DB defaults. - logger.info(f"Successfully created and flushed new Work OA ID {openalex_id} (DB ID: {new_obj.id}) with DOI '{new_obj.doi}'") - return new_obj # Return new instance. + logger.info( + f"Assigning placeholder DOI '{placeholder_doi}' for new Work OA ID {openalex_id}" + ) + + new_obj = self.model(**obj_in_data) # Create instance. + self.db.add(new_obj) # Add to session. + self.db.flush() # Send INSERT. + self.db.refresh(new_obj) # Load DB defaults. + logger.info( + f"Successfully created and flushed new Work OA ID {openalex_id} (DB ID: {new_obj.id}) with DOI '{new_obj.doi}'" + ) + return new_obj # Return new instance. except SQLAlchemyError as e: - logger.error(f"SQLAlchemyError during get_or_create_by_openalex_id for Work OA ID {openalex_id}: {e}", exc_info=True) + logger.error( + f"SQLAlchemyError during get_or_create_by_openalex_id for Work OA ID {openalex_id}: {e}", + exc_info=True, + ) # Caller handles rollback. 
- raise \ No newline at end of file + raise diff --git a/backend/external/__init__.py b/backend/external/__init__.py index d9b1ffc..e5eb983 100644 --- a/backend/external/__init__.py +++ b/backend/external/__init__.py @@ -12,4 +12,4 @@ "RateLimitError", "GitHubClient", "OpenAlexClient", -] \ No newline at end of file +] diff --git a/backend/external/client_base.py b/backend/external/client_base.py index 30b946e..0bc8471 100644 --- a/backend/external/client_base.py +++ b/backend/external/client_base.py @@ -12,7 +12,7 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from typing import Optional, Dict, Any, Tuple, List, Union # Added Union +from typing import Optional, Dict, Any, Tuple, List, Union # Added Union # Ensure settings are imported to access config like OPENALEX_EMAIL # This also ensures dotenv is loaded if settings module does it @@ -20,6 +20,7 @@ logger = logging.getLogger(__name__) + # --- Custom Exception Classes --- class ApiClientError(Exception): """ @@ -30,10 +31,12 @@ class ApiClientError(Exception): request. It may optionally include the HTTP status code if the error originated from an HTTP response. """ + def __init__(self, message: str, status_code: Optional[int] = None): super().__init__(message) self.status_code = status_code + class RateLimitError(ApiClientError): """ Specific exception raised when an API rate limit (HTTP 429) is encountered @@ -43,6 +46,7 @@ class RateLimitError(ApiClientError): retry_after: The suggested wait time in seconds provided by the API's 'Retry-After' header, if available. """ + def __init__(self, message: str, retry_after: Optional[int] = None): super().__init__(message, status_code=429) self.retry_after = retry_after @@ -62,14 +66,15 @@ class ClientBase: Subclasses should inherit from `ClientBase` to leverage this common infrastructure for interacting with specific external APIs. 
""" + def __init__( self, base_url: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Union[float, Tuple[float, float]] = (10, 30), # connect, read - retries: int = 3, # Retries for connection/server errors + timeout: Union[float, Tuple[float, float]] = (10, 30), # connect, read + retries: int = 3, # Retries for connection/server errors backoff_factor: float = 0.5, - status_forcelist: Optional[List[int]] = None + status_forcelist: Optional[List[int]] = None, ): """ Initializes the base client and its session. @@ -96,8 +101,8 @@ def __init__( [500, 502, 503, 504]. """ # Base URL is optional now, can be provided per request or rely on endpoint being full URL - self.base_url = base_url.rstrip('/') if base_url else None - self.settings = settings # Access loaded settings instance + self.base_url = base_url.rstrip("/") if base_url else None + self.settings = settings # Access loaded settings instance self.default_timeout = timeout self.default_headers = { "User-Agent": f"MOSS Bot (Map of Open Source Science; mailto:{self.settings.OPENALEX_EMAIL or 'not-set'}) / Python Requests", @@ -106,17 +111,29 @@ def __init__( self.default_headers.update(headers) # Configure retries for connection/server errors (NOT 429) - self.status_forcelist = status_forcelist if status_forcelist is not None else [500, 502, 503, 504] + self.status_forcelist = ( + status_forcelist if status_forcelist is not None else [500, 502, 503, 504] + ) self.retries_config = Retry( total=retries, backoff_factor=backoff_factor, status_forcelist=self.status_forcelist, - allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"], # Retry on these methods for server errors - respect_retry_after_header=True # Good practice for non-429 retries + allowed_methods=[ + "HEAD", + "GET", + "POST", + "PUT", + "DELETE", + "OPTIONS", + "TRACE", + ], # Retry on these methods for server errors + respect_retry_after_header=True, # Good practice for non-429 retries ) self.session = 
self._create_session() - logger.info(f"{self.__class__.__name__} initialized for base URL: {self.base_url or 'Not Set'}") + logger.info( + f"{self.__class__.__name__} initialized for base URL: {self.base_url or 'Not Set'}" + ) def _create_session(self) -> requests.Session: """ @@ -134,7 +151,9 @@ def _create_session(self) -> requests.Session: session.mount("https://", adapter) session.mount("http://", adapter) session.headers.update(self.default_headers) - logger.debug(f"Requests session created with non-429 retry strategy for {self.__class__.__name__}.") + logger.debug( + f"Requests session created with non-429 retry strategy for {self.__class__.__name__}." + ) return session def _request( @@ -142,11 +161,11 @@ def _request( method: str, endpoint: str, params: Optional[Dict[str, Any]] = None, - data: Optional[Dict[str, Any]] = None, # For form data - json: Optional[Dict[str, Any]] = None, # For JSON body + data: Optional[Dict[str, Any]] = None, # For form data + json: Optional[Dict[str, Any]] = None, # For JSON body headers: Optional[Dict[str, str]] = None, timeout: Optional[Union[float, Tuple[float, float]]] = None, - **kwargs # Allow passing extra arguments like 'files' + **kwargs, # Allow passing extra arguments like 'files' ) -> requests.Response: """ Executes an HTTP request with integrated retry logic for rate limits. 
@@ -202,16 +221,19 @@ def _request( request_headers.update(headers) # --- Rate Limit Handling Configuration --- - MAX_429_RETRIES = 4 # Limit how many times *we* retry on 429 internally - INITIAL_429_DELAY = 3 # Initial delay (seconds) after a 429 if no Retry-After - MAX_429_WAIT = 60 # Maximum wait time (seconds) for a single 429 retry delay + MAX_429_RETRIES = 4 # Limit how many times *we* retry on 429 internally + INITIAL_429_DELAY = 3 # Initial delay (seconds) after a 429 if no Retry-After + MAX_429_WAIT = 60 # Maximum wait time (seconds) for a single 429 retry delay # --- End Rate Limit Configuration --- - last_exception: Optional[Exception] = None # Store the last exception encountered + last_exception: Optional[Exception] = ( + None # Store the last exception encountered + ) for attempt in range(MAX_429_RETRIES + 1): - logger.debug(f"Attempt {attempt+1}: {method.upper()} {full_url}") - if params: logger.debug(f"Params: {params}") + logger.debug(f"Attempt {attempt + 1}: {method.upper()} {full_url}") + if params: + logger.debug(f"Params: {params}") try: response = self.session.request( @@ -222,7 +244,7 @@ def _request( json=json, headers=request_headers, timeout=request_timeout, - **kwargs + **kwargs, ) # --- Specific 429 Rate Limit Handling --- @@ -230,7 +252,7 @@ def _request( if attempt < MAX_429_RETRIES: retry_after_str = response.headers.get("Retry-After") # Default wait is exponential backoff - wait_time = INITIAL_429_DELAY * (2 ** attempt) + wait_time = INITIAL_429_DELAY * (2**attempt) if retry_after_str: try: @@ -238,9 +260,13 @@ def _request( wait_time_header = int(retry_after_str) # Use the header value if it's longer than backoff, add buffer wait_time = max(wait_time, wait_time_header) + 1 - logger.info(f"Rate limit hit. Respecting Retry-After: {wait_time_header}s. Waiting ~{wait_time}s.") + logger.info( + f"Rate limit hit. Respecting Retry-After: {wait_time_header}s. Waiting ~{wait_time}s." 
+ ) except (ValueError, TypeError): - logger.warning(f"Could not parse Retry-After header: '{retry_after_str}'. Using exponential backoff ({wait_time:.2f}s).") + logger.warning( + f"Could not parse Retry-After header: '{retry_after_str}'. Using exponential backoff ({wait_time:.2f}s)." + ) # Cap the wait time to avoid excessively long waits wait_time = min(wait_time, MAX_429_WAIT) @@ -251,24 +277,30 @@ def _request( ) time.sleep(wait_time) # Store a dummy exception to indicate a retry occurred - last_exception = requests.exceptions.RetryError(f"Rate limited on attempt {attempt+1}") - continue # Proceed to the next attempt in the 429 retry loop + last_exception = requests.exceptions.RetryError( + f"Rate limited on attempt {attempt + 1}" + ) + continue # Proceed to the next attempt in the 429 retry loop else: # Exceeded internal retries specifically for 429 errors - logger.error(f"Rate limit hit ({response.status_code}) on {method.upper()} {full_url} and exceeded internal retry limit ({MAX_429_RETRIES}). Raising error.") + logger.error( + f"Rate limit hit ({response.status_code}) on {method.upper()} {full_url} and exceeded internal retry limit ({MAX_429_RETRIES}). Raising error." + ) # Use raise_for_status() to create an HTTPError, which will be caught below response.raise_for_status() # If not 429, return the response immediately. # The caller should check response.ok or response.status_code. if response.ok: - logger.debug(f"Request successful: {response.status_code} {method.upper()} {full_url}") + logger.debug( + f"Request successful: {response.status_code} {method.upper()} {full_url}" + ) else: - # Log non-429 client/server errors handled by the caller - logger.warning( + # Log non-429 client/server errors handled by the caller + logger.warning( f"Request returned non-success status (non-429): {response.status_code} {method.upper()} {full_url}. 
" f"Response snippet: {response.text[:200]}" - ) + ) # Return the response regardless of non-429 status code; caller decides how to handle. return response @@ -277,25 +309,31 @@ def _request( # 1. Connection errors, timeouts etc., *after* the session's # Retry mechanism (configured by self.retries_config) is exhausted. # 2. The HTTPError explicitly raised above if MAX_429_RETRIES was exceeded. - logger.error(f"Request failed for {method.upper()} {full_url} after all retries (Session or internal 429): {e}", exc_info=False) # Log only message unless debugging - logger.debug(f"Underlying exception detail for failed request:", exc_info=True) # Full trace on debug - last_exception = e # Store the actual exception + logger.error( + f"Request failed for {method.upper()} {full_url} after all retries (Session or internal 429): {e}", + exc_info=False, + ) # Log only message unless debugging + logger.debug( + "Underlying exception detail for failed request:", exc_info=True + ) # Full trace on debug + last_exception = e # Store the actual exception # Break the loop, we will raise ApiClientError outside based on last_exception break except Exception as e: # Catch any other unexpected errors during request setup or execution - logger.exception(f"Unexpected error during request: {method.upper()} {full_url}") + logger.exception( + f"Unexpected error during request: {method.upper()} {full_url}" + ) last_exception = e - break # Exit loop on unexpected error + break # Exit loop on unexpected error # If the loop completed without returning a response (i.e., hit break after an exception) # Raise a consistent ApiClientError, wrapping the last encountered exception. 
err_msg = f"Request failed for {method.upper()} {full_url} after all retries: {last_exception}" - status_code = getattr(last_exception, 'response', None) - status_code = getattr(status_code, 'status_code', None) if status_code else None + status_code = getattr(last_exception, "response", None) + status_code = getattr(status_code, "status_code", None) if status_code else None raise ApiClientError(err_msg, status_code=status_code) from last_exception - def _construct_url(self, endpoint: str) -> str: """ Constructs the full URL for an API request. @@ -313,16 +351,22 @@ def _construct_url(self, endpoint: str) -> str: Raises: ValueError: If the endpoint is relative and `base_url` is not set. """ - if endpoint.lower().startswith(('http://', 'https://')): + if endpoint.lower().startswith(("http://", "https://")): return endpoint if not self.base_url: - logger.error(f"Cannot construct full URL for relative endpoint '{endpoint}' because client base_url is not configured.") - raise ValueError(f"Endpoint '{endpoint}' is not a full URL and no base_url is configured for this client.") + logger.error( + f"Cannot construct full URL for relative endpoint '{endpoint}' because client base_url is not configured." + ) + raise ValueError( + f"Endpoint '{endpoint}' is not a full URL and no base_url is configured for this client." + ) # Ensure there's exactly one slash between base_url and endpoint return f"{self.base_url.rstrip('/')}/{endpoint.lstrip('/')}" # --- Convenience Methods --- - def get(self, endpoint: str, params: Optional[Dict[str, Any]] = None, **kwargs) -> requests.Response: + def get( + self, endpoint: str, params: Optional[Dict[str, Any]] = None, **kwargs + ) -> requests.Response: """ Performs an HTTP GET request. 
@@ -339,7 +383,13 @@ def get(self, endpoint: str, params: Optional[Dict[str, Any]] = None, **kwargs) """ return self._request("GET", endpoint, params=params, **kwargs) - def post(self, endpoint: str, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, **kwargs) -> requests.Response: + def post( + self, + endpoint: str, + data: Optional[Dict[str, Any]] = None, + json: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> requests.Response: """ Performs an HTTP POST request. @@ -357,4 +407,4 @@ def post(self, endpoint: str, data: Optional[Dict[str, Any]] = None, json: Optio """ return self._request("POST", endpoint, data=data, json=json, **kwargs) - # Add other convenience methods (put, delete, patch, head, options) as needed. \ No newline at end of file + # Add other convenience methods (put, delete, patch, head, options) as needed. diff --git a/backend/external/github_client.py b/backend/external/github_client.py index 2ef319a..c747953 100644 --- a/backend/external/github_client.py +++ b/backend/external/github_client.py @@ -13,7 +13,7 @@ import base64 import binascii import requests -import re # Used for parsing Link headers +import re # Used for parsing Link headers from typing import Optional, List, Dict, Any, Tuple # Import base client and custom errors @@ -21,6 +21,7 @@ logger = logging.getLogger(__name__) + class GitHubClient(ClientBase): """ Client for the GitHub REST API v3. @@ -31,6 +32,7 @@ class GitHubClient(ClientBase): Leverages `ClientBase` for underlying request execution, retries, and rate limit handling. """ + def __init__(self): """ Initializes the GitHubClient. @@ -44,17 +46,21 @@ def __init__(self): super().__init__(base_url="https://api.github.com") self.token = self.settings.GITHUB_API_TOKEN if not self.token: - logger.error("GITHUB_API_TOKEN is not configured in settings. GitHubClient requires a token.") + logger.error( + "GITHUB_API_TOKEN is not configured in settings. GitHubClient requires a token." 
+ ) raise ValueError("GitHub API token is required but not set.") # Prepare authentication and API version headers for GitHub requests self.auth_headers = { "Authorization": f"Bearer {self.token}", - "Accept": "application/vnd.github.v3+json", # Request standard JSON format - "X-GitHub-Api-Version": "2022-11-28", # Pin to a specific API version + "Accept": "application/vnd.github.v3+json", # Request standard JSON format + "X-GitHub-Api-Version": "2022-11-28", # Pin to a specific API version } logger.info("GitHubClient initialized successfully.") - def _parse_link_header(self, headers: requests.structures.CaseInsensitiveDict) -> Dict[str, str]: + def _parse_link_header( + self, headers: requests.structures.CaseInsensitiveDict + ) -> Dict[str, str]: """ Parses the 'Link' HTTP header returned by GitHub API pagination responses. @@ -73,10 +79,10 @@ def _parse_link_header(self, headers: requests.structures.CaseInsensitiveDict) - if the 'Link' header is not present or cannot be parsed. """ links = {} - link_header = headers.get('Link') + link_header = headers.get("Link") if link_header: # Split the header into individual link parts (separated by commas) - parts = link_header.split(',') + parts = link_header.split(",") for part in parts: # Use regex to extract the URL and the relation type ('rel') match = re.match(r'<\s*(.*?)\s*>;\s*rel="?(\w+)"?', part.strip()) @@ -85,7 +91,9 @@ def _parse_link_header(self, headers: requests.structures.CaseInsensitiveDict) - links[rel] = url return links - def _fetch_paginated_results(self, endpoint: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: + def _fetch_paginated_results( + self, endpoint: str, params: Optional[Dict[str, Any]] = None + ) -> List[Dict[str, Any]]: """ Retrieves all results from a paginated GitHub API endpoint. 
@@ -107,7 +115,7 @@ def _fetch_paginated_results(self, endpoint: str, params: Optional[Dict[str, Any """ if params is None: params = {} - params["per_page"] = 100 # Request the maximum number of items per page + params["per_page"] = 100 # Request the maximum number of items per page all_items: List[Dict[str, Any]] = [] # Start with the initial endpoint URL constructed from the base URL @@ -115,7 +123,9 @@ def _fetch_paginated_results(self, endpoint: str, params: Optional[Dict[str, Any page_num = 1 while current_url: - logger.debug(f"Fetching page {page_num} for endpoint '{endpoint}' from URL: {current_url}") + logger.debug( + f"Fetching page {page_num} for endpoint '{endpoint}' from URL: {current_url}" + ) try: # Make the request. For subsequent pages (page_num > 1), # current_url is an absolute URL from the Link header, so pass @@ -125,58 +135,90 @@ def _fetch_paginated_results(self, endpoint: str, params: Optional[Dict[str, Any "GET", current_url, params=params if page_num == 1 else None, - headers=self.auth_headers + headers=self.auth_headers, ) # Handle specific non-OK statuses during pagination if response.status_code == 404: - logger.warning(f"Endpoint not found (404) during pagination: {current_url}. Stopping pagination.") - break # Stop if the resource disappears mid-fetch + logger.warning( + f"Endpoint not found (404) during pagination: {current_url}. Stopping pagination." + ) + break # Stop if the resource disappears mid-fetch # Let ClientBase._request handle retries for 429/5xx. # If we get here and it's not OK, it's likely a persistent issue. elif not response.ok: - logger.error(f"GitHub API error fetching paginated results (page {page_num}, URL: {current_url}). Status: {response.status_code}, Response: {response.text[:200]}") + logger.error( + f"GitHub API error fetching paginated results (page {page_num}, URL: {current_url}). 
Status: {response.status_code}, Response: {response.text[:200]}" + ) # Raise an error to signal failure to the caller - raise ApiClientError(f"Failed to fetch page {page_num} from {endpoint}", status_code=response.status_code) + raise ApiClientError( + f"Failed to fetch page {page_num} from {endpoint}", + status_code=response.status_code, + ) try: page_data = response.json() # Expect a list of items from paginated endpoints if not isinstance(page_data, list): - logger.error(f"Unexpected response format (expected list, got {type(page_data)}) for paginated results: {current_url}. Response: {str(page_data)[:200]}") - raise ApiClientError(f"Unexpected response format from {endpoint}", status_code=response.status_code) + logger.error( + f"Unexpected response format (expected list, got {type(page_data)}) for paginated results: {current_url}. Response: {str(page_data)[:200]}" + ) + raise ApiClientError( + f"Unexpected response format from {endpoint}", + status_code=response.status_code, + ) all_items.extend(page_data) - logger.debug(f"Fetched {len(page_data)} items on page {page_num}. Total items so far: {len(all_items)}") + logger.debug( + f"Fetched {len(page_data)} items on page {page_num}. Total items so far: {len(all_items)}" + ) # Parse the Link header to find the URL for the next page links = self._parse_link_header(response.headers) - current_url = links.get("next") # Will be None if no 'next' link + current_url = links.get("next") # Will be None if no 'next' link if current_url: page_num += 1 else: - logger.debug(f"No 'next' link found. Reached end of results for {endpoint}.") + logger.debug( + f"No 'next' link found. Reached end of results for {endpoint}." 
+ ) except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to decode JSON response from {current_url} (page {page_num}): {json_err}", exc_info=True) - raise ApiClientError(f"Failed to decode JSON from {endpoint}", status_code=response.status_code) from json_err + logger.error( + f"Failed to decode JSON response from {current_url} (page {page_num}): {json_err}", + exc_info=True, + ) + raise ApiClientError( + f"Failed to decode JSON from {endpoint}", + status_code=response.status_code, + ) from json_err except ApiClientError as e: - # Propagate API client errors (connection, timeout after retries, etc.) - logger.error(f"API Client error during pagination for {endpoint} (page {page_num}): {e}") - raise e + # Propagate API client errors (connection, timeout after retries, etc.) + logger.error( + f"API Client error during pagination for {endpoint} (page {page_num}): {e}" + ) + raise e except Exception as e: - # Catch any other unexpected errors during the loop - logger.exception(f"Unexpected error during pagination fetch for {endpoint} (page {page_num})") - # Wrap in ApiClientError for consistent error handling upstream - raise ApiClientError(f"Unexpected error during pagination for {endpoint}: {e}") from e - - logger.info(f"Finished fetching paginated results for {endpoint}. Total items retrieved: {len(all_items)}") + # Catch any other unexpected errors during the loop + logger.exception( + f"Unexpected error during pagination fetch for {endpoint} (page {page_num})" + ) + # Wrap in ApiClientError for consistent error handling upstream + raise ApiClientError( + f"Unexpected error during pagination for {endpoint}: {e}" + ) from e + + logger.info( + f"Finished fetching paginated results for {endpoint}. 
Total items retrieved: {len(all_items)}" + ) return all_items - def get_repository_metadata(self, owner: str, repo: str) -> Optional[Dict[str, Any]]: + def get_repository_metadata( + self, owner: str, repo: str + ) -> Optional[Dict[str, Any]]: """ Fetches metadata for a specific GitHub repository. @@ -204,26 +246,40 @@ def get_repository_metadata(self, owner: str, repo: str) -> Optional[Dict[str, A logger.warning(f"Repository not found: {owner}/{repo} (404)") return None elif response.status_code == 403: - logger.error(f"Access forbidden for repository: {owner}/{repo} (403). Check token permissions or rate limits.") - # Raise a specific error for auth/permission issues - raise ApiClientError(f"Access forbidden for repository {owner}/{repo} (403). Check token permissions.", status_code=403) + logger.error( + f"Access forbidden for repository: {owner}/{repo} (403). Check token permissions or rate limits." + ) + # Raise a specific error for auth/permission issues + raise ApiClientError( + f"Access forbidden for repository {owner}/{repo} (403). Check token permissions.", + status_code=403, + ) elif not response.ok: # Log other non-404, non-403 errors but return None for now - logger.error(f"Failed to get repository metadata for {owner}/{repo}. Status: {response.status_code}, Response: {response.text[:200]}") - return None # Or consider raising ApiClientError for unexpected non-ok statuses + logger.error( + f"Failed to get repository metadata for {owner}/{repo}. 
Status: {response.status_code}, Response: {response.text[:200]}" + ) + return None # Or consider raising ApiClientError for unexpected non-ok statuses # Attempt to parse JSON only if the request was successful return response.json() except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to decode JSON response for {owner}/{repo} metadata: {json_err}", exc_info=True) - return None # Return None on decode error - except ApiClientError: # Catch client errors raised by _request or the 403 block - raise # Re-raise client errors - except Exception as e: - # Catch any other unexpected errors during processing - logger.exception(f"Unexpected error processing repository metadata for {owner}/{repo}") - raise # Re-raise unexpected errors + logger.error( + f"Failed to decode JSON response for {owner}/{repo} metadata: {json_err}", + exc_info=True, + ) + return None # Return None on decode error + except ( + ApiClientError + ): # Catch client errors raised by _request or the 403 block + raise # Re-raise client errors + except Exception: + # Catch any other unexpected errors during processing + logger.exception( + f"Unexpected error processing repository metadata for {owner}/{repo}" + ) + raise # Re-raise unexpected errors def get_contributors(self, owner: str, repo: str) -> List[Dict[str, Any]]: """ @@ -250,33 +306,51 @@ def get_contributors(self, owner: str, repo: str) -> List[Dict[str, Any]]: endpoint = f"/repos/{owner}/{repo}/contributors" # Parameters to fetch maximum per page and exclude anonymous contributors params = {"per_page": 100, "anon": "false"} - logger.info(f"Fetching contributors (first page) for repository: {owner}/{repo}") + logger.info( + f"Fetching contributors (first page) for repository: {owner}/{repo}" + ) try: # Fetch only the first page for now - response = self._request("GET", endpoint, headers=self.auth_headers, params=params) + response = self._request( + "GET", endpoint, headers=self.auth_headers, params=params + ) if 
response.status_code == 404: - logger.warning(f"Repository not found when fetching contributors: {owner}/{repo} (404)") + logger.warning( + f"Repository not found when fetching contributors: {owner}/{repo} (404)" + ) return [] elif response.status_code == 403: - logger.error(f"Access forbidden for contributors: {owner}/{repo} (403).") - raise ApiClientError(f"Access forbidden for contributors {owner}/{repo} (403). Check token permissions.", status_code=403) + logger.error( + f"Access forbidden for contributors: {owner}/{repo} (403)." + ) + raise ApiClientError( + f"Access forbidden for contributors {owner}/{repo} (403). Check token permissions.", + status_code=403, + ) elif not response.ok: - logger.error(f"Failed to get contributors for {owner}/{repo}. Status: {response.status_code}, Response: {response.text[:200]}") - return [] # Return empty list on other errors for now + logger.error( + f"Failed to get contributors for {owner}/{repo}. Status: {response.status_code}, Response: {response.text[:200]}" + ) + return [] # Return empty list on other errors for now contributors = response.json() # Ensure the response is a list as expected return contributors if isinstance(contributors, list) else [] except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to decode JSON response for {owner}/{repo} contributors: {json_err}", exc_info=True) + logger.error( + f"Failed to decode JSON response for {owner}/{repo} contributors: {json_err}", + exc_info=True, + ) return [] except ApiClientError: - raise # Re-raise client errors - except Exception as e: - logger.exception(f"Unexpected error processing contributors for {owner}/{repo}") - raise # Re-raise unexpected errors + raise # Re-raise client errors + except Exception: + logger.exception( + f"Unexpected error processing contributors for {owner}/{repo}" + ) + raise # Re-raise unexpected errors def get_file_content(self, owner: str, repo: str, path: str) -> Optional[str]: """ @@ -306,42 +380,68 @@ def 
get_file_content(self, owner: str, repo: str, path: str) -> Optional[str]: logger.info(f"Fetching file content for: {owner}/{repo}/{path}") try: # Use a slightly longer timeout for potentially large file content - response = self._request("GET", endpoint, headers=self.auth_headers, timeout=45) + response = self._request( + "GET", endpoint, headers=self.auth_headers, timeout=45 + ) if response.status_code == 404: - logger.warning(f"File or repository not found: {owner}/{repo}/{path} (404)") + logger.warning( + f"File or repository not found: {owner}/{repo}/{path} (404)" + ) return None elif response.status_code == 403: - logger.error(f"Access forbidden for file content: {owner}/{repo}/{path} (403).") - raise ApiClientError(f"Access forbidden for file content {owner}/{repo}/{path} (403).", status_code=403) + logger.error( + f"Access forbidden for file content: {owner}/{repo}/{path} (403)." + ) + raise ApiClientError( + f"Access forbidden for file content {owner}/{repo}/{path} (403).", + status_code=403, + ) elif not response.ok: - # Log other non-404, non-403 errors - logger.error(f"HTTP error {response.status_code} fetching file content for {owner}/{repo}/{path}: {response.text[:200]}") - return None # Return None for now + # Log other non-404, non-403 errors + logger.error( + f"HTTP error {response.status_code} fetching file content for {owner}/{repo}/{path}: {response.text[:200]}" + ) + return None # Return None for now try: file_data = response.json() except requests.exceptions.JSONDecodeError as json_err: # Handle cases where the response is not valid JSON - logger.error(f"Failed to decode JSON response for file {owner}/{repo}/{path}: {json_err}", exc_info=True) - logger.debug(f"Response text causing decode error: {response.text[:500]}") + logger.error( + f"Failed to decode JSON response for file {owner}/{repo}/{path}: {json_err}", + exc_info=True, + ) + logger.debug( + f"Response text causing decode error: {response.text[:500]}" + ) return None # Check if the 
response indicates a directory listing instead of file content - if isinstance(file_data, list) or (isinstance(file_data, dict) and file_data.get('type') == 'dir'): - logger.warning(f"Path provided points to a directory, not a file: {owner}/{repo}/{path}") + if isinstance(file_data, list) or ( + isinstance(file_data, dict) and file_data.get("type") == "dir" + ): + logger.warning( + f"Path provided points to a directory, not a file: {owner}/{repo}/{path}" + ) return None # Ensure the response is a dictionary for file content if not isinstance(file_data, dict): - logger.error(f"Unexpected response format (not a dict/list) for file content: {owner}/{repo}/{path}. Got {type(file_data)}") - return None + logger.error( + f"Unexpected response format (not a dict/list) for file content: {owner}/{repo}/{path}. Got {type(file_data)}" + ) + return None encoding = file_data.get("encoding") - content = file_data.get("content") # Base64 encoded string or potentially null + content = file_data.get( + "content" + ) # Base64 encoded string or potentially null if encoding == "base64": if not content or not isinstance(content, str): - logger.warning(f"Expected base64 content string, but found none or invalid type for {owner}/{repo}/{path}") + logger.warning( + f"Expected base64 content string, but found none or invalid type for {owner}/{repo}/{path}" + ) return None try: # Decode the base64 string into bytes @@ -351,34 +451,49 @@ def get_file_content(self, owner: str, repo: str, path: str) -> Optional[str]: return decoded_bytes.decode("utf-8") except UnicodeDecodeError: # Fallback to latin-1 if UTF-8 fails (common for some legacy files) - logger.warning(f"UTF-8 decoding failed for {owner}/{repo}/{path}. Attempting latin-1 decoding.") + logger.warning( + f"UTF-8 decoding failed for {owner}/{repo}/{path}. Attempting latin-1 decoding." 
+ ) return decoded_bytes.decode("latin-1") except (binascii.Error, ValueError) as decode_error: # Handle errors during base64 decoding itself - logger.error(f"Base64 decoding failed for {owner}/{repo}/{path}: {decode_error}") + logger.error( + f"Base64 decoding failed for {owner}/{repo}/{path}: {decode_error}" + ) # Raise a specific error indicating decoding failure - raise ValueError(f"Failed to decode base64 content for file {path}") from decode_error + raise ValueError( + f"Failed to decode base64 content for file {path}" + ) from decode_error elif content is not None: - # Handle cases where encoding is not base64 (e.g., 'none' or potentially others) - # Treat the content as a plain string if available. - logger.info(f"File {owner}/{repo}/{path} has encoding '{encoding}'. Returning content directly.") - return str(content) + # Handle cases where encoding is not base64 (e.g., 'none' or potentially others) + # Treat the content as a plain string if available. + logger.info( + f"File {owner}/{repo}/{path} has encoding '{encoding}'. Returning content directly." 
+ ) + return str(content) else: - # Handle cases where content is missing or null - logger.warning(f"No content found (encoding: {encoding}) in response for {owner}/{repo}/{path}") - return None + # Handle cases where content is missing or null + logger.warning( + f"No content found (encoding: {encoding}) in response for {owner}/{repo}/{path}" + ) + return None except ApiClientError: - raise # Re-raise client-level errors + raise # Re-raise client-level errors except ValueError as ve: - # Catch the ValueError raised by decoding failure - logger.error(f"Data processing error for file {owner}/{repo}/{path}: {ve}", exc_info=False) - raise ve # Re-raise the specific ValueError - except Exception as e: - # Catch any other unexpected errors - logger.exception(f"Unexpected error fetching file content for {owner}/{repo}/{path}") - raise # Re-raise unexpected errors + # Catch the ValueError raised by decoding failure + logger.error( + f"Data processing error for file {owner}/{repo}/{path}: {ve}", + exc_info=False, + ) + raise ve # Re-raise the specific ValueError + except Exception: + # Catch any other unexpected errors + logger.exception( + f"Unexpected error fetching file content for {owner}/{repo}/{path}" + ) + raise # Re-raise unexpected errors def search_repositories( self, query: str, max_results: int = 1000 @@ -414,19 +529,23 @@ def search_repositories( endpoint = "/search/repositories" page = 1 - per_page = 100 # Use max allowed per page by GitHub API + per_page = 100 # Use max allowed per page by GitHub API all_items = [] total_count = 0 # GitHub Search API limitation: only first 1000 results are accessible github_max_results = 1000 # Calculate max pages needed based on GitHub limit, not total_count - max_pages = (github_max_results + per_page - 1) // per_page # Typically 10 pages + max_pages = ( + github_max_results + per_page - 1 + ) // per_page # Typically 10 pages # Adjust max_results if it exceeds the GitHub limit effective_max_results = min(max_results, 
github_max_results) - logger.info(f"Searching repositories with query: '{query}'. Target results: {max_results}, Effective limit: {effective_max_results}") + logger.info( + f"Searching repositories with query: '{query}'. Target results: {max_results}, Effective limit: {effective_max_results}" + ) - next_url: Optional[str] = None # Store the next page URL from Link header + next_url: Optional[str] = None # Store the next page URL from Link header # Loop until we reach the desired number of results, the GitHub limit, # or run out of pages. @@ -434,7 +553,7 @@ def search_repositories( # Prepare parameters only for the first request or if not using next_url params = None if not next_url: - params = { + params = { "q": query, "page": page, "per_page": per_page, @@ -442,30 +561,43 @@ def search_repositories( # Use the absolute URL from 'next' link if available, otherwise use the base endpoint current_url = next_url if next_url else self._construct_url(endpoint) - request_endpoint = next_url if next_url else endpoint # Use for logging clarity + request_endpoint = ( + next_url if next_url else endpoint + ) # Use for logging clarity - logger.debug(f"Fetching search results page {page} for query '{query}' (URL: {current_url})") + logger.debug( + f"Fetching search results page {page} for query '{query}' (URL: {current_url})" + ) try: response = self._request( "GET", - request_endpoint, # Pass relative endpoint or absolute URL - params=params, # Pass params only if not using next_url - headers=self.auth_headers + request_endpoint, # Pass relative endpoint or absolute URL + params=params, # Pass params only if not using next_url + headers=self.auth_headers, ) # Handle specific error codes for search API if response.status_code == 403: # Could be rate limits, token issues, or abuse detection - logger.error(f"Access forbidden (403) during repository search (page {page}, query='{query}'). 
Check token, rate limits, or potential abuse flags.") - raise ApiClientError(f"Access forbidden during repository search (page {page}).", status_code=403) + logger.error( + f"Access forbidden (403) during repository search (page {page}, query='{query}'). Check token, rate limits, or potential abuse flags." + ) + raise ApiClientError( + f"Access forbidden during repository search (page {page}).", + status_code=403, + ) elif response.status_code == 422: # Often indicates an invalid or unprocessable search query - logger.error(f"Unprocessable search query '{query}' (page {page}). Status: 422. Response: {response.text[:200]}") - return None # Cannot proceed with an invalid query + logger.error( + f"Unprocessable search query '{query}' (page {page}). Status: 422. Response: {response.text[:200]}" + ) + return None # Cannot proceed with an invalid query elif not response.ok: # Handle other unexpected non-ok statuses - logger.error(f"GitHub API error searching repositories (page {page}, query='{query}'). Status: {response.status_code}, Response: {response.text[:200]}") + logger.error( + f"GitHub API error searching repositories (page {page}, query='{query}'). Status: {response.status_code}, Response: {response.text[:200]}" + ) # Fail the search for now, could potentially return partial results return None @@ -475,58 +607,81 @@ def search_repositories( # Validate the structure of the response if not isinstance(page_items, list): - logger.error(f"Unexpected 'items' format in search response (page {page}, expected list, got {type(page_items)}).") - return None # Cannot process invalid format + logger.error( + f"Unexpected 'items' format in search response (page {page}, expected list, got {type(page_items)})." 
+ ) + return None # Cannot process invalid format # Get the total count from the first page's response only if page == 1: total_count = data.get("total_count", 0) incomplete_results = data.get("incomplete_results", False) - logger.info(f"GitHub reported total_count: {total_count} for query '{query}'. Incomplete results: {incomplete_results}") + logger.info( + f"GitHub reported total_count: {total_count} for query '{query}'. Incomplete results: {incomplete_results}" + ) # Check if total_count exceeds GitHub's accessible limit if total_count > github_max_results: - logger.warning(f"Query '{query}' has {total_count} results, but GitHub API only allows access to the first {github_max_results}.") + logger.warning( + f"Query '{query}' has {total_count} results, but GitHub API only allows access to the first {github_max_results}." + ) # Add items from the current page, respecting the effective_max_results limit num_needed = effective_max_results - len(all_items) items_to_add = page_items[:num_needed] all_items.extend(items_to_add) - logger.debug(f"Fetched {len(page_items)} items on page {page}. Added {len(items_to_add)}. Total items collected: {len(all_items)}") + logger.debug( + f"Fetched {len(page_items)} items on page {page}. Added {len(items_to_add)}. Total items collected: {len(all_items)}" + ) # Check if we've reached the limit if len(all_items) >= effective_max_results: - logger.info(f"Reached effective result limit ({effective_max_results} items). Stopping pagination.") - break # Exit loop + logger.info( + f"Reached effective result limit ({effective_max_results} items). Stopping pagination." + ) + break # Exit loop # --- Pagination Logic --- links = self._parse_link_header(response.headers) next_url = links.get("next") if not next_url: - logger.debug("No 'next' link found in header. Reached end of accessible results.") - break # Exit loop if no more pages are available + logger.debug( + "No 'next' link found in header. Reached end of accessible results." 
+ ) + break # Exit loop if no more pages are available - page += 1 # Increment page number for the next iteration + page += 1 # Increment page number for the next iteration except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to decode JSON search response (page {page}): {json_err}", exc_info=True) - return None # Cannot proceed if JSON is invalid + logger.error( + f"Failed to decode JSON search response (page {page}): {json_err}", + exc_info=True, + ) + return None # Cannot proceed if JSON is invalid except ApiClientError as api_err: # Propagate client-level errors (connection, timeout, 403, etc.) - logger.error(f"API client error during search pagination (page {page}): {api_err}") + logger.error( + f"API client error during search pagination (page {page}): {api_err}" + ) raise - except Exception as e: + except Exception: # Catch any other unexpected errors - logger.exception(f"Unexpected error during search pagination (page {page})") - raise # Propagate unexpected errors + logger.exception( + f"Unexpected error during search pagination (page {page})" + ) + raise # Propagate unexpected errors - logger.info(f"Finished repository search for '{query}'. Fetched {len(all_items)} items across {page if not next_url else page-1} pages. GitHub total count: {total_count}.") + logger.info( + f"Finished repository search for '{query}'. Fetched {len(all_items)} items across {page if not next_url else page - 1} pages. GitHub total count: {total_count}." + ) # Return the aggregated list and the total count reported by GitHub return all_items, total_count - def get_pull_requests(self, owner: str, repo: str, state: str = 'all', per_page: int = 100) -> List[Dict[str, Any]]: + def get_pull_requests( + self, owner: str, repo: str, state: str = "all", per_page: int = 100 + ) -> List[Dict[str, Any]]: """ Fetches pull requests for a repository, handling pagination. 
@@ -546,7 +701,12 @@ def get_pull_requests(self, owner: str, repo: str, state: str = 'all', per_page: if not owner or not repo: raise ValueError("Owner and repository name cannot be empty.") endpoint = f"/repos/{owner}/{repo}/pulls" - params = {"state": state, "per_page": per_page, "sort": "created", "direction": "desc"} + params = { + "state": state, + "per_page": per_page, + "sort": "created", + "direction": "desc", + } logger.info(f"Fetching pull requests for {owner}/{repo} (state={state})...") try: # Use the generic pagination helper @@ -555,9 +715,11 @@ def get_pull_requests(self, owner: str, repo: str, state: str = 'all', per_page: except ApiClientError as e: # Log the error specific to this operation before re-raising logger.error(f"Failed to fetch pull requests for {owner}/{repo}: {e}") - raise e # Re-raise the error for upstream handling + raise e # Re-raise the error for upstream handling - def get_issues(self, owner: str, repo: str, state: str = 'all', per_page: int = 100) -> List[Dict[str, Any]]: + def get_issues( + self, owner: str, repo: str, state: str = "all", per_page: int = 100 + ) -> List[Dict[str, Any]]: """ Fetches issues for a repository, handling pagination. 
Note: This fetches both issues and pull requests, as GitHub treats @@ -580,7 +742,12 @@ def get_issues(self, owner: str, repo: str, state: str = 'all', per_page: int = if not owner or not repo: raise ValueError("Owner and repository name cannot be empty.") endpoint = f"/repos/{owner}/{repo}/issues" - params = {"state": state, "per_page": per_page, "sort": "created", "direction": "desc"} + params = { + "state": state, + "per_page": per_page, + "sort": "created", + "direction": "desc", + } logger.info(f"Fetching issues (and PRs) for {owner}/{repo} (state={state})...") try: # Use the generic pagination helper @@ -588,11 +755,17 @@ def get_issues(self, owner: str, repo: str, state: str = 'all', per_page: int = return all_issues except ApiClientError as e: logger.error(f"Failed to fetch issues for {owner}/{repo}: {e}") - raise e # Re-raise the error + raise e # Re-raise the error # --- Methods for Fetching Comments --- - def get_issue_comments(self, owner: str, repo: str, issue_number: Optional[int] = None, per_page: int = 100) -> List[Dict[str, Any]]: + def get_issue_comments( + self, + owner: str, + repo: str, + issue_number: Optional[int] = None, + per_page: int = 100, + ) -> List[Dict[str, Any]]: """ Fetches comments on issues within a repository, handling pagination. @@ -619,7 +792,9 @@ def get_issue_comments(self, owner: str, repo: str, issue_number: Optional[int] if issue_number is not None: # Endpoint for comments on a specific issue endpoint = f"/repos/{owner}/{repo}/issues/{issue_number}/comments" - log_msg = f"Fetching comments for issue #{issue_number} in {owner}/{repo}..." + log_msg = ( + f"Fetching comments for issue #{issue_number} in {owner}/{repo}..." 
+ ) else: # Endpoint for comments across all issues in the repo endpoint = f"/repos/{owner}/{repo}/issues/comments" @@ -634,10 +809,18 @@ def get_issue_comments(self, owner: str, repo: str, issue_number: Optional[int] return all_comments except ApiClientError as e: issue_id = f"issue #{issue_number}" if issue_number else "all issues" - logger.error(f"Failed to fetch issue comments for {owner}/{repo} ({issue_id}): {e}") + logger.error( + f"Failed to fetch issue comments for {owner}/{repo} ({issue_id}): {e}" + ) raise e - def get_pr_review_comments(self, owner: str, repo: str, pull_number: Optional[int] = None, per_page: int = 100) -> List[Dict[str, Any]]: + def get_pr_review_comments( + self, + owner: str, + repo: str, + pull_number: Optional[int] = None, + per_page: int = 100, + ) -> List[Dict[str, Any]]: """ Fetches review comments on pull requests within a repository, handling pagination. @@ -666,7 +849,9 @@ def get_pr_review_comments(self, owner: str, repo: str, pull_number: Optional[in if pull_number is not None: # Endpoint for review comments on a specific PR endpoint = f"/repos/{owner}/{repo}/pulls/{pull_number}/comments" - log_msg = f"Fetching review comments for PR #{pull_number} in {owner}/{repo}..." + log_msg = ( + f"Fetching review comments for PR #{pull_number} in {owner}/{repo}..." 
+ ) else: # Endpoint for review comments across all PRs in the repo endpoint = f"/repos/{owner}/{repo}/pulls/comments" @@ -681,5 +866,7 @@ def get_pr_review_comments(self, owner: str, repo: str, pull_number: Optional[in return all_comments except ApiClientError as e: pr_id = f"PR #{pull_number}" if pull_number else "all PRs" - logger.error(f"Failed to fetch PR review comments for {owner}/{repo} ({pr_id}): {e}") - raise e \ No newline at end of file + logger.error( + f"Failed to fetch PR review comments for {owner}/{repo} ({pr_id}): {e}" + ) + raise e diff --git a/backend/external/openalex_client.py b/backend/external/openalex_client.py index e27f9b6..23d3443 100644 --- a/backend/external/openalex_client.py +++ b/backend/external/openalex_client.py @@ -12,20 +12,19 @@ import logging import urllib.parse import requests -import re # Used in helper function import time from typing import Optional, Dict, Any, List # Import base client and custom errors from .client_base import ClientBase, ApiClientError # Import settings to access OPENALEX_EMAIL -from backend.config.settings import settings # Note: The dependency on ScholarlyProcessingService._get_id_from_oa_url was removed # by duplicating the helper function here. Consider moving the helper to a # common utility module if used elsewhere. logger = logging.getLogger(__name__) + class OpenAlexClient(ClientBase): """ Client for the OpenAlex scholarly data API. @@ -34,6 +33,7 @@ class OpenAlexClient(ClientBase): citing works using the OpenAlex API endpoints. It utilizes the base client's request handling and incorporates the polite pool email if configured. """ + def __init__(self): """ Initializes the OpenAlexClient. @@ -43,12 +43,15 @@ def __init__(self): """ super().__init__(base_url="https://api.openalex.org") if not self.settings.OPENALEX_EMAIL: - logger.warning("OPENALEX_EMAIL is not set in settings. 
Providing an email to OpenAlex is recommended for the polite pool (potentially higher rate limits).") + logger.warning( + "OPENALEX_EMAIL is not set in settings. Providing an email to OpenAlex is recommended for the polite pool (potentially higher rate limits)." + ) else: - logger.info(f"OpenAlexClient initialized. Using email '{self.settings.OPENALEX_EMAIL}' for the polite pool.") + logger.info( + f"OpenAlexClient initialized. Using email '{self.settings.OPENALEX_EMAIL}' for the polite pool." + ) logger.info("OpenAlexClient initialized.") - def _get_id_from_oa_url(self, url: Optional[str]) -> Optional[str]: """ Extracts the OpenAlex ID (e.g., 'W123456789') from a full OpenAlex URL. @@ -60,20 +63,28 @@ def _get_id_from_oa_url(self, url: Optional[str]) -> Optional[str]: The extracted OpenAlex ID string (like 'W123...') if found and valid, otherwise None. """ - if not url or not isinstance(url, str) or not url.startswith("https://openalex.org/"): + if ( + not url + or not isinstance(url, str) + or not url.startswith("https://openalex.org/") + ): return None try: # Get the last part of the URL path - id_part = url.split('/')[-1] + id_part = url.split("/")[-1] # Basic validation: starts with 'W' (for works) followed by digits # TODO: Extend this for other entity types (A, I, C, S, F) if needed. if id_part and id_part[0].isalpha() and id_part[1:].isdigit(): return id_part else: - logger.debug(f"Extracted part '{id_part}' from URL '{url}' does not look like a valid OpenAlex ID.") + logger.debug( + f"Extracted part '{id_part}' from URL '{url}' does not look like a valid OpenAlex ID." 
+ ) except Exception as e: # Catch potential errors during splitting or indexing - logger.warning(f"Error parsing OpenAlex ID from URL '{url}': {e}", exc_info=False) + logger.warning( + f"Error parsing OpenAlex ID from URL '{url}': {e}", exc_info=False + ) return None def resolve_doi_to_work(self, doi: str) -> Optional[Dict[str, Any]]: @@ -100,18 +111,22 @@ def resolve_doi_to_work(self, doi: str) -> Optional[Dict[str, Any]]: try: # DOIs can contain special characters like '/' which need encoding # when used as part of a URL path segment. - encoded_doi = urllib.parse.quote(doi, safe='') # Use quote() for path segments + encoded_doi = urllib.parse.quote( + doi, safe="" + ) # Use quote() for path segments # Construct the endpoint using the DOI resolver format endpoint = f"/works/https://doi.org/{encoded_doi}" except Exception as e: - # Catch potential encoding errors, although unlikely with standard DOIs - logger.error(f"Failed to URL-encode DOI '{doi}': {e}", exc_info=True) - raise ValueError(f"Invalid characters in DOI for URL encoding: {doi}") from e + # Catch potential encoding errors, although unlikely with standard DOIs + logger.error(f"Failed to URL-encode DOI '{doi}': {e}", exc_info=True) + raise ValueError( + f"Invalid characters in DOI for URL encoding: {doi}" + ) from e params = {} # Add email to params for polite pool access if self.settings.OPENALEX_EMAIL: - params["mailto"] = self.settings.OPENALEX_EMAIL + params["mailto"] = self.settings.OPENALEX_EMAIL logger.info(f"Resolving DOI '{doi}' via OpenAlex endpoint: {endpoint}") try: @@ -122,44 +137,54 @@ def resolve_doi_to_work(self, doi: str) -> Optional[Dict[str, Any]]: return None # Check for other non-successful status codes elif not response.ok: - logger.error(f"OpenAlex API error resolving DOI {doi}. Status: {response.status_code}, Response: {response.text[:200]}") + logger.error( + f"OpenAlex API error resolving DOI {doi}. 
Status: {response.status_code}, Response: {response.text[:200]}" + ) # Depending on policy, could raise ApiClientError here or just return None - return None # Fail gracefully for now + return None # Fail gracefully for now # If response is OK, attempt to parse JSON try: work_data = response.json() # Basic validation of the response structure - if work_data and isinstance(work_data, dict) and work_data.get('id'): + if work_data and isinstance(work_data, dict) and work_data.get("id"): # Extract the 'W...' ID from the full ID URL for convenience - oa_id_from_url = self._get_id_from_oa_url(work_data.get('id')) + oa_id_from_url = self._get_id_from_oa_url(work_data.get("id")) if oa_id_from_url: - work_data['openalex_id'] = oa_id_from_url + work_data["openalex_id"] = oa_id_from_url else: - logger.warning(f"Could not extract OpenAlex ID from work ID URL: {work_data.get('id')}") + logger.warning( + f"Could not extract OpenAlex ID from work ID URL: {work_data.get('id')}" + ) return work_data else: - logger.warning(f"Received unexpected or incomplete JSON structure from OpenAlex for DOI {doi}: {str(work_data)[:200]}") + logger.warning( + f"Received unexpected or incomplete JSON structure from OpenAlex for DOI {doi}: {str(work_data)[:200]}" + ) return None except requests.exceptions.JSONDecodeError as json_err: # Handle cases where response status was OK but body is not valid JSON - logger.error(f"Failed to parse JSON response from OpenAlex for DOI {doi} (Status: {response.status_code}): {json_err}", exc_info=True) - logger.debug(f"Response text causing decode error: {response.text[:500]}") + logger.error( + f"Failed to parse JSON response from OpenAlex for DOI {doi} (Status: {response.status_code}): {json_err}", + exc_info=True, + ) + logger.debug( + f"Response text causing decode error: {response.text[:500]}" + ) return None except ApiClientError as e: # Catch errors raised by _request (connection, timeout, retries exceeded) logger.error(f"OpenAlex API client error 
resolving DOI {doi}: {e}") - raise # Re-raise client errors + raise # Re-raise client errors except ValueError as e: # Catch the ValueError from DOI encoding failure logger.error(f"Value error related to DOI {doi}: {e}") - raise # Re-raise value errors - except Exception as e: + raise # Re-raise value errors + except Exception: # Catch any other unexpected errors during the process logger.exception(f"Unexpected error resolving DOI {doi} via OpenAlex") - raise # Re-raise unexpected errors - + raise # Re-raise unexpected errors def get_work_details(self, openalex_id: str) -> Optional[Dict[str, Any]]: """ @@ -181,22 +206,30 @@ def get_work_details(self, openalex_id: str) -> Optional[Dict[str, Any]]: ApiClientError: If the API request fails after retries. Exception: For other unexpected errors. """ - if not openalex_id: raise ValueError("OpenAlex ID cannot be empty.") + if not openalex_id: + raise ValueError("OpenAlex ID cannot be empty.") # Extract the 'W...' part if a full URL is provided if openalex_id.startswith("https://openalex.org/"): - work_id_part = self._get_id_from_oa_url(openalex_id) + work_id_part = self._get_id_from_oa_url(openalex_id) else: - work_id_part = openalex_id + work_id_part = openalex_id # Validate the extracted/provided ID format (basic check) - if not work_id_part or not work_id_part.startswith('W') or not work_id_part[1:].isdigit(): - logger.error(f"Invalid OpenAlex Work ID format provided: '{openalex_id}' (parsed as '{work_id_part}')") - raise ValueError(f"Invalid OpenAlex Work ID format: {openalex_id}") + if ( + not work_id_part + or not work_id_part.startswith("W") + or not work_id_part[1:].isdigit() + ): + logger.error( + f"Invalid OpenAlex Work ID format provided: '{openalex_id}' (parsed as '{work_id_part}')" + ) + raise ValueError(f"Invalid OpenAlex Work ID format: {openalex_id}") endpoint = f"/works/{work_id_part}" params = {} - if self.settings.OPENALEX_EMAIL: params["mailto"] = self.settings.OPENALEX_EMAIL + if 
self.settings.OPENALEX_EMAIL: + params["mailto"] = self.settings.OPENALEX_EMAIL logger.info(f"Fetching full work details for OpenAlex ID: {work_id_part}") try: @@ -206,33 +239,51 @@ def get_work_details(self, openalex_id: str) -> Optional[Dict[str, Any]]: logger.info(f"Work not found in OpenAlex: {work_id_part} (404)") return None elif not response.ok: - logger.error(f"OpenAlex API error getting details for work {work_id_part}. Status: {response.status_code}, Response: {response.text[:200]}") - return None # Fail gracefully + logger.error( + f"OpenAlex API error getting details for work {work_id_part}. Status: {response.status_code}, Response: {response.text[:200]}" + ) + return None # Fail gracefully try: work_data = response.json() # Verify the response contains an ID and it matches the requested ID - if work_data and isinstance(work_data, dict) and work_data.get('id') and work_data['id'].endswith(work_id_part): + if ( + work_data + and isinstance(work_data, dict) + and work_data.get("id") + and work_data["id"].endswith(work_id_part) + ): # Add the extracted 'W...' 
ID for consistency - oa_id_from_url = self._get_id_from_oa_url(work_data.get('id')) - if oa_id_from_url: work_data['openalex_id'] = oa_id_from_url + oa_id_from_url = self._get_id_from_oa_url(work_data.get("id")) + if oa_id_from_url: + work_data["openalex_id"] = oa_id_from_url return work_data else: - logger.warning(f"Received unexpected JSON structure or mismatched ID from OpenAlex for {work_id_part}: {str(work_data)[:200]}") + logger.warning( + f"Received unexpected JSON structure or mismatched ID from OpenAlex for {work_id_part}: {str(work_data)[:200]}" + ) return None except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to parse JSON response from OpenAlex for work {work_id_part} (Status: {response.status_code}): {json_err}", exc_info=True) - logger.debug(f"Response text causing decode error: {response.text[:500]}") + logger.error( + f"Failed to parse JSON response from OpenAlex for work {work_id_part} (Status: {response.status_code}): {json_err}", + exc_info=True, + ) + logger.debug( + f"Response text causing decode error: {response.text[:500]}" + ) return None except ApiClientError as e: - logger.error(f"OpenAlex API client error getting details for work {work_id_part}: {e}") + logger.error( + f"OpenAlex API client error getting details for work {work_id_part}: {e}" + ) raise - except Exception as e: - logger.exception(f"Unexpected error getting details for work {work_id_part} via OpenAlex") + except Exception: + logger.exception( + f"Unexpected error getting details for work {work_id_part} via OpenAlex" + ) raise - def get_citing_works( self, citing_works_url: str, per_page: int = 200, max_results: int = 1000 ) -> Optional[List[Dict[str, Any]]]: @@ -262,8 +313,12 @@ def get_citing_works( ApiClientError: If an API request fails after retries. Exception: For other unexpected errors. 
""" - if not citing_works_url or not citing_works_url.startswith("https://api.openalex.org/works"): - logger.error(f"Invalid or missing citing_works_url provided: '{citing_works_url}'") + if not citing_works_url or not citing_works_url.startswith( + "https://api.openalex.org/works" + ): + logger.error( + f"Invalid or missing citing_works_url provided: '{citing_works_url}'" + ) raise ValueError(f"Invalid citing_works_url provided: {citing_works_url}") all_results: List[Dict[str, Any]] = [] @@ -275,30 +330,41 @@ def get_citing_works( try: # Parse the base URL and existing query parameters from the provided URL parsed_url = urllib.parse.urlparse(citing_works_url) - base_endpoint = parsed_url.path # Should be '/works' - initial_params = urllib.parse.parse_qs(parsed_url.query) # Contains the 'filter' param + base_endpoint = parsed_url.path # Should be '/works' + initial_params = urllib.parse.parse_qs( + parsed_url.query + ) # Contains the 'filter' param except Exception as parse_e: - logger.error(f"Failed to parse provided cited_by_api_url '{citing_works_url}': {parse_e}", exc_info=True) - raise ValueError(f"Could not parse cited_by_api_url: {citing_works_url}") from parse_e + logger.error( + f"Failed to parse provided cited_by_api_url '{citing_works_url}': {parse_e}", + exc_info=True, + ) + raise ValueError( + f"Could not parse cited_by_api_url: {citing_works_url}" + ) from parse_e # Cap per_page at OpenAlex maximum per_page = min(per_page, 200) - logger.info(f"Fetching citing works from URL: {citing_works_url} (max_results={max_results}, per_page={per_page})") + logger.info( + f"Fetching citing works from URL: {citing_works_url} (max_results={max_results}, per_page={per_page})" + ) while processed_count < max_results: # Prepare parameters for the current page request - current_params = initial_params.copy() # Start with base filter params - current_params['page'] = [str(page)] - current_params['per_page'] = [str(per_page)] + current_params = initial_params.copy() # 
Start with base filter params + current_params["page"] = [str(page)] + current_params["per_page"] = [str(per_page)] # Add/overwrite the 'select' parameter to fetch only needed fields - current_params['select'] = [select_fields] + current_params["select"] = [select_fields] # Add email for polite pool if not already present - if self.settings.OPENALEX_EMAIL and 'mailto' not in current_params: - current_params['mailto'] = [self.settings.OPENALEX_EMAIL] + if self.settings.OPENALEX_EMAIL and "mailto" not in current_params: + current_params["mailto"] = [self.settings.OPENALEX_EMAIL] # Log the request details (use base_endpoint as it's relative) - logger.debug(f"Fetching citing works page {page} using endpoint {base_endpoint} with params {current_params}") + logger.debug( + f"Fetching citing works page {page} using endpoint {base_endpoint} with params {current_params}" + ) try: # Make the request using the base endpoint and constructed params @@ -306,13 +372,13 @@ def get_citing_works( if not response.ok: # Log details if the request failed - error_msg = response.text[:200] # Basic error snippet + error_msg = response.text[:200] # Basic error snippet try: # Attempt to get a more specific message from JSON error response error_json = response.json() - error_msg = error_json.get('message', error_msg) + error_msg = error_json.get("message", error_msg) except requests.exceptions.JSONDecodeError: - pass # Ignore if response wasn't JSON + pass # Ignore if response wasn't JSON logger.error( f"OpenAlex API error fetching citing works page {page} from {citing_works_url}. 
" f"Status: {response.status_code}, Error: {error_msg}" @@ -323,42 +389,56 @@ def get_citing_works( # If response is OK, process the JSON data try: data = response.json() - results = data.get("results", []) # List of citing works + results = data.get("results", []) # List of citing works # Validate the results format if not isinstance(results, list): - logger.error(f"Unexpected 'results' format in citing works response (page {page}, expected list, got {type(results)}).") - return None # Cannot process invalid format + logger.error( + f"Unexpected 'results' format in citing works response (page {page}, expected list, got {type(results)})." + ) + return None # Cannot process invalid format # If no results are returned on the current page, we've reached the end if not results: - logger.debug(f"No more citing works found on page {page} for URL {citing_works_url}. Ending fetch.") - break # Exit the pagination loop + logger.debug( + f"No more citing works found on page {page} for URL {citing_works_url}. Ending fetch." + ) + break # Exit the pagination loop # Process the fetched items: add 'openalex_id' and respect max_results cleaned_results = [] for item in results: - if processed_count >= max_results: break # Stop adding if max reached mid-page + if processed_count >= max_results: + break # Stop adding if max reached mid-page # Ensure item has an ID before processing - if item and isinstance(item, dict) and item.get('id'): - oa_id_from_url = self._get_id_from_oa_url(item.get('id')) + if item and isinstance(item, dict) and item.get("id"): + oa_id_from_url = self._get_id_from_oa_url(item.get("id")) if oa_id_from_url: - item['openalex_id'] = oa_id_from_url # Add the 'W...' ID + item["openalex_id"] = ( + oa_id_from_url # Add the 'W...' 
ID + ) cleaned_results.append(item) processed_count += 1 else: - logger.warning(f"Could not parse OpenAlex ID from citing work item: {item.get('id')}") + logger.warning( + f"Could not parse OpenAlex ID from citing work item: {item.get('id')}" + ) else: - logger.warning(f"Skipping invalid item in citing works response: {item}") - + logger.warning( + f"Skipping invalid item in citing works response: {item}" + ) all_results.extend(cleaned_results) - logger.debug(f"Page {page}: fetched {len(results)} items, added {len(cleaned_results)}. Total collected: {processed_count}") + logger.debug( + f"Page {page}: fetched {len(results)} items, added {len(cleaned_results)}. Total collected: {processed_count}" + ) # Check if we've hit the max_results limit after processing the page if processed_count >= max_results: - logger.info(f"Reached max_results ({max_results}) while fetching citing works from {citing_works_url}.") - break # Exit the pagination loop + logger.info( + f"Reached max_results ({max_results}) while fetching citing works from {citing_works_url}." 
+ ) + break # Exit the pagination loop # Prepare for the next page page += 1 @@ -366,23 +446,33 @@ def get_citing_works( time.sleep(0.1) except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to parse JSON citing works response (page {page}) from {citing_works_url}: {json_err}", exc_info=True) - logger.debug(f"Response text causing decode error: {response.text[:500]}") - return None # Cannot proceed if JSON is invalid + logger.error( + f"Failed to parse JSON citing works response (page {page}) from {citing_works_url}: {json_err}", + exc_info=True, + ) + logger.debug( + f"Response text causing decode error: {response.text[:500]}" + ) + return None # Cannot proceed if JSON is invalid except ApiClientError as api_err: - # Propagate client-level errors - logger.error(f"API client error during citing works fetch (page {page}) from {citing_works_url}: {api_err}") - raise - except Exception as e: - # Catch unexpected errors during the loop - logger.exception(f"Unexpected error during citing works fetch (page {page}) from {citing_works_url}") - raise - - logger.info(f"Finished fetching citing works from {citing_works_url}. Retrieved {len(all_results)} results (processed count: {processed_count}).") + # Propagate client-level errors + logger.error( + f"API client error during citing works fetch (page {page}) from {citing_works_url}: {api_err}" + ) + raise + except Exception: + # Catch unexpected errors during the loop + logger.exception( + f"Unexpected error during citing works fetch (page {page}) from {citing_works_url}" + ) + raise + + logger.info( + f"Finished fetching citing works from {citing_works_url}. Retrieved {len(all_results)} results (processed count: {processed_count})." + ) return all_results - def get_work_basic_metadata(self, openalex_id: str) -> Optional[Dict[str, Any]]: """ Fetches a minimal set of metadata for a specific OpenAlex work using its ID. 
@@ -404,63 +494,92 @@ def get_work_basic_metadata(self, openalex_id: str) -> Optional[Dict[str, Any]]: ApiClientError: If the API request fails after retries. Exception: For other unexpected errors. """ - if not openalex_id: raise ValueError("OpenAlex ID cannot be empty.") + if not openalex_id: + raise ValueError("OpenAlex ID cannot be empty.") # Validate ID format strictly for this method (expects 'W...' format) - if not openalex_id.startswith('W') or not openalex_id[1:].isdigit(): - logger.error(f"Invalid OpenAlex Work ID format provided for basic fetch: '{openalex_id}'. Expected 'W' followed by digits.") - raise ValueError(f"Invalid OpenAlex Work ID format for basic fetch: {openalex_id}") + if not openalex_id.startswith("W") or not openalex_id[1:].isdigit(): + logger.error( + f"Invalid OpenAlex Work ID format provided for basic fetch: '{openalex_id}'. Expected 'W' followed by digits." + ) + raise ValueError( + f"Invalid OpenAlex Work ID format for basic fetch: {openalex_id}" + ) endpoint = f"/works/{openalex_id}" # Define the minimal set of fields required select_fields = "id,doi,title,publication_year" params = {"select": select_fields} - if self.settings.OPENALEX_EMAIL: params["mailto"] = self.settings.OPENALEX_EMAIL + if self.settings.OPENALEX_EMAIL: + params["mailto"] = self.settings.OPENALEX_EMAIL - logger.info(f"Fetching basic metadata for OpenAlex ID {openalex_id} (fields: {select_fields})") + logger.info( + f"Fetching basic metadata for OpenAlex ID {openalex_id} (fields: {select_fields})" + ) try: response = self._request("GET", endpoint, params=params) if response.status_code == 404: - logger.info(f"Work not found in OpenAlex (basic fetch): {openalex_id} (404)") + logger.info( + f"Work not found in OpenAlex (basic fetch): {openalex_id} (404)" + ) return None elif not response.ok: - # Log specific error message if available - error_msg = response.text[:200] - try: - error_json = response.json() - error_msg = error_json.get('message', error_msg) - except 
requests.exceptions.JSONDecodeError: pass - logger.error( - f"OpenAlex API error getting basic details for work {openalex_id}. " - f"Status: {response.status_code}, Error: {error_msg}" - ) - return None # Fail gracefully + # Log specific error message if available + error_msg = response.text[:200] + try: + error_json = response.json() + error_msg = error_json.get("message", error_msg) + except requests.exceptions.JSONDecodeError: + pass + logger.error( + f"OpenAlex API error getting basic details for work {openalex_id}. " + f"Status: {response.status_code}, Error: {error_msg}" + ) + return None # Fail gracefully # Process successful response try: work_data = response.json() # Verify the response contains an ID and it matches - if work_data and isinstance(work_data, dict) and work_data.get('id') and work_data['id'].endswith(openalex_id): - # Add the cleaned 'openalex_id' field for consistency - oa_id_from_url = self._get_id_from_oa_url(work_data.get('id')) - if oa_id_from_url: - work_data['openalex_id'] = oa_id_from_url - else: - # Should ideally always be parseable if ID matched endswith - logger.warning(f"Could not parse OpenAlex ID from work ID URL during basic fetch: {work_data.get('id')}") - return work_data + if ( + work_data + and isinstance(work_data, dict) + and work_data.get("id") + and work_data["id"].endswith(openalex_id) + ): + # Add the cleaned 'openalex_id' field for consistency + oa_id_from_url = self._get_id_from_oa_url(work_data.get("id")) + if oa_id_from_url: + work_data["openalex_id"] = oa_id_from_url + else: + # Should ideally always be parseable if ID matched endswith + logger.warning( + f"Could not parse OpenAlex ID from work ID URL during basic fetch: {work_data.get('id')}" + ) + return work_data else: - logger.warning(f"Received unexpected JSON structure or mismatched ID from basic fetch for {openalex_id}: {str(work_data)[:200]}") - return None + logger.warning( + f"Received unexpected JSON structure or mismatched ID from basic fetch for 
{openalex_id}: {str(work_data)[:200]}" + ) + return None except requests.exceptions.JSONDecodeError as json_err: - logger.error(f"Failed to parse JSON response from OpenAlex basic fetch for work {openalex_id} (Status: {response.status_code}): {json_err}", exc_info=True) - logger.debug(f"Response text causing decode error: {response.text[:500]}") + logger.error( + f"Failed to parse JSON response from OpenAlex basic fetch for work {openalex_id} (Status: {response.status_code}): {json_err}", + exc_info=True, + ) + logger.debug( + f"Response text causing decode error: {response.text[:500]}" + ) return None except ApiClientError as e: - logger.error(f"OpenAlex API client error getting basic details for work {openalex_id}: {e}") + logger.error( + f"OpenAlex API client error getting basic details for work {openalex_id}: {e}" + ) + raise + except Exception: + logger.exception( + f"Unexpected error getting basic details for work {openalex_id} via OpenAlex" + ) raise - except Exception as e: - logger.exception(f"Unexpected error getting basic details for work {openalex_id} via OpenAlex") - raise \ No newline at end of file diff --git a/backend/main.py b/backend/main.py index a03f203..49bfd9f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -16,6 +16,7 @@ # Import the main API router aggregate from the v1 API definition. from backend.api.v1.api import api_router as api_router_v1 + # Import the centralized logging configuration function. 
from backend.config.logging_config import setup_logging @@ -32,7 +33,7 @@ app = FastAPI( title="MOSS - Map of Open Source Science API", description="API for ingesting and querying data about open source scientific software and its relationships.", - version="0.1.0", # Consider linking this to a version managed elsewhere (e.g., pyproject.toml) + version="0.1.0", # Consider linking this to a version managed elsewhere (e.g., pyproject.toml) # Additional OpenAPI metadata can be added here (e.g., docs_url, redoc_url) ) @@ -42,23 +43,24 @@ # than the API. # Define allowed origins (adjust for development/production environments). origins = [ - "http://localhost", # Common local development origin - "http://localhost:5173", # Default Vite dev server port - "http://localhost:3000", # Default React dev server port + "http://localhost", # Common local development origin + "http://localhost:5173", # Default Vite dev server port + "http://localhost:3000", # Default React dev server port # Add production frontend URLs here, e.g., "https://moss.example.com" ] app.add_middleware( CORSMiddleware, - allow_origins=origins, # List of allowed origins. - allow_credentials=True, # Allow cookies to be included in requests. - allow_methods=["*"], # Allow all standard HTTP methods (GET, POST, etc.). - allow_headers=["*"], # Allow all request headers. + allow_origins=origins, # List of allowed origins. + allow_credentials=True, # Allow cookies to be included in requests. + allow_methods=["*"], # Allow all standard HTTP methods (GET, POST, etc.). + allow_headers=["*"], # Allow all request headers. ) # --- End CORS Middleware --- # --- Application Lifecycle Event Handlers --- + @app.on_event("startup") async def startup_event(): """ @@ -68,6 +70,7 @@ async def startup_event(): logger.info("MOSS API application starting up...") # Potential future actions: Initialize database connections pools, load caches, etc. 
+ @app.on_event("shutdown") async def shutdown_event(): """ @@ -76,8 +79,10 @@ async def shutdown_event(): logger.info("MOSS API application shutting down...") # Potential future actions: Close database connections, flush logs, etc. + # --- Basic Health Check Endpoint --- + @app.get("/health", tags=["Health"], summary="API Health Status") async def health_check(): """ @@ -87,6 +92,7 @@ async def health_check(): logger.debug("Health check endpoint '/health' invoked.") return {"status": "ok"} + # --- Include API Routers --- # Mount the API version 1 router under the '/api/v1' prefix. # All routes defined in api_router_v1 will be accessible relative to this path. @@ -99,4 +105,4 @@ async def health_check(): # --reload: Enables auto-reloading when code changes are detected. # --host 0.0.0.0: Makes the server accessible on the network (not just localhost). # --port 8000: Specifies the port to listen on. -# --- END OF FILE main.py --- \ No newline at end of file +# --- END OF FILE main.py --- diff --git a/backend/schemas/__init__.py b/backend/schemas/__init__.py index 4682222..3d72fc5 100644 --- a/backend/schemas/__init__.py +++ b/backend/schemas/__init__.py @@ -1,2 +1,2 @@ # Makes 'schemas' a Python package -# Optionally import common schemas or base classes here if needed later \ No newline at end of file +# Optionally import common schemas or base classes here if needed later diff --git a/backend/schemas/requests.py b/backend/schemas/requests.py index 43d7890..b2ec060 100644 --- a/backend/schemas/requests.py +++ b/backend/schemas/requests.py @@ -8,23 +8,35 @@ that required fields are present and conform to the expected types and formats before processing begins. """ -from typing import Dict, Any, Optional, List + +from typing import Dict, Any from pydantic import BaseModel, HttpUrl, Field + # --- Ingestion --- class IngestionRequest(BaseModel): """ Specifies the data required to initiate ingestion from a direct URL. 
Typically used for adding a specific repository or resource. """ - url: HttpUrl = Field(..., description="The URL of the resource to ingest (e.g., a GitHub repository URL). Must be a valid HTTP/HTTPS URL.") + + url: HttpUrl = Field( + ..., + description="The URL of the resource to ingest (e.g., a GitHub repository URL). Must be a valid HTTP/HTTPS URL.", + ) + class KeywordIngestionRequest(BaseModel): """ Specifies the data required to initiate ingestion based on keywords. Used for discovering resources via external search APIs (e.g., GitHub search). """ - keywords: str = Field(..., description="A string of keywords to use for searching and subsequent ingestion.") + + keywords: str = Field( + ..., + description="A string of keywords to use for searching and subsequent ingestion.", + ) + # --- Shared Recipes / Algorithms --- class RecipeExecutionRequest(BaseModel): @@ -32,7 +44,12 @@ class RecipeExecutionRequest(BaseModel): Defines the structure for requesting the execution of a generic recipe or algorithm. Requires specifying the parameters needed by the target script. """ - parameters: Dict[str, Any] = Field(..., description="A dictionary of parameters required by the specific recipe script being executed. Keys are parameter names, values are the corresponding parameter values.") + + parameters: Dict[str, Any] = Field( + ..., + description="A dictionary of parameters required by the specific recipe script being executed. Keys are parameter names, values are the corresponding parameter values.", + ) + # --- Affiliation Algorithms --- class AffiliationExecutionRequest(BaseModel): @@ -40,5 +57,12 @@ class AffiliationExecutionRequest(BaseModel): Specifies the data required to execute a repository-institution affiliation algorithm. Targets a specific institution and allows for algorithm-specific parameters. 
""" - institution_id: int = Field(..., description="The internal database ID of the institution for which to run the affiliation algorithm.") - parameters: Dict[str, Any] = Field({}, description="Optional dictionary of additional parameters required by the specific affiliation algorithm being executed. Structure depends on the algorithm.") \ No newline at end of file + + institution_id: int = Field( + ..., + description="The internal database ID of the institution for which to run the affiliation algorithm.", + ) + parameters: Dict[str, Any] = Field( + {}, + description="Optional dictionary of additional parameters required by the specific affiliation algorithm being executed. Structure depends on the algorithm.", + ) diff --git a/backend/schemas/responses.py b/backend/schemas/responses.py index f47a7da..748ca30 100644 --- a/backend/schemas/responses.py +++ b/backend/schemas/responses.py @@ -9,15 +9,12 @@ inherit from base models and summary models to promote reusability. """ -from pydantic import ( - BaseModel, ConfigDict, Field, HttpUrl, - field_validator, - ValidationInfo -) +from pydantic import BaseModel, ConfigDict, Field, HttpUrl, field_validator from typing import List, Optional, Dict, Any from datetime import datetime import uuid + # --- Base --- class BaseResponse(BaseModel): """ @@ -26,116 +23,179 @@ class BaseResponse(BaseModel): Includes optional database ID and timestamp fields, and configures Pydantic to allow population from ORM model attributes. """ + model_config = ConfigDict(from_attributes=True) - id: Optional[int | uuid.UUID] = Field(None, description="Unique identifier for the resource.") - created_at: Optional[datetime] = Field(None, description="Timestamp of resource creation (UTC).") - updated_at: Optional[datetime] = Field(None, description="Timestamp of last resource update (UTC).") + id: Optional[int | uuid.UUID] = Field( + None, description="Unique identifier for the resource." 
+ ) + created_at: Optional[datetime] = Field( + None, description="Timestamp of resource creation (UTC)." + ) + updated_at: Optional[datetime] = Field( + None, description="Timestamp of last resource update (UTC)." + ) + # --- Summaries (for lists) --- class RepositorySummary(BaseResponse): """ A concise representation of a Repository, suitable for list views. """ - id: int = Field(..., description="Internal database ID of the repository.") - full_name: str = Field(..., description="Full name of the repository (e.g., 'owner/repo').") - stargazers_count: Optional[int] = Field(0, description="Number of users who have starred the repository on GitHub.") - language: Optional[str] = Field(None, description="Primary programming language detected in the repository.") - description: Optional[str] = Field(None, description="Description of the repository provided on GitHub.") - html_url: Optional[HttpUrl] = Field(None, description="URL to the repository's main page on GitHub.") - @field_validator('html_url', mode='before') + id: int = Field(..., description="Internal database ID of the repository.") + full_name: str = Field( + ..., description="Full name of the repository (e.g., 'owner/repo')." + ) + stargazers_count: Optional[int] = Field( + 0, description="Number of users who have starred the repository on GitHub." + ) + language: Optional[str] = Field( + None, description="Primary programming language detected in the repository." + ) + description: Optional[str] = Field( + None, description="Description of the repository provided on GitHub." + ) + html_url: Optional[HttpUrl] = Field( + None, description="URL to the repository's main page on GitHub." 
+ ) + + @field_validator("html_url", mode="before") @classmethod def empty_str_to_none_html_url(cls, v: Any): """Ensure empty strings for HTML URLs are converted to None.""" - if isinstance(v, str) and v == '': + if isinstance(v, str) and v == "": return None return v + class WorkSummary(BaseResponse): """ A concise representation of a scholarly Work (publication), suitable for list views. """ + id: int = Field(..., description="Internal database ID of the work.") title: Optional[str] = Field(None, description="Title of the scholarly work.") - doi: Optional[str] = Field(None, description="Digital Object Identifier (DOI) of the work.") - publication_year: Optional[int] = Field(None, description="Year the work was published.") + doi: Optional[str] = Field( + None, description="Digital Object Identifier (DOI) of the work." + ) + publication_year: Optional[int] = Field( + None, description="Year the work was published." + ) + class PersonSummary(BaseResponse): """ A concise representation of a Person (author/contributor), suitable for list views. """ + id: int = Field(..., description="Internal database ID of the person.") - display_name: Optional[str] = Field(None, description="Primary display name of the person.") + display_name: Optional[str] = Field( + None, description="Primary display name of the person." + ) orcid: Optional[str] = Field(None, description="ORCID identifier for the person.") + class InstitutionSummary(BaseResponse): """ A concise representation of an Institution, suitable for list views. """ + id: int = Field(..., description="Internal database ID of the institution.") - display_name: Optional[str] = Field(None, description="Primary display name of the institution.") - ror: Optional[str] = Field(None, description="Research Organization Registry (ROR) identifier for the institution.") + display_name: Optional[str] = Field( + None, description="Primary display name of the institution." 
+ ) + ror: Optional[str] = Field( + None, + description="Research Organization Registry (ROR) identifier for the institution.", + ) + # --- Topic Hierarchy Summaries --- class DomainSummary(BaseResponse): """ A concise representation of an OpenAlex Domain, the highest level in the topic hierarchy. """ + id: int = Field(..., description="Internal database ID of the domain.") openalex_id: str = Field(..., description="OpenAlex ID for the domain.") display_name: str = Field(..., description="Display name of the domain.") + class FieldSummary(BaseResponse): """ A concise representation of an OpenAlex Field, nested under a Domain. """ + id: int = Field(..., description="Internal database ID of the field.") openalex_id: str = Field(..., description="OpenAlex ID for the field.") display_name: str = Field(..., description="Display name of the field.") + class SubfieldSummary(BaseResponse): """ A concise representation of an OpenAlex Subfield, nested under a Field. """ + id: int = Field(..., description="Internal database ID of the subfield.") openalex_id: str = Field(..., description="OpenAlex ID for the subfield.") display_name: str = Field(..., description="Display name of the subfield.") + class TopicSummary(BaseResponse): """ A concise representation of an OpenAlex Topic, the most granular level in the hierarchy, nested under a Subfield. """ + id: int = Field(..., description="Internal database ID of the topic.") openalex_id: str = Field(..., description="OpenAlex ID for the topic.") display_name: str = Field(..., description="Display name of the topic.") + class PrimaryTopicResponse(TopicSummary): """ Represents the primary topic associated with a resource (e.g., a Work), including its hierarchical context (Subfield, Field, Domain) and relevance score. 
""" - score: Optional[float] = Field(None, description="Relevance score assigned to this topic for the associated resource.") - subfield: Optional[SubfieldSummary] = Field(None, description="The Subfield this topic belongs to.") - field: Optional[FieldSummary] = Field(None, description="The Field this topic's Subfield belongs to.") - domain: Optional[DomainSummary] = Field(None, description="The Domain this topic's Field belongs to.") + + score: Optional[float] = Field( + None, + description="Relevance score assigned to this topic for the associated resource.", + ) + subfield: Optional[SubfieldSummary] = Field( + None, description="The Subfield this topic belongs to." + ) + field: Optional[FieldSummary] = Field( + None, description="The Field this topic's Subfield belongs to." + ) + domain: Optional[DomainSummary] = Field( + None, description="The Domain this topic's Field belongs to." + ) + # --- Full Responses --- class OwnerResponse(BaseResponse): """ Detailed representation of a GitHub Owner (User or Organization). """ + id: int = Field(..., description="Internal database ID of the owner.") github_id: int = Field(..., description="GitHub's unique ID for the owner.") login: str = Field(..., description="GitHub username or organization name.") - type: str = Field(..., description="Type of GitHub account ('User' or 'Organization').") - avatar_url: Optional[HttpUrl] = Field(None, description="URL of the owner's avatar image on GitHub.") - html_url: Optional[HttpUrl] = Field(None, description="URL to the owner's profile page on GitHub.") - - @field_validator('avatar_url', 'html_url', mode='before') + type: str = Field( + ..., description="Type of GitHub account ('User' or 'Organization')." + ) + avatar_url: Optional[HttpUrl] = Field( + None, description="URL of the owner's avatar image on GitHub." + ) + html_url: Optional[HttpUrl] = Field( + None, description="URL to the owner's profile page on GitHub." 
+ ) + + @field_validator("avatar_url", "html_url", mode="before") @classmethod def empty_str_to_none_owner_urls(cls, v: Any): """Ensure empty strings for owner URLs are converted to None.""" - if isinstance(v, str) and v == '': + if isinstance(v, str) and v == "": return None return v @@ -145,84 +205,159 @@ class ContributorResponse(BaseResponse): Detailed representation of a GitHub Repository Contributor. Note: This structure often mirrors OwnerResponse as contributors are GitHub Users. """ - id: int = Field(..., description="Internal database ID of the contributor record (distinct from the user ID).") - github_id: int = Field(..., description="GitHub's unique ID for the contributor (User).") + + id: int = Field( + ..., + description="Internal database ID of the contributor record (distinct from the user ID).", + ) + github_id: int = Field( + ..., description="GitHub's unique ID for the contributor (User)." + ) login: str = Field(..., description="GitHub username of the contributor.") type: str = Field(..., description="Type of GitHub account (usually 'User').") - avatar_url: Optional[HttpUrl] = Field(None, description="URL of the contributor's avatar image on GitHub.") - html_url: Optional[HttpUrl] = Field(None, description="URL to the contributor's profile page on GitHub.") - - @field_validator('avatar_url', 'html_url', mode='before') + avatar_url: Optional[HttpUrl] = Field( + None, description="URL of the contributor's avatar image on GitHub." + ) + html_url: Optional[HttpUrl] = Field( + None, description="URL to the contributor's profile page on GitHub." 
+ ) + + @field_validator("avatar_url", "html_url", mode="before") @classmethod def empty_str_to_none_contrib_urls(cls, v: Any): """Ensure empty strings for contributor URLs are converted to None.""" - if isinstance(v, str) and v == '': + if isinstance(v, str) and v == "": return None return v + class RepositoryResponse(RepositorySummary): """ Detailed representation of a GitHub Repository, extending the summary view. """ + github_id: int = Field(..., description="GitHub's unique ID for the repository.") name: str = Field(..., description="Name of the repository (without the owner).") - homepage: Optional[HttpUrl] = Field(None, description="URL of the project's homepage, if specified.") - api_url: Optional[HttpUrl] = Field(None, description="URL for accessing the repository via the GitHub API.") - watchers_count: Optional[int] = Field(0, description="Number of users watching the repository on GitHub.") - forks_count: Optional[int] = Field(0, description="Number of times the repository has been forked on GitHub.") - open_issues_count: Optional[int] = Field(0, description="Number of open issues in the repository.") - is_fork: Optional[bool] = Field(False, description="Indicates if the repository is a fork of another repository.") - gh_created_at: Optional[datetime] = Field(None, description="Timestamp when the repository was created on GitHub (UTC).") - gh_updated_at: Optional[datetime] = Field(None, description="Timestamp when the repository was last updated on GitHub (UTC).") - gh_pushed_at: Optional[datetime] = Field(None, description="Timestamp when code was last pushed to the repository on GitHub (UTC).") - owner_id: Optional[int] = Field(None, description="Internal database ID of the repository's owner.") - topics: Optional[List[str]] = Field(None, description="List of topics assigned to the repository on GitHub.") - license: Optional[Dict[str, Any]] = Field(None, description="Details of the repository's license, as detected by GitHub.") - - 
@field_validator('homepage', 'api_url', mode='before') + homepage: Optional[HttpUrl] = Field( + None, description="URL of the project's homepage, if specified." + ) + api_url: Optional[HttpUrl] = Field( + None, description="URL for accessing the repository via the GitHub API." + ) + watchers_count: Optional[int] = Field( + 0, description="Number of users watching the repository on GitHub." + ) + forks_count: Optional[int] = Field( + 0, description="Number of times the repository has been forked on GitHub." + ) + open_issues_count: Optional[int] = Field( + 0, description="Number of open issues in the repository." + ) + is_fork: Optional[bool] = Field( + False, + description="Indicates if the repository is a fork of another repository.", + ) + gh_created_at: Optional[datetime] = Field( + None, description="Timestamp when the repository was created on GitHub (UTC)." + ) + gh_updated_at: Optional[datetime] = Field( + None, + description="Timestamp when the repository was last updated on GitHub (UTC).", + ) + gh_pushed_at: Optional[datetime] = Field( + None, + description="Timestamp when code was last pushed to the repository on GitHub (UTC).", + ) + owner_id: Optional[int] = Field( + None, description="Internal database ID of the repository's owner." + ) + topics: Optional[List[str]] = Field( + None, description="List of topics assigned to the repository on GitHub." + ) + license: Optional[Dict[str, Any]] = Field( + None, description="Details of the repository's license, as detected by GitHub." + ) + + @field_validator("homepage", "api_url", mode="before") @classmethod def empty_str_to_none_repo_urls(cls, v: Any): """Ensure empty strings for repository homepage and API URLs are converted to None.""" - if isinstance(v, str) and v == '': + if isinstance(v, str) and v == "": return None return v + class WorkResponse(WorkSummary): """ Detailed representation of a scholarly Work (publication), extending the summary view. 
Includes information from OpenAlex and associated topic data. """ + openalex_id: Optional[str] = Field(None, description="OpenAlex ID for the work.") - type: Optional[str] = Field(None, description="Type of the scholarly work (e.g., 'article', 'book').") - cited_by_count: Optional[int] = Field(None, description="Number of times this work has been cited by other works, according to OpenAlex.") - host_venue_display_name: Optional[str] = Field(None, description="Display name of the host venue (e.g., journal, conference) where the work was published.") - openalex_url: Optional[HttpUrl] = Field(None, description="URL to the work's page on OpenAlex.") - primary_topic: Optional[PrimaryTopicResponse] = Field(None, description="The primary topic associated with the work, including its hierarchy.") - topics: Optional[List[TopicSummary]] = Field(None, description="List of all topics associated with the work, represented as summaries.") - - @field_validator('openalex_url', mode='before') + type: Optional[str] = Field( + None, description="Type of the scholarly work (e.g., 'article', 'book')." + ) + cited_by_count: Optional[int] = Field( + None, + description="Number of times this work has been cited by other works, according to OpenAlex.", + ) + host_venue_display_name: Optional[str] = Field( + None, + description="Display name of the host venue (e.g., journal, conference) where the work was published.", + ) + openalex_url: Optional[HttpUrl] = Field( + None, description="URL to the work's page on OpenAlex." 
+ ) + primary_topic: Optional[PrimaryTopicResponse] = Field( + None, + description="The primary topic associated with the work, including its hierarchy.", + ) + topics: Optional[List[TopicSummary]] = Field( + None, + description="List of all topics associated with the work, represented as summaries.", + ) + + @field_validator("openalex_url", mode="before") @classmethod def empty_str_to_none_work_urls(cls, v: Any): """Ensure empty strings for OpenAlex URLs are converted to None.""" - if isinstance(v, str) and v == '': + if isinstance(v, str) and v == "": return None return v + class PersonResponse(PersonSummary): """ Detailed representation of a Person (author/contributor), extending the summary view. """ - openalex_id: Optional[str] = Field(None, description="OpenAlex ID associated with the person.") - display_name_alternatives: Optional[List[str]] = Field(None, description="Alternative names or spellings associated with the person.") + + openalex_id: Optional[str] = Field( + None, description="OpenAlex ID associated with the person." + ) + display_name_alternatives: Optional[List[str]] = Field( + None, description="Alternative names or spellings associated with the person." + ) + class InstitutionResponse(InstitutionSummary): """ Detailed representation of an Institution, extending the summary view. """ - openalex_id: Optional[str] = Field(None, description="OpenAlex ID associated with the institution.") - country_code: Optional[str] = Field(None, description="ISO 3166-1 alpha-2 country code for the institution's location.") - type: Optional[str] = Field(None, description="Type of institution (e.g., 'education', 'government').") - github_organization_logins: Optional[List[str]] = Field(None, description="List of GitHub organization logins potentially associated with this institution.") + + openalex_id: Optional[str] = Field( + None, description="OpenAlex ID associated with the institution." 
+ ) + country_code: Optional[str] = Field( + None, + description="ISO 3166-1 alpha-2 country code for the institution's location.", + ) + type: Optional[str] = Field( + None, description="Type of institution (e.g., 'education', 'government')." + ) + github_organization_logins: Optional[List[str]] = Field( + None, + description="List of GitHub organization logins potentially associated with this institution.", + ) # --- Discovery & Search --- @@ -230,33 +365,76 @@ class DiscoveryChainSummary(BaseResponse): """ Summary of a discovery chain process, representing a traversal through related entities. """ - id: uuid.UUID = Field(..., description="Unique identifier for this specific discovery chain step or link.") - root_chain_id: Optional[uuid.UUID] = Field(None, description="Identifier of the initial starting point of the overall discovery process.") - level: Optional[int] = Field(None, description="Depth or level of this step within the discovery chain.") - discovery_type: Optional[str] = Field(None, description="Type or method used for this discovery step (e.g., 'REPOSITORY_TO_WORK', 'WORK_TO_AUTHOR').") - status: Optional[str] = Field(None, description="Current status of this discovery step (e.g., 'PENDING', 'PROCESSING', 'COMPLETED', 'FAILED').") - started_at: Optional[datetime] = Field(None, description="Timestamp when processing for this step started (UTC).") - completed_at: Optional[datetime] = Field(None, description="Timestamp when processing for this step completed (UTC).") + + id: uuid.UUID = Field( + ..., + description="Unique identifier for this specific discovery chain step or link.", + ) + root_chain_id: Optional[uuid.UUID] = Field( + None, + description="Identifier of the initial starting point of the overall discovery process.", + ) + level: Optional[int] = Field( + None, description="Depth or level of this step within the discovery chain." 
+ ) + discovery_type: Optional[str] = Field( + None, + description="Type or method used for this discovery step (e.g., 'REPOSITORY_TO_WORK', 'WORK_TO_AUTHOR').", + ) + status: Optional[str] = Field( + None, + description="Current status of this discovery step (e.g., 'PENDING', 'PROCESSING', 'COMPLETED', 'FAILED').", + ) + started_at: Optional[datetime] = Field( + None, description="Timestamp when processing for this step started (UTC)." + ) + completed_at: Optional[datetime] = Field( + None, description="Timestamp when processing for this step completed (UTC)." + ) + class KeywordSearchSessionResponse(BaseResponse): """ Represents the results and status of a keyword search session used for ingestion. """ + id: int = Field(..., description="Internal database ID for the search session.") - keywords_raw: str = Field(..., description="The raw keyword string used for the search.") - status: str = Field(..., description="Current status of the search session (e.g., 'PENDING', 'RUNNING', 'COMPLETED', 'FAILED').") - results_count: Optional[int] = Field(None, description="Number of relevant items found or processed during the session.") - started_at: Optional[datetime] = Field(None, description="Timestamp when the search session started (UTC).") - completed_at: Optional[datetime] = Field(None, description="Timestamp when the search session completed (UTC).") + keywords_raw: str = Field( + ..., description="The raw keyword string used for the search." + ) + status: str = Field( + ..., + description="Current status of the search session (e.g., 'PENDING', 'RUNNING', 'COMPLETED', 'FAILED').", + ) + results_count: Optional[int] = Field( + None, + description="Number of relevant items found or processed during the session.", + ) + started_at: Optional[datetime] = Field( + None, description="Timestamp when the search session started (UTC)." + ) + completed_at: Optional[datetime] = Field( + None, description="Timestamp when the search session completed (UTC)." 
+ ) + # --- Surfacing --- class RepositoryCitationCountResponse(BaseModel): """ Provides aggregated citation counts for a specific repository. """ - repository_id: int = Field(..., description="Internal database ID of the repository.") - openalex_aggregated_citations: int = Field(..., description="Total citations of works linked to this repository, based on OpenAlex's cited_by_count.") - moss_discovered_citations: int = Field(..., description="Count of unique citing works discovered and linked within the MOSS system itself.") + + repository_id: int = Field( + ..., description="Internal database ID of the repository." + ) + openalex_aggregated_citations: int = Field( + ..., + description="Total citations of works linked to this repository, based on OpenAlex's cited_by_count.", + ) + moss_discovered_citations: int = Field( + ..., + description="Count of unique citing works discovered and linked within the MOSS system itself.", + ) model_config = ConfigDict(from_attributes=True) @@ -265,27 +443,53 @@ class RecipeParameterMetadataResponse(BaseModel): """ Metadata describing a single parameter required by a recipe or algorithm. """ + name: str = Field(..., description="Name of the parameter.") - type: str = Field(..., description="Expected data type of the parameter (e.g., 'string', 'integer', 'boolean').") - description: str = Field(..., description="Description of the parameter's purpose and usage.") + type: str = Field( + ..., + description="Expected data type of the parameter (e.g., 'string', 'integer', 'boolean').", + ) + description: str = Field( + ..., description="Description of the parameter's purpose and usage." + ) + class RecipeMetadataResponse(BaseModel): """ Metadata describing a discoverable recipe or algorithm script. 
""" + name: str = Field(..., description="Unique name identifying the recipe/algorithm.") version: str = Field(..., description="Version string for the recipe/algorithm.") - description: str = Field(..., description="Description of what the recipe/algorithm does.") - parameters: List[RecipeParameterMetadataResponse] = Field(..., description="List of parameters required to execute the recipe/algorithm.") - file_path: str = Field(..., description="Relative path to the script file within the recipes directory.") + description: str = Field( + ..., description="Description of what the recipe/algorithm does." + ) + parameters: List[RecipeParameterMetadataResponse] = Field( + ..., description="List of parameters required to execute the recipe/algorithm." + ) + file_path: str = Field( + ..., + description="Relative path to the script file within the recipes directory.", + ) + class RecipeExecutionResponse(BaseModel): """ Standard response structure for the execution of a recipe or algorithm. """ - success: bool = Field(..., description="Indicates whether the execution completed successfully.") - data: Optional[Any] = Field(None, description="Output data generated by the successful execution, structure depends on the recipe.") - error: Optional[Dict[str, str]] = Field(None, description="Details of any error that occurred during execution (e.g., {'type': '...', 'message': '...'}).") + + success: bool = Field( + ..., description="Indicates whether the execution completed successfully." 
+ ) + data: Optional[Any] = Field( + None, + description="Output data generated by the successful execution, structure depends on the recipe.", + ) + error: Optional[Dict[str, str]] = Field( + None, + description="Details of any error that occurred during execution (e.g., {'type': '...', 'message': '...'}).", + ) + # --- Affiliation Algorithm Responses --- class AffiliationResultResponse(BaseResponse): @@ -293,27 +497,65 @@ class AffiliationResultResponse(BaseResponse): Represents a potential affiliation link between a repository and an institution, as determined by an affiliation algorithm. Includes evidence and confidence. """ - repository_id: int = Field(..., description="Internal database ID of the repository.") - institution_id: int = Field(..., description="Internal database ID of the institution.") - algorithm_name: str = Field(..., description="Name of the algorithm that generated this affiliation result.") + + repository_id: int = Field( + ..., description="Internal database ID of the repository." + ) + institution_id: int = Field( + ..., description="Internal database ID of the institution." + ) + algorithm_name: str = Field( + ..., description="Name of the algorithm that generated this affiliation result." 
+ ) algorithm_version: str = Field(..., description="Version of the algorithm used.") - confidence_score: float = Field(..., description="A score (typically 0-1) indicating the algorithm's confidence in this affiliation.") - evidence: Optional[Dict[str, Any]] = Field(None, description="Data used by the algorithm as evidence for this affiliation (structure varies by algorithm).") - parameters_used: Optional[Dict[str, Any]] = Field(None, description="Parameters provided to the algorithm during this execution.") - calculated_at: datetime = Field(..., description="Timestamp when this affiliation result was calculated (UTC).") + confidence_score: float = Field( + ..., + description="A score (typically 0-1) indicating the algorithm's confidence in this affiliation.", + ) + evidence: Optional[Dict[str, Any]] = Field( + None, + description="Data used by the algorithm as evidence for this affiliation (structure varies by algorithm).", + ) + parameters_used: Optional[Dict[str, Any]] = Field( + None, description="Parameters provided to the algorithm during this execution." + ) + calculated_at: datetime = Field( + ..., description="Timestamp when this affiliation result was calculated (UTC)." + ) # Optional fields for convenience, denormalized from related tables - repository_name: Optional[str] = Field(None, description="Full name of the associated repository (owner/repo).") - institution_name: Optional[str] = Field(None, description="Display name of the associated institution.") + repository_name: Optional[str] = Field( + None, description="Full name of the associated repository (owner/repo)." + ) + institution_name: Optional[str] = Field( + None, description="Display name of the associated institution." + ) + class AffiliationExecutionResponse(BaseModel): """ Summarizes the outcome of executing an affiliation algorithm for a specific institution. 
""" - status: str = Field(..., description="Overall status of the algorithm execution (e.g., 'COMPLETED', 'FAILED', 'PARTIAL_SUCCESS').") - message: str = Field(..., description="A human-readable summary message about the execution process and outcome.") - processed_count: int = Field(..., description="Number of potential affiliation results generated or evaluated by the algorithm.") - created_count: int = Field(..., description="Number of new affiliation records created in the database based on the algorithm's findings.") - updated_count: int = Field(..., description="Number of existing affiliation records updated (e.g., confidence score) based on the algorithm's findings.") + + status: str = Field( + ..., + description="Overall status of the algorithm execution (e.g., 'COMPLETED', 'FAILED', 'PARTIAL_SUCCESS').", + ) + message: str = Field( + ..., + description="A human-readable summary message about the execution process and outcome.", + ) + processed_count: int = Field( + ..., + description="Number of potential affiliation results generated or evaluated by the algorithm.", + ) + created_count: int = Field( + ..., + description="Number of new affiliation records created in the database based on the algorithm's findings.", + ) + updated_count: int = Field( + ..., + description="Number of existing affiliation records updated (e.g., confidence score) based on the algorithm's findings.", + ) # --- Ingestion History Context --- @@ -322,10 +564,21 @@ class IngestionHistoryContextResponse(BaseModel): Provides context about the last ingestion event relevant to a specific parameter (e.g., the last time a specific keyword search was run). 
""" - param_type: str = Field(..., description="Type of the parameter being queried (e.g., 'KEYWORD', 'URL').") - param_value: str = Field(..., description="Value of the parameter (e.g., the specific keyword or URL).") - last_ingested_at: Optional[datetime] = Field(None, description="Timestamp of the most recent completed ingestion event related to this parameter (UTC).") - ingestion_type: Optional[str] = Field(None, description="Type of the last ingestion event (e.g., 'KEYWORD_SEARCH', 'DIRECT_URL', 'GITHUB_TRENDING').") + + param_type: str = Field( + ..., description="Type of the parameter being queried (e.g., 'KEYWORD', 'URL')." + ) + param_value: str = Field( + ..., description="Value of the parameter (e.g., the specific keyword or URL)." + ) + last_ingested_at: Optional[datetime] = Field( + None, + description="Timestamp of the most recent completed ingestion event related to this parameter (UTC).", + ) + ingestion_type: Optional[str] = Field( + None, + description="Type of the last ingestion event (e.g., 'KEYWORD_SEARCH', 'DIRECT_URL', 'GITHUB_TRENDING').", + ) # --- Discovery Algorithm Responses --- @@ -335,16 +588,35 @@ class IngestionHistoryContextResponse(BaseModel): Currently expected to be a list of strings (e.g., URLs or identifiers found). """ + class SoftwareDependencyResponse(BaseResponse): """ Represents a detected software dependency within a repository's source files. 
""" + id: int = Field(..., description="Internal database ID for this dependency record.") - repository_id: int = Field(..., description="Internal database ID of the repository containing this dependency.") - dependency_name: str = Field(..., description="Name of the dependency package or library.") - version_constraint: Optional[str] = Field(None, description="Version constraint specified for the dependency (e.g., '>=1.0', '^2.1.3').") - source_file: str = Field(..., description="Path to the file where this dependency was declared (e.g., 'requirements.txt', 'package.json').") - dependency_type: str = Field(..., description="Type or ecosystem of the dependency (e.g., 'pip', 'npm', 'maven').") - is_dev_dependency: Optional[bool] = Field(None, description="Indicates if this is classified as a development dependency (vs. runtime).") + repository_id: int = Field( + ..., + description="Internal database ID of the repository containing this dependency.", + ) + dependency_name: str = Field( + ..., description="Name of the dependency package or library." + ) + version_constraint: Optional[str] = Field( + None, + description="Version constraint specified for the dependency (e.g., '>=1.0', '^2.1.3').", + ) + source_file: str = Field( + ..., + description="Path to the file where this dependency was declared (e.g., 'requirements.txt', 'package.json').", + ) + dependency_type: str = Field( + ..., + description="Type or ecosystem of the dependency (e.g., 'pip', 'npm', 'maven').", + ) + is_dev_dependency: Optional[bool] = Field( + None, + description="Indicates if this is classified as a development dependency (vs. 
runtime).", + ) # Timestamps inherited from BaseResponse (created_at, updated_at) - model_config = ConfigDict(from_attributes=True) \ No newline at end of file + model_config = ConfigDict(from_attributes=True) diff --git a/backend/services/__init__.py b/backend/services/__init__.py index a06c765..6465e0a 100644 --- a/backend/services/__init__.py +++ b/backend/services/__init__.py @@ -6,7 +6,7 @@ from .ingestion_service import IngestionService from .keyword_discovery_service import KeywordDiscoveryService from .scholarly_processing_service import ScholarlyProcessingService -from .surfacing_service import SurfacingService # <-- ADD THIS IMPORT +from .surfacing_service import SurfacingService # <-- ADD THIS IMPORT __all__ = [ "BaseService", @@ -15,5 +15,5 @@ "IngestionService", "KeywordDiscoveryService", "ScholarlyProcessingService", - "SurfacingService", # <-- ADD THIS TO LIST -] \ No newline at end of file + "SurfacingService", # <-- ADD THIS TO LIST +] diff --git a/backend/services/base_service.py b/backend/services/base_service.py index f5a16cf..4d5b7a8 100644 --- a/backend/services/base_service.py +++ b/backend/services/base_service.py @@ -6,6 +6,7 @@ import logging + class BaseService: """ Base class for service layer components. @@ -14,6 +15,7 @@ class BaseService: Subclasses should implement specific business logic and typically receive dependencies (like repositories or other services) during initialization. """ + # Initialize a logger specific to the service instance, named after the module. logger = logging.getLogger(__name__) @@ -31,4 +33,4 @@ def __init__(self): # self.another_service = another_service # Common utility methods or shared functionality for services - # could be added here in the future if needed. \ No newline at end of file + # could be added here in the future if needed. 
diff --git a/backend/services/discovery_chain_service.py b/backend/services/discovery_chain_service.py index 0fa2f68..d61e8de 100644 --- a/backend/services/discovery_chain_service.py +++ b/backend/services/discovery_chain_service.py @@ -5,20 +5,19 @@ These chains track the provenance of discovered data points and their relationships. """ -import logging import uuid from datetime import datetime, timezone -from typing import Optional, Dict, Any, Type, TYPE_CHECKING +from typing import Optional, Dict, Any from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError # --- Added WorkTopic to model imports --- -from backend.data.models import DiscoveryChain, EntityDiscoveryAssociation, WorkTopic +from backend.data.models import DiscoveryChain, EntityDiscoveryAssociation + # --- End Add --- from backend.data.repositories import ( DiscoveryChainRepository, - EntityDiscoveryAssociationRepository, ) from .base_service import BaseService @@ -52,13 +51,18 @@ def get_by_uuid(self, db: Session, id: uuid.UUID) -> Optional[DiscoveryChain]: self.logger.debug(f"Getting DiscoveryChain by UUID: {id}") repo = DiscoveryChainRepository(db) try: - return repo.get(id=id) + return repo.get(id=id) except SQLAlchemyError as e: - self.logger.error(f"Database error getting DiscoveryChain UUID {id}: {e}", exc_info=True) + self.logger.error( + f"Database error getting DiscoveryChain UUID {id}: {e}", exc_info=True + ) raise def create_root_chain( - self, db: Session, discovery_type: str, parameters: Optional[Dict[str, Any]] = None + self, + db: Session, + discovery_type: str, + parameters: Optional[Dict[str, Any]] = None, ) -> DiscoveryChain: """ Creates a new root DiscoveryChain (level 0). 
@@ -82,22 +86,24 @@ def create_root_chain( new_chain = DiscoveryChain( id=new_id, parent_chain_id=None, - root_chain_id=new_id, # A root chain is its own root + root_chain_id=new_id, # A root chain is its own root level=0, discovery_type=discovery_type, parameters=parameters, - status='PENDING', # Initial status - started_at=datetime.now(timezone.utc) + status="PENDING", # Initial status + started_at=datetime.now(timezone.utc), ) try: db.add(new_chain) - db.flush() # Ensure the chain object has its ID assigned before returning - db.refresh(new_chain) # Load any server-defaults if applicable + db.flush() # Ensure the chain object has its ID assigned before returning + db.refresh(new_chain) # Load any server-defaults if applicable self.logger.info(f"Created and flushed root chain {new_chain.id}") return new_chain except SQLAlchemyError as e: - self.logger.error(f"Error creating/flushing root discovery chain: {e}", exc_info=True) - db.rollback() # Rollback this specific operation on error + self.logger.error( + f"Error creating/flushing root discovery chain: {e}", exc_info=True + ) + db.rollback() # Rollback this specific operation on error raise def create_child_chain( @@ -127,45 +133,59 @@ def create_child_chain( ValueError: If the parent chain is missing its ID or root_chain_id. SQLAlchemyError: If a database error occurs during creation or flush. 
""" - self.logger.info(f"Creating child chain under {parent_chain.id}: type='{discovery_type}'") + self.logger.info( + f"Creating child chain under {parent_chain.id}: type='{discovery_type}'" + ) # Ensure parent chain has necessary IDs (already fetched or refreshed) if parent_chain.id is None or parent_chain.root_chain_id is None: - # Attempt to refresh the parent object state from the DB if IDs are missing - try: - db.refresh(parent_chain) - if parent_chain.id is None or parent_chain.root_chain_id is None: - # If still missing after refresh, it indicates a problem - raise ValueError("Parent chain ID or root ID is None even after refresh.") - except Exception as refresh_err: - self.logger.error(f"Failed to refresh parent chain {parent_chain}: {refresh_err}") - raise ValueError("Parent chain must have valid id and root_chain_id, refresh failed.") from refresh_err + # Attempt to refresh the parent object state from the DB if IDs are missing + try: + db.refresh(parent_chain) + if parent_chain.id is None or parent_chain.root_chain_id is None: + # If still missing after refresh, it indicates a problem + raise ValueError( + "Parent chain ID or root ID is None even after refresh." + ) + except Exception as refresh_err: + self.logger.error( + f"Failed to refresh parent chain {parent_chain}: {refresh_err}" + ) + raise ValueError( + "Parent chain must have valid id and root_chain_id, refresh failed." 
+ ) from refresh_err new_chain = DiscoveryChain( parent_chain_id=parent_chain.id, - root_chain_id=parent_chain.root_chain_id, # Inherit root from parent - level=parent_chain.level + 1, # Increment hierarchy level + root_chain_id=parent_chain.root_chain_id, # Inherit root from parent + level=parent_chain.level + 1, # Increment hierarchy level discovery_type=discovery_type, parameters=parameters, - status='PENDING', # Initial status - started_at=datetime.now(timezone.utc) + status="PENDING", # Initial status + started_at=datetime.now(timezone.utc), ) try: db.add(new_chain) - db.flush() # Ensure the chain object has its ID assigned before returning - db.refresh(new_chain) # Load any server-defaults - self.logger.info(f"Created and flushed child chain {new_chain.id} under {parent_chain.id}") + db.flush() # Ensure the chain object has its ID assigned before returning + db.refresh(new_chain) # Load any server-defaults + self.logger.info( + f"Created and flushed child chain {new_chain.id} under {parent_chain.id}" + ) return new_chain except SQLAlchemyError as e: self.logger.error( f"Error creating/flushing child discovery chain under {parent_chain.id}: {e}", - exc_info=True + exc_info=True, ) # Let the caller handle transaction rollback as this might be part of a larger operation raise def _update_chain_status( - self, db: Session, chain: DiscoveryChain, status: str, timestamp: Optional[datetime] = None + self, + db: Session, + chain: DiscoveryChain, + status: str, + timestamp: Optional[datetime] = None, ) -> DiscoveryChain: """ Internal helper to update the status of a DiscoveryChain and optionally set completion time. 
@@ -190,26 +210,33 @@ def _update_chain_status( self.logger.debug(f"Updating chain {chain.id} status to {status}") chain.status = status if timestamp: - # Set completion timestamp only for terminal states - if status in ['COMPLETED', 'FAILED', 'PARTIAL']: - chain.completed_at = timestamp + # Set completion timestamp only for terminal states + if status in ["COMPLETED", "FAILED", "PARTIAL"]: + chain.completed_at = timestamp try: - db.add(chain) # Add to session to ensure changes are tracked - db.flush() # Persist status change immediately - db.refresh(chain) # Refresh to get accurate state from DB, including potential triggers + db.add(chain) # Add to session to ensure changes are tracked + db.flush() # Persist status change immediately + db.refresh( + chain + ) # Refresh to get accurate state from DB, including potential triggers return chain except SQLAlchemyError as e: - self.logger.error(f"Error updating/flushing chain {chain.id} status to {status}: {e}", exc_info=True) - # Let the caller handle transaction rollback - raise + self.logger.error( + f"Error updating/flushing chain {chain.id} status to {status}: {e}", + exc_info=True, + ) + # Let the caller handle transaction rollback + raise def start_chain(self, db: Session, chain: DiscoveryChain) -> DiscoveryChain: """Sets the chain status to 'PROCESSING'.""" - return self._update_chain_status(db, chain, 'PROCESSING') + return self._update_chain_status(db, chain, "PROCESSING") def complete_chain(self, db: Session, chain: DiscoveryChain) -> DiscoveryChain: """Sets the chain status to 'COMPLETED' and records the completion time.""" - return self._update_chain_status(db, chain, 'COMPLETED', datetime.now(timezone.utc)) + return self._update_chain_status( + db, chain, "COMPLETED", datetime.now(timezone.utc) + ) def fail_chain( self, db: Session, chain: DiscoveryChain, error_message: Optional[str] = None @@ -225,9 +252,13 @@ def fail_chain( Returns: The updated DiscoveryChain object. 
""" - self.logger.error(f"Discovery chain {chain.id} failed. Type: {chain.discovery_type}. Error: {error_message or 'N/A'}") + self.logger.error( + f"Discovery chain {chain.id} failed. Type: {chain.discovery_type}. Error: {error_message or 'N/A'}" + ) # Future enhancement: could store error_message in chain.parameters or a dedicated field - return self._update_chain_status(db, chain, 'FAILED', datetime.now(timezone.utc)) + return self._update_chain_status( + db, chain, "FAILED", datetime.now(timezone.utc) + ) def associate_entity( self, db: Session, chain: DiscoveryChain, entity: Any, is_direct: bool = True @@ -256,50 +287,66 @@ def associate_entity( """ if entity is None: # Cannot associate a non-existent entity - self.logger.warning(f"Attempted to associate a None entity to chain {chain.id}. Skipping.") + self.logger.warning( + f"Attempted to associate a None entity to chain {chain.id}. Skipping." + ) return None entity_type = entity.__class__.__name__ # Define entity types that use composite primary keys and don't have a single 'id' column # --- ADDED WorkTopic to this list --- - association_types_no_id = ('Authorship', 'Affiliation', 'WorkCitation', 'RepositoryContributorAssociation', 'WorkTopic') + association_types_no_id = ( + "Authorship", + "Affiliation", + "WorkCitation", + "RepositoryContributorAssociation", + "WorkTopic", + ) # --- END ADD --- - entity_id: Optional[int] = None # Standard integer ID + entity_id: Optional[int] = None # Standard integer ID if entity_type not in association_types_no_id: # For standard entities, get the primary key ID - entity_id = getattr(entity, 'id', None) + entity_id = getattr(entity, "id", None) if entity_id is None: # Ensure the entity has been flushed and has an ID before associating - self.logger.error(f"Attempted to associate entity of type {entity_type} without an ID to chain {chain.id}") - raise ValueError(f"Entity {entity_type} must have an ID before association.") + self.logger.error( + f"Attempted to associate 
entity of type {entity_type} without an ID to chain {chain.id}" + ) + raise ValueError( + f"Entity {entity_type} must have an ID before association." + ) # --- Added else block for logging composite PK types --- else: - # For types with composite keys, create a representation for logging - pk_repr = '[CompositePK]' - try: - # Introspect SQLAlchemy mapper to find primary key columns - if hasattr(entity, '__mapper__'): - pk_cols = [c.name for c in entity.__mapper__.primary_key] - pk_vals = [getattr(entity, c, None) for c in pk_cols] - pk_repr = ', '.join(f"{k}={v}" for k, v in zip(pk_cols, pk_vals)) - self.logger.debug(f"Associating entity type {entity_type} ({pk_repr}) which uses composite PK.") - except Exception as pk_log_err: - self.logger.warning(f"Could not fully represent composite PK for {entity_type}: {pk_log_err}") + # For types with composite keys, create a representation for logging + pk_repr = "[CompositePK]" + try: + # Introspect SQLAlchemy mapper to find primary key columns + if hasattr(entity, "__mapper__"): + pk_cols = [c.name for c in entity.__mapper__.primary_key] + pk_vals = [getattr(entity, c, None) for c in pk_cols] + pk_repr = ", ".join(f"{k}={v}" for k, v in zip(pk_cols, pk_vals)) + self.logger.debug( + f"Associating entity type {entity_type} ({pk_repr}) which uses composite PK." 
+ ) + except Exception as pk_log_err: + self.logger.warning( + f"Could not fully represent composite PK for {entity_type}: {pk_log_err}" + ) # --- End Added --- - if chain.id is None: - # The chain must exist in the DB before associations can be made - raise ValueError("DiscoveryChain must have an ID before association.") + # The chain must exist in the DB before associations can be made + raise ValueError("DiscoveryChain must have an ID before association.") # --- Adjusted Log Message --- # Use the appropriate identifier representation for logging entity_id_repr = entity_id if entity_id is not None else pk_repr - self.logger.debug(f"Associating {entity_type} ({entity_id_repr}) with chain {chain.id} (direct={is_direct})") + self.logger.debug( + f"Associating {entity_type} ({entity_id_repr}) with chain {chain.id} (direct={is_direct})" + ) # --- End Adjusted --- - # Prepare filters to check if this association already exists lookup_filters: Dict[str, Any] = { "discovery_chain_id": chain.id, @@ -307,7 +354,7 @@ def associate_entity( } # Only filter by entity_id if it's applicable (not a composite PK type) if entity_type not in association_types_no_id: - lookup_filters["entity_id"] = entity_id + lookup_filters["entity_id"] = entity_id # For composite PK types, we rely on the combination of chain_id and entity_type # being unique for the purpose of this lookup. If more complex uniqueness checks # involving composite keys are needed, this logic would need enhancement. 
@@ -316,42 +363,44 @@ def associate_entity( association_data = { "discovery_chain_id": chain.id, "entity_type": entity_type, - "entity_id": entity_id, # Store None for composite PK types in this column + "entity_id": entity_id, # Store None for composite PK types in this column "is_direct_discovery": is_direct, } try: - # --- Modified Lookup Logic --- - # Build the query based on filters - query = db.query(EntityDiscoveryAssociation).filter_by( - discovery_chain_id=lookup_filters["discovery_chain_id"], - entity_type=lookup_filters["entity_type"] - ) - # Add entity_id filter only if applicable - if "entity_id" in lookup_filters: - query = query.filter(EntityDiscoveryAssociation.entity_id == lookup_filters["entity_id"]) - else: - # For composite PK types, ensure we match records where entity_id IS NULL - query = query.filter(EntityDiscoveryAssociation.entity_id.is_(None)) - - existing_assoc = query.first() - # --- End Modified Lookup --- - - if existing_assoc: - # Avoid creating duplicate associations - self.logger.debug("Association already exists, skipping creation.") - return existing_assoc - - # Create and persist the new association - new_assoc = EntityDiscoveryAssociation(**association_data) - db.add(new_assoc) - db.flush() # Assign primary key to the association record itself - db.refresh(new_assoc) # Load defaults like created_at - return new_assoc + # --- Modified Lookup Logic --- + # Build the query based on filters + query = db.query(EntityDiscoveryAssociation).filter_by( + discovery_chain_id=lookup_filters["discovery_chain_id"], + entity_type=lookup_filters["entity_type"], + ) + # Add entity_id filter only if applicable + if "entity_id" in lookup_filters: + query = query.filter( + EntityDiscoveryAssociation.entity_id == lookup_filters["entity_id"] + ) + else: + # For composite PK types, ensure we match records where entity_id IS NULL + query = query.filter(EntityDiscoveryAssociation.entity_id.is_(None)) + + existing_assoc = query.first() + # --- End 
Modified Lookup --- + + if existing_assoc: + # Avoid creating duplicate associations + self.logger.debug("Association already exists, skipping creation.") + return existing_assoc + + # Create and persist the new association + new_assoc = EntityDiscoveryAssociation(**association_data) + db.add(new_assoc) + db.flush() # Assign primary key to the association record itself + db.refresh(new_assoc) # Load defaults like created_at + return new_assoc except SQLAlchemyError as e: - self.logger.error( - f"Error creating/flushing {entity_type} ({entity_id_repr}) association with chain {chain.id}: {e}", - exc_info=True - ) - # Let the caller handle transaction rollback - raise \ No newline at end of file + self.logger.error( + f"Error creating/flushing {entity_type} ({entity_id_repr}) association with chain {chain.id}: {e}", + exc_info=True, + ) + # Let the caller handle transaction rollback + raise diff --git a/backend/services/doi_processing_service.py b/backend/services/doi_processing_service.py index ea808cc..7c4bb42 100644 --- a/backend/services/doi_processing_service.py +++ b/backend/services/doi_processing_service.py @@ -8,27 +8,32 @@ import logging import re -import time # Ensure time is imported for sleep -from typing import Optional, TYPE_CHECKING, List, Set, Dict, Any, Tuple +from typing import Optional, List -from sqlalchemy.orm import Session, make_transient -from sqlalchemy.exc import IntegrityError, SQLAlchemyError +from sqlalchemy.orm import Session # Import models and repositories -from backend.data.models import Repository, Work, DOIReference, DiscoveryChain, WorkCitation # Added WorkCitation +from backend.data.models import ( + Repository, + Work, + DOIReference, + DiscoveryChain, +) # Added WorkCitation from backend.data.repositories import WorkRepository, DOIReferenceRepository -from backend.external import OpenAlexClient, ApiClientError +from backend.external import OpenAlexClient from backend.utils import doi_utils # Import other services and helpers 
from .base_service import BaseService from .discovery_chain_service import DiscoveryChainService from .scholarly_processing_service import ScholarlyProcessingService + # Import SessionLocal for creating isolated sessions in specific failure handling scenarios from backend.data.database import SessionLocal logger = logging.getLogger(__name__) + class DOIProcessingService(BaseService): """ Service for processing DOIs discovered in source files. @@ -60,7 +65,9 @@ def __init__(self): self.openalex_client = OpenAlexClient() self.discovery_chain_service = DiscoveryChainService() self.scholarly_processor = ScholarlyProcessingService() - self.logger.debug(f"{self.__class__.__name__} initialized with its own service instances.") + self.logger.debug( + f"{self.__class__.__name__} initialized with its own service instances." + ) def _get_id_from_oa_url(self, url: Optional[str]) -> Optional[str]: """ @@ -77,40 +84,54 @@ def _get_id_from_oa_url(self, url: Optional[str]) -> Optional[str]: The extracted identifier string, or None if parsing fails or the URL format is unrecognized/invalid. 
""" - if not url or not isinstance(url, str): return None + if not url or not isinstance(url, str): + return None try: id_part: Optional[str] = None # Extract based on URL prefix if url.startswith("https://orcid.org/"): - match = re.search(r'(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])', url) - id_part = match.group(1) if match else None + match = re.search(r"(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])", url) + id_part = match.group(1) if match else None elif url.startswith("https://ror.org/"): - id_part = url.split('/')[-1] + id_part = url.split("/")[-1] elif url.startswith("https://openalex.org/"): - id_part = url.split('/')[-1] + id_part = url.split("/")[-1] elif url.startswith("https://doi.org/"): - # Return the DOI itself, normalized (without the prefix) - id_part = url[len("https://doi.org/"):] + # Return the DOI itself, normalized (without the prefix) + id_part = url[len("https://doi.org/") :] # Check for bare OpenAlex ID pattern (e.g., W123, A456, I789) elif url and url[0].isalpha() and url[1:].isdigit(): id_part = url else: - id_part = None # Unrecognized format + id_part = None # Unrecognized format # Basic validation of extracted ID format (can be extended) is_valid = False if id_part: - if url.startswith("https://openalex.org/") and id_part[0].isalpha() and id_part[1:].isdigit(): is_valid = True - elif url.startswith("https://orcid.org/") and match: is_valid = True - elif url.startswith("https://ror.org/") and id_part.startswith('0') and len(id_part) == 9: is_valid = True - elif url.startswith("https://doi.org/"): is_valid = True # Assume valid DOI string if extracted - elif id_part == url and url[0].isalpha() and url[1:].isdigit(): is_valid = True # Valid bare OA ID + if ( + url.startswith("https://openalex.org/") + and id_part[0].isalpha() + and id_part[1:].isdigit() + ): + is_valid = True + elif url.startswith("https://orcid.org/") and match: + is_valid = True + elif ( + url.startswith("https://ror.org/") + and id_part.startswith("0") + and len(id_part) == 9 + ): + is_valid 
= True + elif url.startswith("https://doi.org/"): + is_valid = True # Assume valid DOI string if extracted + elif id_part == url and url[0].isalpha() and url[1:].isdigit(): + is_valid = True # Valid bare OA ID return id_part if is_valid else None except Exception as e: - # Log parsing errors but don't crash the whole process - logger.error(f"Error parsing ID/URL {url}: {e}", exc_info=False) + # Log parsing errors but don't crash the whole process + logger.error(f"Error parsing ID/URL {url}: {e}", exc_info=False) return None def extract_resolve_and_store_dois( @@ -119,7 +140,7 @@ def extract_resolve_and_store_dois( parent_chain: DiscoveryChain, repository: Repository, file_content: Optional[str], - source_file: str + source_file: str, ) -> None: """ Orchestrates the main DOI processing workflow for a given file's content. @@ -145,7 +166,9 @@ def extract_resolve_and_store_dois( and background tasks are enqueued. """ if not file_content: - logger.debug(f"No file content provided for {source_file} in repo {repository.id}. Skipping DOI extraction.") + logger.debug( + f"No file content provided for {source_file} in repo {repository.id}. Skipping DOI extraction." + ) return try: @@ -153,20 +176,29 @@ def extract_resolve_and_store_dois( extracted_dois = doi_utils.extract_dois_from_text(file_content) except Exception as e: # Log critical error during extraction phase - logger.error(f"Error extracting DOIs from {source_file} for repo {repository.id}: {e}", exc_info=True) - raise # Re-raise to indicate failure at this stage + logger.error( + f"Error extracting DOIs from {source_file} for repo {repository.id}: {e}", + exc_info=True, + ) + raise # Re-raise to indicate failure at this stage if not extracted_dois: - logger.debug(f"No potential DOIs found in {source_file} for repo {repository.id}.") + logger.debug( + f"No potential DOIs found in {source_file} for repo {repository.id}." 
+ ) return - logger.info(f"Found {len(extracted_dois)} unique potential DOIs in {source_file} for repo {repository.id}.") + logger.info( + f"Found {len(extracted_dois)} unique potential DOIs in {source_file} for repo {repository.id}." + ) # Instantiate repositories using the provided session work_repo = WorkRepository(db) doi_ref_repo = DOIReferenceRepository(db) - any_doi_failed = False # Track if any DOI within the file failed processing - TASK_DELAY_SECONDS = 5 # Delay before background tasks start (allows commit propagation) + any_doi_failed = False # Track if any DOI within the file failed processing + TASK_DELAY_SECONDS = ( + 5 # Delay before background tasks start (allows commit propagation) + ) # Process each extracted DOI individually for doi in extracted_dois: @@ -179,12 +211,22 @@ def extract_resolve_and_store_dois( doi_ref_chain: Optional[DiscoveryChain] = None resolved_work: Optional[Work] = None work_chain: Optional[DiscoveryChain] = None - referenced_oa_ids: List[str] = [] # OpenAlex IDs of works cited by this DOI's work - related_oa_ids: List[str] = [] # OpenAlex IDs of works related to this DOI's work - cited_by_url_for_tasks: Optional[str] = None # URL to fetch citing works from OpenAlex - doi_reference_id: Optional[int] = None # DB ID of the created DOIReference - primary_work_oa_id_for_tasks: Optional[str] = None # OpenAlex ID of the resolved work - commit_main_transaction_successful = False # Flag to control task enqueueing + referenced_oa_ids: List[ + str + ] = [] # OpenAlex IDs of works cited by this DOI's work + related_oa_ids: List[ + str + ] = [] # OpenAlex IDs of works related to this DOI's work + cited_by_url_for_tasks: Optional[str] = ( + None # URL to fetch citing works from OpenAlex + ) + doi_reference_id: Optional[int] = None # DB ID of the created DOIReference + primary_work_oa_id_for_tasks: Optional[str] = ( + None # OpenAlex ID of the resolved work + ) + commit_main_transaction_successful = ( + False # Flag to control task 
enqueueing + ) try: # --- 1. Check if this exact DOIReference already exists --- @@ -193,46 +235,67 @@ def extract_resolve_and_store_dois( repository_id=repository.id, doi=doi, source_file=source_file ) if existing_ref: - self.logger.debug(f"DOI Loop: DOIReference exists for '{doi}' in {source_file}, committing savepoint and skipping.") - nested_transaction.commit() # Commit the savepoint (effectively does nothing if no changes) - continue # Move to the next DOI in the file + self.logger.debug( + f"DOI Loop: DOIReference exists for '{doi}' in {source_file}, committing savepoint and skipping." + ) + nested_transaction.commit() # Commit the savepoint (effectively does nothing if no changes) + continue # Move to the next DOI in the file # --- 2. Create Discovery Chain for this DOI Reference --- # Tracks the discovery of this specific DOI mention. doi_ref_chain = self.discovery_chain_service.create_child_chain( db=db, - parent_chain=parent_chain, # Linked to the file processing chain - discovery_type='REL_DOI_REFERENCE', - parameters={'repository_id': repository.id, 'source_file': source_file, 'doi': doi} + parent_chain=parent_chain, # Linked to the file processing chain + discovery_type="REL_DOI_REFERENCE", + parameters={ + "repository_id": repository.id, + "source_file": source_file, + "doi": doi, + }, ) self.discovery_chain_service.start_chain(db, doi_ref_chain) - logger.debug(f"DOI Loop: Created DOI ref chain {doi_ref_chain.id} for '{doi}'.") + logger.debug( + f"DOI Loop: Created DOI ref chain {doi_ref_chain.id} for '{doi}'." + ) # --- 3. Resolve DOI via OpenAlex --- # Attempt to find the corresponding scholarly Work using the DOI. work_data = self.openalex_client.resolve_doi_to_work(doi) - logger.debug(f"DOI Loop: OpenAlex resolution result for '{doi}': {'Data found' if work_data else 'Not found (None)'}") + logger.debug( + f"DOI Loop: OpenAlex resolution result for '{doi}': {'Data found' if work_data else 'Not found (None)'}" + ) # --- 4. 
Process Resolved Work (if found) --- if work_data: # Prepare data for creating/updating the Work record work_input_data = { "openalex_id": self._get_id_from_oa_url(work_data.get("id")), - "doi": self._get_id_from_oa_url(work_data.get("doi")), # Normalize DOI + "doi": self._get_id_from_oa_url( + work_data.get("doi") + ), # Normalize DOI "title": work_data.get("title"), "publication_year": work_data.get("publication_year"), "type": work_data.get("type"), "cited_by_count": work_data.get("cited_by_count"), - "host_venue_display_name": work_data.get("host_venue", {}).get("display_name"), - "openalex_url": work_data.get("id") + "host_venue_display_name": work_data.get("host_venue", {}).get( + "display_name" + ), + "openalex_url": work_data.get("id"), } # Remove keys with None values to avoid overriding existing data with None - work_input_data = {k: v for k, v in work_input_data.items() if v is not None} + work_input_data = { + k: v for k, v in work_input_data.items() if v is not None + } # Validate essential identifiers obtained from OpenAlex - if "doi" not in work_input_data or "openalex_id" not in work_input_data: + if ( + "doi" not in work_input_data + or "openalex_id" not in work_input_data + ): # This indicates an issue with the OpenAlex data or parsing - raise ValueError(f"Missing essential info (DOI/OA ID) for Work from DOI {doi}") + raise ValueError( + f"Missing essential info (DOI/OA ID) for Work from DOI {doi}" + ) # --- 4a. Get or Create Work Record --- # Finds existing Work by DOI or creates a new one. @@ -241,18 +304,25 @@ def extract_resolve_and_store_dois( ) # Store the OpenAlex ID for potential background task arguments primary_work_oa_id_for_tasks = resolved_work.openalex_id - logger.debug(f"DOI Loop: Got/Created Work ID {resolved_work.id}, OA_ID '{primary_work_oa_id_for_tasks}' for DOI '{doi}'.") + logger.debug( + f"DOI Loop: Got/Created Work ID {resolved_work.id}, OA_ID '{primary_work_oa_id_for_tasks}' for DOI '{doi}'." + ) # --- 4b. 
Create Work Discovery Chain --- # Tracks the discovery of this Work specifically from this DOI. work_chain = self.discovery_chain_service.create_child_chain( db=db, - parent_chain=doi_ref_chain, # Linked to the DOI reference chain - discovery_type='REL_WORK_FROM_DOI', - parameters={'doi': doi, 'openalex_id': resolved_work.openalex_id} + parent_chain=doi_ref_chain, # Linked to the DOI reference chain + discovery_type="REL_WORK_FROM_DOI", + parameters={ + "doi": doi, + "openalex_id": resolved_work.openalex_id, + }, ) # Link the Work record to its discovery chain - self.discovery_chain_service.associate_entity(db, work_chain, resolved_work, is_direct=True) + self.discovery_chain_service.associate_entity( + db, work_chain, resolved_work, is_direct=True + ) # --- 4c. Fetch Full Details & Process Scholarly Data --- # If the work was successfully resolved, fetch and process its detailed metadata. @@ -260,54 +330,77 @@ def extract_resolve_and_store_dois( full_work_data = None try: # Retrieve comprehensive data including authorships, topics, etc. - full_work_data = self.openalex_client.get_work_details(resolved_work.openalex_id) + full_work_data = self.openalex_client.get_work_details( + resolved_work.openalex_id + ) except Exception as fetch_err: # Log error but don't necessarily fail the entire DOI processing - logger.error(f"Error fetching full details for Work OA ID {resolved_work.openalex_id}: {fetch_err}", exc_info=True) + logger.error( + f"Error fetching full details for Work OA ID {resolved_work.openalex_id}: {fetch_err}", + exc_info=True, + ) if full_work_data: - logger.debug(f"DOI Loop: Processing scholarly data for Work ID {resolved_work.id}...") + logger.debug( + f"DOI Loop: Processing scholarly data for Work ID {resolved_work.id}..." + ) try: # Delegate detailed processing (authors, institutions, topics, citations) # This returns IDs needed for background task enqueueing. 
- referenced_oa_ids, related_oa_ids, cited_by_url_for_tasks = \ - self.scholarly_processor.process_openalex_work_data( - db=db, - work_db=resolved_work, - work_api_data=full_work_data, - parent_chain=work_chain # Pass the specific work chain - ) - logger.debug(f"DOI Loop: Scholarly processing returned: Refs={len(referenced_oa_ids)}, Related={len(related_oa_ids)}, CitedByURL={'Present' if cited_by_url_for_tasks else 'Absent'}") + ( + referenced_oa_ids, + related_oa_ids, + cited_by_url_for_tasks, + ) = self.scholarly_processor.process_openalex_work_data( + db=db, + work_db=resolved_work, + work_api_data=full_work_data, + parent_chain=work_chain, # Pass the specific work chain + ) + logger.debug( + f"DOI Loop: Scholarly processing returned: Refs={len(referenced_oa_ids)}, Related={len(related_oa_ids)}, CitedByURL={'Present' if cited_by_url_for_tasks else 'Absent'}" + ) except Exception as scholarly_err: # Log error during detailed processing, but allow the DOI reference to be saved - logger.error(f"Error during scholarly processing for Work OA ID {resolved_work.openalex_id}: {scholarly_err}", exc_info=True) + logger.error( + f"Error during scholarly processing for Work OA ID {resolved_work.openalex_id}: {scholarly_err}", + exc_info=True, + ) # Potentially mark the work_chain as failed or partial here? else: - logger.warning(f"DOI Loop: Could not fetch full details for Work ID {resolved_work.id}. Skipping detailed scholarly processing.") + logger.warning( + f"DOI Loop: Could not fetch full details for Work ID {resolved_work.id}. Skipping detailed scholarly processing." 
+ ) # Complete the work discovery chain (regardless of detailed processing outcome) if work_chain: self.discovery_chain_service.complete_chain(db, work_chain) else: - # Case where the DOI did not resolve to a known Work in OpenAlex - logger.info(f"DOI Loop: DOI '{doi}' did not resolve via OpenAlex.") + # Case where the DOI did not resolve to a known Work in OpenAlex + logger.info(f"DOI Loop: DOI '{doi}' did not resolve via OpenAlex.") # --- 5. Create DOI Reference Record --- # Link the Repository, source file, and the resolved Work (if any) doi_ref_input_data = { "repository_id": repository.id, "doi": doi, - "work_id": resolved_work.id if resolved_work else None, # Link to Work if resolved - "source_file": source_file + "work_id": resolved_work.id + if resolved_work + else None, # Link to Work if resolved + "source_file": source_file, } doi_reference = DOIReference(**doi_ref_input_data) db.add(doi_reference) - db.flush() # Flush to get the doi_reference.id assigned by the database + db.flush() # Flush to get the doi_reference.id assigned by the database doi_reference_id = doi_reference.id - logger.debug(f"DOI Loop: Created DOIReference ID {doi_reference_id} for '{doi}'.") + logger.debug( + f"DOI Loop: Created DOIReference ID {doi_reference_id} for '{doi}'." + ) # Associate the DOIReference record with its discovery chain - self.discovery_chain_service.associate_entity(db, doi_ref_chain, doi_reference, is_direct=True) + self.discovery_chain_service.associate_entity( + db, doi_ref_chain, doi_reference, is_direct=True + ) # --- 6. 
Finalize DOI Reference Chain Status --- if resolved_work: @@ -315,26 +408,39 @@ def extract_resolve_and_store_dois( self.discovery_chain_service.complete_chain(db, doi_ref_chain) else: # Mark as failed if the DOI could not be resolved - self.discovery_chain_service.fail_chain(db, doi_ref_chain, error_message="DOI not resolved in OpenAlex") + self.discovery_chain_service.fail_chain( + db, doi_ref_chain, error_message="DOI not resolved in OpenAlex" + ) # --- 7. Commit Savepoint --- # Persist changes made within this loop for this specific DOI. - logger.debug(f"DOI Loop: Attempting commit for savepoint related to DOI '{doi}'...") + logger.debug( + f"DOI Loop: Attempting commit for savepoint related to DOI '{doi}'..." + ) nested_transaction.commit() - logger.info(f"DOI Loop: Successfully committed savepoint for DOI '{doi}' (Ref ID: {doi_reference_id}).") + logger.info( + f"DOI Loop: Successfully committed savepoint for DOI '{doi}' (Ref ID: {doi_reference_id})." + ) # --- 8. Commit Main Transaction (IMPORTANT!) --- # Before enqueueing background tasks, commit the main transaction # to ensure the created Work, DOIReference, etc., are visible to the tasks. try: db.commit() - commit_main_transaction_successful = True # Mark success - logger.info(f"DOI Loop: Committed main transaction after processing DOI '{doi}' before enqueueing.") + commit_main_transaction_successful = True # Mark success + logger.info( + f"DOI Loop: Committed main transaction after processing DOI '{doi}' before enqueueing." + ) except Exception as main_commit_err: # This is a critical failure; the state might be inconsistent. 
- logger.error(f"DOI Loop: FAILED to commit main transaction for DOI '{doi}': {main_commit_err}", exc_info=True) - db.rollback() # Roll back the entire transaction for safety - primary_work_oa_id_for_tasks = None # Prevent enqueueing based on failed commit + logger.error( + f"DOI Loop: FAILED to commit main transaction for DOI '{doi}': {main_commit_err}", + exc_info=True, + ) + db.rollback() # Roll back the entire transaction for safety + primary_work_oa_id_for_tasks = ( + None # Prevent enqueueing based on failed commit + ) any_doi_failed = True # Attempt to mark the related discovery chain as failed using a separate session # This is best-effort as the primary transaction failed. @@ -343,54 +449,90 @@ def extract_resolve_and_store_dois( # Use a new, independent session for this update temp_db = SessionLocal() try: - chain_to_fail = self.discovery_chain_service.get_by_uuid(temp_db, doi_ref_chain.id) + chain_to_fail = ( + self.discovery_chain_service.get_by_uuid( + temp_db, doi_ref_chain.id + ) + ) if chain_to_fail: - self.discovery_chain_service.fail_chain(temp_db, chain_to_fail, error_message=f"Main commit failed: {str(main_commit_err)[:100]}") - temp_db.commit() # Commit this specific status update - logger.info(f"Marked DOI Ref Chain {chain_to_fail.id} as FAILED after main commit failure.") + self.discovery_chain_service.fail_chain( + temp_db, + chain_to_fail, + error_message=f"Main commit failed: {str(main_commit_err)[:100]}", + ) + temp_db.commit() # Commit this specific status update + logger.info( + f"Marked DOI Ref Chain {chain_to_fail.id} as FAILED after main commit failure." + ) else: - logger.error(f"Could not find DOI Ref Chain {doi_ref_chain.id} to mark as failed after main commit failure.") + logger.error( + f"Could not find DOI Ref Chain {doi_ref_chain.id} to mark as failed after main commit failure." 
+ ) except Exception as fail_e: - logger.error(f"Failed to mark DOI Ref Chain {doi_ref_chain.id} as FAILED after main commit failure: {fail_e}") + logger.error( + f"Failed to mark DOI Ref Chain {doi_ref_chain.id} as FAILED after main commit failure: {fail_e}" + ) temp_db.rollback() finally: temp_db.close() except Exception as session_err: - logger.error(f"Failed to create temp session for failure update: {session_err}") + logger.error( + f"Failed to create temp session for failure update: {session_err}" + ) # --- Error Handling for Single DOI Processing (within the loop) --- except Exception as e: - any_doi_failed = True - logger.error(f"DOI Loop: FAILED processing DOI '{doi}' from {source_file} (before main commit attempt). Rolling back savepoint. Error: {e}", exc_info=True) - try: - # Roll back only the changes made since the last savepoint (for this DOI) - nested_transaction.rollback() - except Exception as rb_err: - logger.error(f"Error rolling back savepoint for failed DOI {doi}: {rb_err}", exc_info=True) - - # Attempt to mark the discovery chain as failed (best-effort) - if doi_ref_chain and doi_ref_chain.id: - try: - # Use a new, independent session - temp_db = SessionLocal() - try: - chain_to_fail = self.discovery_chain_service.get_by_uuid(temp_db, doi_ref_chain.id) - if chain_to_fail: - self.discovery_chain_service.fail_chain(temp_db, chain_to_fail, error_message=f"Savepoint rolled back: {str(e)[:100]}") - temp_db.commit() - logger.info(f"Marked DOI Ref Chain {chain_to_fail.id} as FAILED after rollback.") - else: - logger.error(f"Could not re-fetch DOI Ref Chain {doi_ref_chain.id} to mark as failed after rollback.") - except Exception as fail_e: - logger.error(f"Failed to mark DOI Ref Chain {doi_ref_chain.id} as FAILED after rollback: {fail_e}") - temp_db.rollback() - finally: - temp_db.close() - except Exception as session_err: - logger.error(f"Failed to create temp session for failure update after rollback: {session_err}") - - # Prevent task enqueueing 
if the initial processing within the savepoint failed - primary_work_oa_id_for_tasks = None + any_doi_failed = True + logger.error( + f"DOI Loop: FAILED processing DOI '{doi}' from {source_file} (before main commit attempt). Rolling back savepoint. Error: {e}", + exc_info=True, + ) + try: + # Roll back only the changes made since the last savepoint (for this DOI) + nested_transaction.rollback() + except Exception as rb_err: + logger.error( + f"Error rolling back savepoint for failed DOI {doi}: {rb_err}", + exc_info=True, + ) + + # Attempt to mark the discovery chain as failed (best-effort) + if doi_ref_chain and doi_ref_chain.id: + try: + # Use a new, independent session + temp_db = SessionLocal() + try: + chain_to_fail = self.discovery_chain_service.get_by_uuid( + temp_db, doi_ref_chain.id + ) + if chain_to_fail: + self.discovery_chain_service.fail_chain( + temp_db, + chain_to_fail, + error_message=f"Savepoint rolled back: {str(e)[:100]}", + ) + temp_db.commit() + logger.info( + f"Marked DOI Ref Chain {chain_to_fail.id} as FAILED after rollback." + ) + else: + logger.error( + f"Could not re-fetch DOI Ref Chain {doi_ref_chain.id} to mark as failed after rollback." + ) + except Exception as fail_e: + logger.error( + f"Failed to mark DOI Ref Chain {doi_ref_chain.id} as FAILED after rollback: {fail_e}" + ) + temp_db.rollback() + finally: + temp_db.close() + except Exception as session_err: + logger.error( + f"Failed to create temp session for failure update after rollback: {session_err}" + ) + + # Prevent task enqueueing if the initial processing within the savepoint failed + primary_work_oa_id_for_tasks = None # --- 9. 
Background Task Enqueueing --- # Only proceed if the main transaction for this DOI was committed successfully @@ -403,7 +545,10 @@ def extract_resolve_and_store_dois( ) # Import task functions locally to avoid potential circular dependencies at module level - from backend.tasks.scholarly_tasks import process_work_deeply_task, process_citing_works_list_task + from backend.tasks.scholarly_tasks import ( + process_work_deeply_task, + process_citing_works_list_task, + ) # --- Enqueue Task 1: Process Citing Works --- # If OpenAlex provided a URL to fetch works citing the primary work. @@ -411,24 +556,32 @@ def extract_resolve_and_store_dois( try: process_citing_works_list_task.apply_async( args=[ - primary_work_oa_id_for_tasks, # The work being cited (W1) - cited_by_url_for_tasks, # API endpoint to get citing works (Wc) - doi_reference_id # Link back to the original DOI discovery context + primary_work_oa_id_for_tasks, # The work being cited (W1) + cited_by_url_for_tasks, # API endpoint to get citing works (Wc) + doi_reference_id, # Link back to the original DOI discovery context ], - countdown=TASK_DELAY_SECONDS # Delay execution slightly + countdown=TASK_DELAY_SECONDS, # Delay execution slightly + ) + logger.debug( + f"DOI Loop: Enqueued citing works task for {primary_work_oa_id_for_tasks}." ) - logger.debug(f"DOI Loop: Enqueued citing works task for {primary_work_oa_id_for_tasks}.") except Exception as enqueue_err_citing: - logger.error(f"DOI Loop: Failed enqueueing citing works task for {primary_work_oa_id_for_tasks}: {enqueue_err_citing}") + logger.error( + f"DOI Loop: Failed enqueueing citing works task for {primary_work_oa_id_for_tasks}: {enqueue_err_citing}" + ) else: - logger.debug(f"DOI Loop: No cited_by_api_url for {primary_work_oa_id_for_tasks}, skipping citing task.") + logger.debug( + f"DOI Loop: No cited_by_api_url for {primary_work_oa_id_for_tasks}, skipping citing task." 
+ ) # --- Enqueue Task 2: Process Referenced Works --- # If the primary work references other works. if referenced_oa_ids: # Initialize the list of visited nodes for cycle detection in the task initial_visited_list: List[str] = [primary_work_oa_id_for_tasks] - logger.info(f"DOI Loop: Enqueueing deep processing for {len(referenced_oa_ids)} referenced works (W1 cites Wr)...") + logger.info( + f"DOI Loop: Enqueueing deep processing for {len(referenced_oa_ids)} referenced works (W1 cites Wr)..." + ) for ref_oa_id in referenced_oa_ids: # Avoid enqueueing a task for the work to process itself (self-citation handled within task) # Also ensure the referenced ID is valid. @@ -436,20 +589,26 @@ def extract_resolve_and_store_dois( try: process_work_deeply_task.apply_async( args=[ - ref_oa_id, # The work to process deeply (Wr) + ref_oa_id, # The work to process deeply (Wr) primary_work_oa_id_for_tasks, # The citing work (W1) - 'citation', # Relationship type: W1 -> Wr - doi_reference_id, # Link back to original context - 1, # Initial depth for this branch - initial_visited_list # Pass initial visited list + "citation", # Relationship type: W1 -> Wr + doi_reference_id, # Link back to original context + 1, # Initial depth for this branch + initial_visited_list, # Pass initial visited list ], - countdown=TASK_DELAY_SECONDS + countdown=TASK_DELAY_SECONDS, + ) + logger.debug( + f"DOI Loop: Enqueued referenced work task: {ref_oa_id} from {primary_work_oa_id_for_tasks}" ) - logger.debug(f"DOI Loop: Enqueued referenced work task: {ref_oa_id} from {primary_work_oa_id_for_tasks}") except Exception as enqueue_err_ref: - logger.error(f"DOI Loop: Failed to enqueue referenced work {ref_oa_id}: {enqueue_err_ref}") + logger.error( + f"DOI Loop: Failed to enqueue referenced work {ref_oa_id}: {enqueue_err_ref}" + ) else: - logger.debug(f"DOI Loop: No referenced works to enqueue for {primary_work_oa_id_for_tasks}.") + logger.debug( + f"DOI Loop: No referenced works to enqueue for 
{primary_work_oa_id_for_tasks}." + ) # Optional: Enqueue tasks for related works if needed (currently not standard) # if related_oa_ids: @@ -457,11 +616,17 @@ def extract_resolve_and_store_dois( # # ... similar enqueue logic using 'relation' type ... elif not commit_main_transaction_successful: - logger.warning(f"DOI Loop: Skipping task enqueueing for DOI '{doi}' due to main transaction commit failure.") + logger.warning( + f"DOI Loop: Skipping task enqueueing for DOI '{doi}' due to main transaction commit failure." + ) elif not primary_work_oa_id_for_tasks: # Handles cases where DOI didn't resolve or essential info was missing - logger.info(f"DOI Loop: Skipping task enqueueing for DOI '{doi}' as primary work OA ID was not resolved/set.") + logger.info( + f"DOI Loop: Skipping task enqueueing for DOI '{doi}' as primary work OA ID was not resolved/set." + ) # --- End Task Enqueueing Section --- # --- End of loop for processing individual DOIs --- - logger.info(f"DOI Processing END for: Repo {repository.id}, File {source_file}. Any DOI failures: {any_doi_failed}") \ No newline at end of file + logger.info( + f"DOI Processing END for: Repo {repository.id}, File {source_file}. 
Any DOI failures: {any_doi_failed}" + ) diff --git a/backend/services/ingestion_service.py b/backend/services/ingestion_service.py index f7950ee..a3af0b1 100644 --- a/backend/services/ingestion_service.py +++ b/backend/services/ingestion_service.py @@ -10,11 +10,9 @@ import re import json from typing import Optional, List, Dict, Any, Tuple -import uuid from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError, IntegrityError -from fastapi import HTTPException, status # Import utilities and clients from backend.utils import github_utils @@ -22,12 +20,13 @@ # Import models from backend.data.models import ( - Repository, Owner, Contributor, DiscoveryChain, RepositoryContributorAssociation, - SoftwareDependency, - # --- ADDED MODELS --- - Issue, PullRequest, IssueComment, PRReviewComment + Repository, + Owner, + DiscoveryChain, + RepositoryContributorAssociation, # --- END ADDED --- ) + # Import repositories from backend.data.repositories import ( RepositoryRepository, @@ -38,7 +37,7 @@ IssueRepository, PullRequestRepository, IssueCommentRepository, - PRReviewCommentRepository + PRReviewCommentRepository, # --- END ADDED --- ) @@ -48,8 +47,8 @@ from .doi_processing_service import DOIProcessingService # Import date/time utilities -from datetime import datetime, timezone, timedelta # Added timedelta -import dateutil.parser # Import dateutil.parser for robust timestamp parsing +from datetime import datetime, timezone # Added timedelta +import dateutil.parser # Import dateutil.parser for robust timestamp parsing # Import SessionLocal for creating isolated sessions in specific failure handling scenarios from backend.data.database import SessionLocal @@ -82,8 +81,11 @@ def _parse_github_timestamp(timestamp_str: Optional[str]) -> Optional[datetime]: # Log a warning if parsing fails, but don't interrupt the process logger.warning(f"Could not parse timestamp string '{timestamp_str}': {e}") return None + + # --- End Helper --- + class 
IngestionService(BaseService): """ Coordinates the ingestion workflow for a single software repository. @@ -104,6 +106,7 @@ class IngestionService(BaseService): - Managing the overall database transaction for a single repository ingestion. - Creating and managing discovery chains to track the ingestion process steps. """ + def __init__(self): """Initializes the IngestionService with its dependencies.""" super().__init__() @@ -112,7 +115,9 @@ def __init__(self): self.discovery_chain_service = DiscoveryChainService() self.doi_processing_service = DOIProcessingService() - def _extract_repo_data_from_github(self, repo_meta: Dict[str, Any]) -> Dict[str, Any]: + def _extract_repo_data_from_github( + self, repo_meta: Dict[str, Any] + ) -> Dict[str, Any]: """ Extracts and transforms relevant fields from the GitHub repository metadata response. @@ -123,10 +128,11 @@ def _extract_repo_data_from_github(self, repo_meta: Dict[str, Any]) -> Dict[str, A dictionary containing structured data suitable for creating or updating a local Repository database record. Includes parsed timestamps, topics, and license info. 
""" - license_data = repo_meta.get("license") # May be None or a dictionary + license_data = repo_meta.get("license") # May be None or a dictionary topics_list = repo_meta.get("topics", []) # Ensure topics is always a list, even if GitHub returns null - if topics_list is None: topics_list = [] + if topics_list is None: + topics_list = [] # Map GitHub API fields to local database model fields return { @@ -136,22 +142,32 @@ def _extract_repo_data_from_github(self, repo_meta: Dict[str, Any]) -> Dict[str, "description": repo_meta.get("description"), "homepage": repo_meta.get("homepage"), "html_url": repo_meta.get("html_url"), - "api_url": repo_meta.get("url"), # GitHub's API URL for the repo + "api_url": repo_meta.get("url"), # GitHub's API URL for the repo "language": repo_meta.get("language"), "default_branch": repo_meta.get("default_branch"), "stargazers_count": repo_meta.get("stargazers_count", 0), - "watchers_count": repo_meta.get("subscribers_count", 0), # Note: 'subscribers_count' often reflects watchers + "watchers_count": repo_meta.get( + "subscribers_count", 0 + ), # Note: 'subscribers_count' often reflects watchers "forks_count": repo_meta.get("forks_count", 0), "open_issues_count": repo_meta.get("open_issues_count", 0), "is_fork": repo_meta.get("fork", False), - "gh_created_at": _parse_github_timestamp(repo_meta.get("created_at")), # Use helper for robust parsing - "gh_updated_at": _parse_github_timestamp(repo_meta.get("updated_at")), # Use helper - "gh_pushed_at": _parse_github_timestamp(repo_meta.get("pushed_at")), # Use helper - "topics": topics_list, # Store the list of topic strings - "license": license_data, # Store the license sub-dictionary or None + "gh_created_at": _parse_github_timestamp( + repo_meta.get("created_at") + ), # Use helper for robust parsing + "gh_updated_at": _parse_github_timestamp( + repo_meta.get("updated_at") + ), # Use helper + "gh_pushed_at": _parse_github_timestamp( + repo_meta.get("pushed_at") + ), # Use helper + "topics": 
topics_list, # Store the list of topic strings + "license": license_data, # Store the license sub-dictionary or None } - def _extract_owner_data_from_github(self, owner_meta: Dict[str, Any]) -> Dict[str, Any]: + def _extract_owner_data_from_github( + self, owner_meta: Dict[str, Any] + ) -> Dict[str, Any]: """ Extracts and transforms relevant fields from the GitHub owner metadata response. @@ -166,65 +182,71 @@ def _extract_owner_data_from_github(self, owner_meta: Dict[str, Any]) -> Dict[st # Map GitHub API fields to local database model fields return { "github_id": owner_meta.get("id"), - "login": owner_meta.get("login"), # User or Org name - "type": owner_meta.get("type"), # e.g., "User", "Organization" + "login": owner_meta.get("login"), # User or Org name + "type": owner_meta.get("type"), # e.g., "User", "Organization" "avatar_url": owner_meta.get("avatar_url"), - "html_url": owner_meta.get("html_url"), # URL to GitHub profile/page - "api_url": owner_meta.get("url"), # GitHub's API URL for the owner + "html_url": owner_meta.get("html_url"), # URL to GitHub profile/page + "api_url": owner_meta.get("url"), # GitHub's API URL for the owner } - def _extract_contributor_data_from_github(self, contrib_meta: Dict[str, Any]) -> Dict[str, Any]: - """ - Extracts and transforms relevant fields from the GitHub contributor list item. + def _extract_contributor_data_from_github( + self, contrib_meta: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Extracts and transforms relevant fields from the GitHub contributor list item. - Args: - contrib_meta: A dictionary representing a single contributor from the - GitHub API's contributors list endpoint response. + Args: + contrib_meta: A dictionary representing a single contributor from the + GitHub API's contributors list endpoint response. - Returns: - A dictionary containing structured data suitable for creating or updating - a local Contributor database record, including their contribution count. 
- """ - # Map GitHub API fields to local database model fields - return { + Returns: + A dictionary containing structured data suitable for creating or updating + a local Contributor database record, including their contribution count. + """ + # Map GitHub API fields to local database model fields + return { "github_id": contrib_meta.get("id"), "login": contrib_meta.get("login"), - "type": contrib_meta.get("type"), # Usually "User" + "type": contrib_meta.get("type"), # Usually "User" "avatar_url": contrib_meta.get("avatar_url"), "html_url": contrib_meta.get("html_url"), "api_url": contrib_meta.get("url"), - "contributions_count": contrib_meta.get("contributions") # Specific to contributor endpoint - } + "contributions_count": contrib_meta.get( + "contributions" + ), # Specific to contributor endpoint + } # --- ADDED HELPER for activity user data --- - def _extract_activity_user_data(self, user_meta: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: - """ - Extracts relevant user fields from GitHub activity items like issues, PRs, or comments. - - This is similar to contributor extraction but uses the 'user' sub-object found - in issue/PR/comment payloads, which might not include contribution counts. - - Args: - user_meta: The dictionary representing the 'user' associated with an - activity item (e.g., issue author, commenter). - - Returns: - A dictionary containing structured data suitable for creating or updating - a local Contributor record (acting as the user/author), or None if input is invalid. - """ - if not user_meta or not isinstance(user_meta, dict): - return None - # Map GitHub API fields to local database model fields - return { + def _extract_activity_user_data( + self, user_meta: Optional[Dict[str, Any]] + ) -> Optional[Dict[str, Any]]: + """ + Extracts relevant user fields from GitHub activity items like issues, PRs, or comments. 
+ + This is similar to contributor extraction but uses the 'user' sub-object found + in issue/PR/comment payloads, which might not include contribution counts. + + Args: + user_meta: The dictionary representing the 'user' associated with an + activity item (e.g., issue author, commenter). + + Returns: + A dictionary containing structured data suitable for creating or updating + a local Contributor record (acting as the user/author), or None if input is invalid. + """ + if not user_meta or not isinstance(user_meta, dict): + return None + # Map GitHub API fields to local database model fields + return { "github_id": user_meta.get("id"), "login": user_meta.get("login"), - "type": user_meta.get("type"), # Usually 'User' or potentially 'Bot' + "type": user_meta.get("type"), # Usually 'User' or potentially 'Bot' "avatar_url": user_meta.get("avatar_url"), "html_url": user_meta.get("html_url"), - "api_url": user_meta.get("url"), # User-specific API URL - } - # --- END ADDED HELPER --- + "api_url": user_meta.get("url"), # User-specific API URL + } + # --- END ADDED HELPER --- def _parse_requirements_txt(self, content: str) -> List[Tuple[str, Optional[str]]]: """ @@ -244,27 +266,33 @@ def _parse_requirements_txt(self, content: str) -> List[Tuple[str, Optional[str] # Regex to capture the package name (group 1) at the start of a line, # optionally followed by version specifiers, ignoring comments. # Allows letters, numbers, underscore, dot, hyphen in package names. 
- pattern = re.compile(r"^\s*([a-zA-Z0-9_.-]+)\s*(?:[!=<>~]=?.*)?(?=\s*(?:#.*)?$)") + pattern = re.compile( + r"^\s*([a-zA-Z0-9_.-]+)\s*(?:[!=<>~]=?.*)?(?=\s*(?:#.*)?$)" + ) lines = content.splitlines() for line in lines: line = line.strip() # Skip empty lines and lines that are purely comments - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue match = pattern.match(line) if match: - dep_name = match.group(1).lower() # Normalize package name to lowercase + dep_name = match.group(1).lower() # Normalize package name to lowercase # Attempt to find any version constraint part in the original line constraint_match = re.search(r"[!=<>~]=?.*", line) - constraint = constraint_match.group(0).strip() if constraint_match else None + constraint = ( + constraint_match.group(0).strip() if constraint_match else None + ) dependencies.append((dep_name, constraint)) else: - # Log lines that couldn't be parsed by the simple regex - logger.debug(f"Could not parse line in requirements.txt: '{line}'") + # Log lines that couldn't be parsed by the simple regex + logger.debug(f"Could not parse line in requirements.txt: '{line}'") return dependencies - def _parse_package_json(self, content: str) -> List[Tuple[str, Optional[str], bool]]: + def _parse_package_json( + self, content: str + ) -> List[Tuple[str, Optional[str], bool]]: """ Parses package.json content to extract dependencies and devDependencies. 
@@ -282,17 +310,21 @@ def _parse_package_json(self, content: str) -> List[Tuple[str, Optional[str], bo # Process regular dependencies deps = data.get("dependencies", {}) - if isinstance(deps, dict): # Ensure it's a dictionary + if isinstance(deps, dict): # Ensure it's a dictionary for name, version in deps.items(): # Normalize name and store version string - dependencies.append((name.lower(), str(version) if version else None, False)) # is_dev = False + dependencies.append( + (name.lower(), str(version) if version else None, False) + ) # is_dev = False # Process development dependencies dev_deps = data.get("devDependencies", {}) - if isinstance(dev_deps, dict): # Ensure it's a dictionary + if isinstance(dev_deps, dict): # Ensure it's a dictionary for name, version in dev_deps.items(): - # Normalize name and store version string - dependencies.append((name.lower(), str(version) if version else None, True)) # is_dev = True + # Normalize name and store version string + dependencies.append( + (name.lower(), str(version) if version else None, True) + ) # is_dev = True except json.JSONDecodeError: # Log specific error for invalid JSON @@ -308,7 +340,7 @@ def _process_dependencies( repository: Repository, parent_chain: DiscoveryChain, owner_login: str, - repo_name: str + repo_name: str, ) -> bool: """ Fetches and processes common software dependency files (e.g., requirements.txt, package.json). @@ -327,14 +359,19 @@ def _process_dependencies( True if dependency processing completed without fatal errors, False otherwise. Note: Individual file fetch/parse errors are logged but may not cause a False return unless critical. 
""" - dep_chain: Optional[DiscoveryChain] = None # Chain for the overall dependency process - processing_successful = True # Flag to track overall success + dep_chain: Optional[DiscoveryChain] = ( + None # Chain for the overall dependency process + ) + processing_successful = True # Flag to track overall success try: # Create a discovery chain specifically for dependency processing dep_chain = self.discovery_chain_service.create_child_chain( - db=db, parent_chain=parent_chain, discovery_type='PROCESS_DEPENDENCIES', - parameters={'repo_id': repository.id} ) + db=db, + parent_chain=parent_chain, + discovery_type="PROCESS_DEPENDENCIES", + parameters={"repo_id": repository.id}, + ) self.discovery_chain_service.start_chain(db, dep_chain) dep_repo = SoftwareDependencyRepository(db) @@ -345,41 +382,53 @@ def _process_dependencies( # Add other dependency file types here (e.g., pom.xml, Gemfile) } - dependencies_to_add = [] # Accumulate dependency objects before flushing + dependencies_to_add = [] # Accumulate dependency objects before flushing # Iterate through the files we know how to parse for file_path, (dep_type, parser_func) in files_to_check.items(): content: Optional[str] = None - file_chain: Optional[DiscoveryChain] = None # Chain for processing a single file + file_chain: Optional[DiscoveryChain] = ( + None # Chain for processing a single file + ) try: # Create a sub-chain for processing this specific dependency file file_chain = self.discovery_chain_service.create_child_chain( - db=db, parent_chain=dep_chain, - discovery_type=f'PARSE_DEPENDENCY_FILE', - parameters={'file_path': file_path} + db=db, + parent_chain=dep_chain, + discovery_type="PARSE_DEPENDENCY_FILE", + parameters={"file_path": file_path}, ) self.discovery_chain_service.start_chain(db, file_chain) - logger.debug(f"Attempting to fetch dependency file: {owner_login}/{repo_name}/{file_path}") + logger.debug( + f"Attempting to fetch dependency file: {owner_login}/{repo_name}/{file_path}" + ) # Fetch file 
content using the GitHub client - content = self.github_client.get_file_content(owner_login, repo_name, file_path) + content = self.github_client.get_file_content( + owner_login, repo_name, file_path + ) if content: - logger.info(f"Parsing '{file_path}' for {dep_type} dependencies...") + logger.info( + f"Parsing '{file_path}' for {dep_type} dependencies..." + ) # Use the appropriate parser function for the file type parsed_deps = parser_func(content) - logger.info(f"Found {len(parsed_deps)} potential dependencies in {file_path}.") + logger.info( + f"Found {len(parsed_deps)} potential dependencies in {file_path}." + ) # Process each parsed dependency for dep_data in parsed_deps: - is_dev = False # Default for non-npm types + is_dev = False # Default for non-npm types # Unpack data based on parser return type if dep_type == "npm": dep_name, version_constraint, is_dev = dep_data - else: # requirements.txt format + else: # requirements.txt format dep_name, version_constraint = dep_data - if not dep_name: continue # Skip if name is empty/invalid + if not dep_name: + continue # Skip if name is empty/invalid # Prepare data for SoftwareDependency record dependency_input = { @@ -388,91 +437,144 @@ def _process_dependencies( "version_constraint": version_constraint, "source_file": file_path, "dependency_type": dep_type, - "is_dev_dependency": is_dev if dep_type == "npm" else None, # Applicable only to npm + "is_dev_dependency": is_dev + if dep_type == "npm" + else None, # Applicable only to npm } # Get existing or prepare new dependency object (without committing) - dep_db = dep_repo.get_or_create(obj_in_data=dependency_input) - dependencies_to_add.append(dep_db) # Add to list for bulk flush/association + dep_db = dep_repo.get_or_create( + obj_in_data=dependency_input + ) + dependencies_to_add.append( + dep_db + ) # Add to list for bulk flush/association # Mark the file processing chain as complete self.discovery_chain_service.complete_chain(db, file_chain) else: # File 
exists but is empty, or fetch returned None gracefully (e.g., API error handled) - logger.debug(f"Dependency file '{file_path}' not found or empty.") + logger.debug( + f"Dependency file '{file_path}' not found or empty." + ) # Mark the file processing chain as failed due to missing file - self.discovery_chain_service.fail_chain(db, file_chain, error_message="File not found or empty") + self.discovery_chain_service.fail_chain( + db, file_chain, error_message="File not found or empty" + ) except ApiClientError as e: # Handle specific API errors during file fetch if e.status_code == 404: - logger.debug(f"Dependency file not found via API: {owner_login}/{repo_name}/{file_path} (404)") + logger.debug( + f"Dependency file not found via API: {owner_login}/{repo_name}/{file_path} (404)" + ) else: # Log other API errors but potentially continue with other files - logger.error(f"API Error fetching dep file {file_path}: {e}", exc_info=False) - processing_successful = False # Mark overall process as having issues + logger.error( + f"API Error fetching dep file {file_path}: {e}", + exc_info=False, + ) + processing_successful = ( + False # Mark overall process as having issues + ) if file_chain: # Mark file chain as failed due to API error - self.discovery_chain_service.fail_chain(db, file_chain, error_message=f"API Error {e.status_code}") + self.discovery_chain_service.fail_chain( + db, file_chain, error_message=f"API Error {e.status_code}" + ) except Exception as e: # Catch unexpected errors during parsing or processing - logger.error(f"Error processing dependency file {file_path}: {e}", exc_info=True) - processing_successful = False # Mark overall process as having issues + logger.error( + f"Error processing dependency file {file_path}: {e}", + exc_info=True, + ) + processing_successful = ( + False # Mark overall process as having issues + ) if file_chain: - # Mark file chain as failed due to processing error - self.discovery_chain_service.fail_chain(db, file_chain, 
error_message=f"Processing error: {str(e)[:50]}") + # Mark file chain as failed due to processing error + self.discovery_chain_service.fail_chain( + db, + file_chain, + error_message=f"Processing error: {str(e)[:50]}", + ) # --- Flush accumulated dependencies --- # After processing all files, flush the session to assign IDs to new dependencies if dependencies_to_add: - logger.info(f"Flushing {len(dependencies_to_add)} dependency objects...") + logger.info( + f"Flushing {len(dependencies_to_add)} dependency objects..." + ) try: - db.flush() # Persist new/updated dependency records + db.flush() # Persist new/updated dependency records logger.info("Dependency flush successful.") # Now associate the flushed entities with their respective file chains for dep_db in dependencies_to_add: - if dep_db.id: # Check if ID was assigned after flush - # Find the corresponding file processing chain again - # This requires querying based on parameters stored in the chain - # Note: This lookup might be inefficient if parameters are complex. Consider storing file_chain_id temporarily. 
- file_chain_for_assoc = db.query(DiscoveryChain).filter( - DiscoveryChain.parent_chain_id == dep_chain.id, - # Assuming 'file_path' is stored reliably in parameters as text - DiscoveryChain.parameters['file_path'].astext == dep_db.source_file - ).first() - - if file_chain_for_assoc: - # Link the dependency record to the chain for the file it came from - self.discovery_chain_service.associate_entity(db, file_chain_for_assoc, dep_db, is_direct=True) - else: - # Log if the corresponding file chain couldn't be found - logger.warning(f"Could not find file_chain for dependency {dep_db.dependency_name} from {dep_db.source_file} to associate.") - else: - # This indicates a problem with the flush or session state - logger.error(f"Dependency {dep_db.dependency_name} from {dep_db.source_file} missing ID after flush.") - processing_successful = False + if dep_db.id: # Check if ID was assigned after flush + # Find the corresponding file processing chain again + # This requires querying based on parameters stored in the chain + # Note: This lookup might be inefficient if parameters are complex. Consider storing file_chain_id temporarily. + file_chain_for_assoc = ( + db.query(DiscoveryChain) + .filter( + DiscoveryChain.parent_chain_id == dep_chain.id, + # Assuming 'file_path' is stored reliably in parameters as text + DiscoveryChain.parameters["file_path"].astext + == dep_db.source_file, + ) + .first() + ) + + if file_chain_for_assoc: + # Link the dependency record to the chain for the file it came from + self.discovery_chain_service.associate_entity( + db, file_chain_for_assoc, dep_db, is_direct=True + ) + else: + # Log if the corresponding file chain couldn't be found + logger.warning( + f"Could not find file_chain for dependency {dep_db.dependency_name} from {dep_db.source_file} to associate." + ) + else: + # This indicates a problem with the flush or session state + logger.error( + f"Dependency {dep_db.dependency_name} from {dep_db.source_file} missing ID after flush." 
+ ) + processing_successful = False except (IntegrityError, SQLAlchemyError) as flush_err: - # Catch errors during the flush operation itself - logger.error(f"Error during dependency flush: {flush_err}", exc_info=True) - processing_successful = False # Mark overall process as failed + # Catch errors during the flush operation itself + logger.error( + f"Error during dependency flush: {flush_err}", exc_info=True + ) + processing_successful = False # Mark overall process as failed # Finalize the main dependency processing chain based on overall success if processing_successful: self.discovery_chain_service.complete_chain(db, dep_chain) else: - self.discovery_chain_service.fail_chain(db, dep_chain, error_message="One or more errors during dependency processing/flush.") + self.discovery_chain_service.fail_chain( + db, + dep_chain, + error_message="One or more errors during dependency processing/flush.", + ) except Exception as main_dep_err: # Catch errors in the setup phase of dependency processing - logger.error(f"Fatal error during dependency processing setup for repo {repository.id}: {main_dep_err}", exc_info=True) + logger.error( + f"Fatal error during dependency processing setup for repo {repository.id}: {main_dep_err}", + exc_info=True, + ) if dep_chain: # Attempt to mark the main dependency chain as failed - try: self.discovery_chain_service.fail_chain(db, dep_chain, error_message="Fatal setup error") - except Exception: pass # Ignore errors during this final failure handling - return False # Indicate a fatal setup error occurred + try: + self.discovery_chain_service.fail_chain( + db, dep_chain, error_message="Fatal setup error" + ) + except Exception: + pass # Ignore errors during this final failure handling + return False # Indicate a fatal setup error occurred return processing_successful - def ingest_repository_by_url( self, db: Session, repo_url: str ) -> Optional[DiscoveryChain]: @@ -530,8 +632,10 @@ def ingest_repository_by_url( # --- Step 2: Create Root 
Discovery Chain --- # Tracks the overall ingestion process initiated by this URL. root_chain = self.discovery_chain_service.create_root_chain( - db=db, discovery_type='DIRECT_URL', - parameters={'url': repo_url, 'owner': owner_login, 'repo': repo_name} ) + db=db, + discovery_type="DIRECT_URL", + parameters={"url": repo_url, "owner": owner_login, "repo": repo_name}, + ) # Mark the chain as started self.discovery_chain_service.start_chain(db, root_chain) @@ -545,26 +649,42 @@ def ingest_repository_by_url( pr_comment_repo = PRReviewCommentRepository(db) # --- Step 3: Fetch Repository Metadata & Owner --- - self.logger.info(f"Fetching repository metadata for {owner_login}/{repo_name}") - repo_meta = self.github_client.get_repository_metadata(owner_login, repo_name) + self.logger.info( + f"Fetching repository metadata for {owner_login}/{repo_name}" + ) + repo_meta = self.github_client.get_repository_metadata( + owner_login, repo_name + ) # Handle case where repository is not found or API fails if not repo_meta: - raise ValueError(f"Repository {owner_login}/{repo_name} not found or inaccessible via API.") + raise ValueError( + f"Repository {owner_login}/{repo_name} not found or inaccessible via API." + ) owner_meta = repo_meta.get("owner") if not owner_meta or not owner_meta.get("id"): - raise ValueError(f"Could not extract valid owner data for {owner_login}/{repo_name}.") + raise ValueError( + f"Could not extract valid owner data for {owner_login}/{repo_name}." 
+ ) # Process and store Owner owner_data = self._extract_owner_data_from_github(owner_meta) - owner_db = owner_repo.get_or_create_by_github_id(github_id=owner_data["github_id"], obj_in_data=owner_data) + owner_db = owner_repo.get_or_create_by_github_id( + github_id=owner_data["github_id"], obj_in_data=owner_data + ) if not owner_db: # This should ideally not happen with get_or_create logic, but check defensively - raise RuntimeError(f"Failed to get or create Owner object for GH ID {owner_data.get('github_id')}") + raise RuntimeError( + f"Failed to get or create Owner object for GH ID {owner_data.get('github_id')}" + ) try: # Flush early to ensure owner_db gets an ID if newly created - logger.debug(f"Flushing Owner object (GH ID {owner_db.github_id})..."); db.flush(); logger.debug(f"Owner flushed successfully (DB ID: {owner_db.id})") + logger.debug(f"Flushing Owner object (GH ID {owner_db.github_id})...") + db.flush() + logger.debug(f"Owner flushed successfully (DB ID: {owner_db.id})") except (IntegrityError, SQLAlchemyError) as owner_flush_e: - logger.error(f"Error during Owner flush: {owner_flush_e}", exc_info=True) + logger.error( + f"Error during Owner flush: {owner_flush_e}", exc_info=True + ) raise owner_flush_e if owner_db.id is None: # ID should be assigned after flush @@ -573,170 +693,272 @@ def ingest_repository_by_url( # Process and store Repository, linking to the Owner repo_data = self._extract_repo_data_from_github(repo_meta) # Pass the owner_obj to establish the relationship during creation/update - repo_db = repo_repo.get_or_create_by_github_id(github_id=repo_data["github_id"], obj_in_data=repo_data, owner_obj=owner_db) + repo_db = repo_repo.get_or_create_by_github_id( + github_id=repo_data["github_id"], + obj_in_data=repo_data, + owner_obj=owner_db, + ) if not repo_db or repo_db.id is None: # Repository should always have an ID after get_or_create and potential flush - raise RuntimeError("Repository ID not available after get_or_create.") - 
self.logger.info(f"Owner ID: {owner_db.id}, Repo ID: {repo_db.id}, Repo Owner ID field: {repo_db.owner_id} obtained/set.") + raise RuntimeError("Repository ID not available after get_or_create.") + self.logger.info( + f"Owner ID: {owner_db.id}, Repo ID: {repo_db.id}, Repo Owner ID field: {repo_db.owner_id} obtained/set." + ) # Associate the discovered Owner and Repository with the root chain - self.discovery_chain_service.associate_entity(db, root_chain, owner_db, is_direct=False) # Owner is related, not direct result - self.discovery_chain_service.associate_entity(db, root_chain, repo_db, is_direct=True) # Repository is the direct result + self.discovery_chain_service.associate_entity( + db, root_chain, owner_db, is_direct=False + ) # Owner is related, not direct result + self.discovery_chain_service.associate_entity( + db, root_chain, repo_db, is_direct=True + ) # Repository is the direct result # --- Step 4: Fetch Contributors & Store Associations --- - contrib_chain = self.discovery_chain_service.create_child_chain(db, root_chain, 'FETCH_CONTRIBUTORS', {'repo_id': repo_db.id}) + contrib_chain = self.discovery_chain_service.create_child_chain( + db, root_chain, "FETCH_CONTRIBUTORS", {"repo_id": repo_db.id} + ) self.discovery_chain_service.start_chain(db, contrib_chain) - contributors_processed_successfully = True # Track success within this step - contributors_to_add = [] # Accumulate Contributor objects - associations_to_add_or_update = [] # Accumulate association data + contributors_processed_successfully = True # Track success within this step + contributors_to_add = [] # Accumulate Contributor objects + associations_to_add_or_update = [] # Accumulate association data try: # Fetch list of contributors from GitHub API - contributors_meta = self.github_client.get_contributors(owner_login, repo_name) + contributors_meta = self.github_client.get_contributors( + owner_login, repo_name + ) if contributors_meta: - self.logger.info(f"Processing 
{len(contributors_meta)} contributors for {repo_db.full_name}") + self.logger.info( + f"Processing {len(contributors_meta)} contributors for {repo_db.full_name}" + ) for contrib_meta in contributors_meta: - # Basic validation of contributor data from API - if not contrib_meta or not contrib_meta.get("id"): - logger.warning(f"Skipping invalid contributor data: {contrib_meta}") - continue - - # Extract contributor data and contribution count - contrib_data = self._extract_contributor_data_from_github(contrib_meta) - contributions_count = contrib_data.pop("contributions_count", None) # Remove count before passing to repo - - # Get or create the Contributor record - contrib_db = contrib_repo.get_or_create_by_github_id(github_id=contrib_data["github_id"], obj_in_data=contrib_data) - contributors_to_add.append(contrib_db) # Add to list for bulk flush + # Basic validation of contributor data from API + if not contrib_meta or not contrib_meta.get("id"): + logger.warning( + f"Skipping invalid contributor data: {contrib_meta}" + ) + continue - # Prepare data for the association link (Repository <-> Contributor) - associations_to_add_or_update.append({ - "repository_id": repo_db.id, - "contributor": contrib_db, # Keep the object reference - "contributions_count": contributions_count - }) + # Extract contributor data and contribution count + contrib_data = self._extract_contributor_data_from_github( + contrib_meta + ) + contributions_count = contrib_data.pop( + "contributions_count", None + ) # Remove count before passing to repo + + # Get or create the Contributor record + contrib_db = contrib_repo.get_or_create_by_github_id( + github_id=contrib_data["github_id"], + obj_in_data=contrib_data, + ) + contributors_to_add.append( + contrib_db + ) # Add to list for bulk flush + + # Prepare data for the association link (Repository <-> Contributor) + associations_to_add_or_update.append( + { + "repository_id": repo_db.id, + "contributor": contrib_db, # Keep the object reference + 
"contributions_count": contributions_count, + } + ) else: - logger.info(f"No contributors found or returned for {repo_db.full_name}") + logger.info( + f"No contributors found or returned for {repo_db.full_name}" + ) # Flush new/updated Contributor objects to get their IDs if contributors_to_add: - self.logger.info(f"Flushing {len(contributors_to_add)} contributor objects...") - try: - db.flush(); self.logger.info("Contributor flush successful.") - except (IntegrityError, SQLAlchemyError) as contrib_flush_err: - logger.error(f"Error during contributor flush: {contrib_flush_err}", exc_info=True) - contributors_processed_successfully = False # Mark step as failed + self.logger.info( + f"Flushing {len(contributors_to_add)} contributor objects..." + ) + try: + db.flush() + self.logger.info("Contributor flush successful.") + except (IntegrityError, SQLAlchemyError) as contrib_flush_err: + logger.error( + f"Error during contributor flush: {contrib_flush_err}", + exc_info=True, + ) + contributors_processed_successfully = ( + False # Mark step as failed + ) # Process associations only if contributor flush was okay if contributors_processed_successfully: - self.logger.info(f"Processing {len(associations_to_add_or_update)} contributor associations...") + self.logger.info( + f"Processing {len(associations_to_add_or_update)} contributor associations..." 
+ ) for assoc_data in associations_to_add_or_update: contrib_obj = assoc_data["contributor"] # Ensure the contributor object has an ID after the flush if not contrib_obj or contrib_obj.id is None: - logger.error(f"Contributor object missing or has no ID after flush: {contrib_obj}") + logger.error( + f"Contributor object missing or has no ID after flush: {contrib_obj}" + ) contributors_processed_successfully = False continue # Check if association already exists - association = db.query(RepositoryContributorAssociation).filter_by( - repository_id=assoc_data["repository_id"], - contributor_id=contrib_obj.id - ).first() + association = ( + db.query(RepositoryContributorAssociation) + .filter_by( + repository_id=assoc_data["repository_id"], + contributor_id=contrib_obj.id, + ) + .first() + ) if association: # Update contribution count if it changed - if association.contributions_count != assoc_data["contributions_count"]: - association.contributions_count = assoc_data["contributions_count"] - db.add(association) # Mark for update - logger.debug(f"Updated contribution count for Repo {assoc_data['repository_id']} / Contrib {contrib_obj.id} to {assoc_data['contributions_count']}") + if ( + association.contributions_count + != assoc_data["contributions_count"] + ): + association.contributions_count = assoc_data[ + "contributions_count" + ] + db.add(association) # Mark for update + logger.debug( + f"Updated contribution count for Repo {assoc_data['repository_id']} / Contrib {contrib_obj.id} to {assoc_data['contributions_count']}" + ) else: # Create new association record association = RepositoryContributorAssociation( repository_id=assoc_data["repository_id"], contributor_id=contrib_obj.id, - contributions_count=assoc_data["contributions_count"] + contributions_count=assoc_data["contributions_count"], + ) + db.add(association) # Mark for insertion + logger.debug( + f"Prepared new association for Repo {assoc_data['repository_id']} / Contrib {contrib_obj.id} with count 
{assoc_data['contributions_count']}" ) - db.add(association) # Mark for insertion - logger.debug(f"Prepared new association for Repo {assoc_data['repository_id']} / Contrib {contrib_obj.id} with count {assoc_data['contributions_count']}") # Associate the Contributor entity (not the association link) with the contributor chain - self.discovery_chain_service.associate_entity(db, contrib_chain, contrib_obj, is_direct=True) + self.discovery_chain_service.associate_entity( + db, contrib_chain, contrib_obj, is_direct=True + ) # Flush association changes (updates/inserts) if contributors_processed_successfully: - try: - logger.debug("Flushing contributor associations..."); db.flush(); logger.debug("Contributor associations flushed.") - except (IntegrityError, SQLAlchemyError) as assoc_flush_err: - logger.error(f"Error during contributor association flush: {assoc_flush_err}", exc_info=True) - contributors_processed_successfully = False # Mark step as failed + try: + logger.debug("Flushing contributor associations...") + db.flush() + logger.debug("Contributor associations flushed.") + except (IntegrityError, SQLAlchemyError) as assoc_flush_err: + logger.error( + f"Error during contributor association flush: {assoc_flush_err}", + exc_info=True, + ) + contributors_processed_successfully = ( + False # Mark step as failed + ) # Finalize contributor chain status if contributors_processed_successfully: self.discovery_chain_service.complete_chain(db, contrib_chain) else: - self.discovery_chain_service.fail_chain(db, contrib_chain, error_message="One or more errors during contributor/association processing.") + self.discovery_chain_service.fail_chain( + db, + contrib_chain, + error_message="One or more errors during contributor/association processing.", + ) except (ApiClientError, Exception) as contrib_e: # Catch errors during the initial contributor fetch - logger.error(f"Failed fetching contributors list for {repo_db.full_name}: {contrib_e}", exc_info=True) - 
contributors_processed_successfully = False # Mark step as failed + logger.error( + f"Failed fetching contributors list for {repo_db.full_name}: {contrib_e}", + exc_info=True, + ) + contributors_processed_successfully = False # Mark step as failed if contrib_chain: try: # Attempt to mark chain as failed due to fetch error - self.discovery_chain_service.fail_chain(db, contrib_chain, error_message=f"Failed to fetch list: {str(contrib_e)[:100]}") + self.discovery_chain_service.fail_chain( + db, + contrib_chain, + error_message=f"Failed to fetch list: {str(contrib_e)[:100]}", + ) except Exception as chain_fail_err: # Log error during failure handling itself - logger.error(f"Error trying to fail contributor chain {contrib_chain.id} after fetch error: {chain_fail_err}") - + logger.error( + f"Error trying to fail contributor chain {contrib_chain.id} after fetch error: {chain_fail_err}" + ) # --- Step 5: Process Dependencies --- # Delegate dependency file processing to the helper method - self.logger.info(f"Initiating dependency processing for {repo_db.full_name}...") + self.logger.info( + f"Initiating dependency processing for {repo_db.full_name}..." + ) self._process_dependencies( - db=db, repository=repo_db, parent_chain=root_chain, - owner_login=owner_login, repo_name=repo_name + db=db, + repository=repo_db, + parent_chain=root_chain, + owner_login=owner_login, + repo_name=repo_name, + ) + self.logger.info( + f"Dependency processing step finished for {repo_db.full_name}." 
) - self.logger.info(f"Dependency processing step finished for {repo_db.full_name}.") # --- Step 6: Process DOI Files --- # Create chain for DOI processing step files_chain = self.discovery_chain_service.create_child_chain( - db=db, parent_chain=root_chain, discovery_type='PROCESS_DOI_FILES', - parameters={'repo_id': repo_db.id} + db=db, + parent_chain=root_chain, + discovery_type="PROCESS_DOI_FILES", + parameters={"repo_id": repo_db.id}, ) self.discovery_chain_service.start_chain(db, files_chain) # Define common files where DOIs might be found files_to_check = ["README.md", "README", "README.rst", "CITATION.cff"] - files_processed_without_errors = True # Track success within this step + files_processed_without_errors = True # Track success within this step - self.logger.info(f"Processing files {files_to_check} for DOIs in {repo_db.full_name}") + self.logger.info( + f"Processing files {files_to_check} for DOIs in {repo_db.full_name}" + ) for file_path in files_to_check: content: Optional[str] = None try: # Fetch file content - logger.debug(f"Attempting to fetch file: {owner_login}/{repo_name}/{file_path}") - content = self.github_client.get_file_content(owner_login, repo_name, file_path) + logger.debug( + f"Attempting to fetch file: {owner_login}/{repo_name}/{file_path}" + ) + content = self.github_client.get_file_content( + owner_login, repo_name, file_path + ) logger.debug(f"Fetch attempt for {file_path} completed.") except ApiClientError as e: # Handle API errors (e.g., 404 Not Found) gracefully if e.status_code == 404: - logger.debug(f"File not found via API: {owner_login}/{repo_name}/{file_path} (404)") + logger.debug( + f"File not found via API: {owner_login}/{repo_name}/{file_path} (404)" + ) else: # Log other API errors and mark step as having issues - logger.error(f"API Error fetching file {file_path}: {e}", exc_info=False) + logger.error( + f"API Error fetching file {file_path}: {e}", exc_info=False + ) files_processed_without_errors = False - continue # 
Move to the next file + continue # Move to the next file except ValueError as ve: - # Catch potential errors decoding content (if applicable in get_file_content) - logger.error(f"Content processing error for file {file_path}: {ve}", exc_info=True) - files_processed_without_errors = False - continue + # Catch potential errors decoding content (if applicable in get_file_content) + logger.error( + f"Content processing error for file {file_path}: {ve}", + exc_info=True, + ) + files_processed_without_errors = False + continue except Exception as e: - # Catch unexpected errors during file fetch/processing - logger.error(f"Unexpected Error fetching/processing file {file_path}: {e}", exc_info=True) - files_processed_without_errors = False - continue + # Catch unexpected errors during file fetch/processing + logger.error( + f"Unexpected Error fetching/processing file {file_path}: {e}", + exc_info=True, + ) + files_processed_without_errors = False + continue # If content was successfully fetched if content: @@ -745,270 +967,465 @@ def ingest_repository_by_url( # Delegate DOI extraction, resolution, and storage to DOIProcessingService # This service manages its own savepoints and commits internally. 
self.doi_processing_service.extract_resolve_and_store_dois( - db=db, # Pass the main session - parent_chain=files_chain, # Link DOI chains to this file chain + db=db, # Pass the main session + parent_chain=files_chain, # Link DOI chains to this file chain repository=repo_db, file_content=content, - source_file=file_path + source_file=file_path, ) except Exception as doi_proc_e: # Catch errors originating from the DOI service call itself - logger.error(f"Error occurred during DOI processing trigger/setup for file {file_path}: {doi_proc_e}", exc_info=True) + logger.error( + f"Error occurred during DOI processing trigger/setup for file {file_path}: {doi_proc_e}", + exc_info=True, + ) files_processed_without_errors = False else: # Log cases where file was found but empty, or fetch failed gracefully - logger.debug(f"File found but content was empty or fetch failed gracefully (e.g. 404), skipping DOI processing: {file_path}") + logger.debug( + f"File found but content was empty or fetch failed gracefully (e.g. 
404), skipping DOI processing: {file_path}" + ) # Finalize the DOI file processing chain status try: - db.add(files_chain) # Ensure chain is in session + db.add(files_chain) # Ensure chain is in session if files_processed_without_errors: self.discovery_chain_service.complete_chain(db, files_chain) else: - self.discovery_chain_service.fail_chain(db, files_chain, error_message="One or more errors during file/DOI processing trigger.") - db.flush() # Persist chain status update + self.discovery_chain_service.fail_chain( + db, + files_chain, + error_message="One or more errors during file/DOI processing trigger.", + ) + db.flush() # Persist chain status update except Exception as files_chain_update_e: - # Log error if updating the chain status fails - logger.error(f"Failed to update final status for files_chain {files_chain.id}: {files_chain_update_e}") - + # Log error if updating the chain status fails + logger.error( + f"Failed to update final status for files_chain {files_chain.id}: {files_chain_update_e}" + ) # --- Step 7: Process Issues and Comments --- self.logger.info(f"Initiating issue processing for {repo_db.full_name}...") - issues_processed_successfully = True # Track success for this step - issues_chain = self.discovery_chain_service.create_child_chain(db, root_chain, 'FETCH_ISSUES', {'repo_id': repo_db.id}) + issues_processed_successfully = True # Track success for this step + issues_chain = self.discovery_chain_service.create_child_chain( + db, root_chain, "FETCH_ISSUES", {"repo_id": repo_db.id} + ) self.discovery_chain_service.start_chain(db, issues_chain) try: # Fetch issues (potentially paginated by the client) - assumes fetching all states issues_meta = self.github_client.get_issues(owner_login, repo_name) - self.logger.info(f"Fetched {len(issues_meta)} issues for {repo_db.full_name}.") + self.logger.info( + f"Fetched {len(issues_meta)} issues for {repo_db.full_name}." 
+ ) for issue_meta in issues_meta: # Extract key identifiers and user data - issue_gh_id = issue_meta.get('id') - issue_user_data = self._extract_activity_user_data(issue_meta.get('user')) + issue_gh_id = issue_meta.get("id") + issue_user_data = self._extract_activity_user_data( + issue_meta.get("user") + ) # Basic validation - if not issue_gh_id or not issue_user_data or not issue_user_data.get('github_id'): - logger.warning(f"Skipping issue due to missing ID or user data: Issue number {issue_meta.get('number')}") + if ( + not issue_gh_id + or not issue_user_data + or not issue_user_data.get("github_id") + ): + logger.warning( + f"Skipping issue due to missing ID or user data: Issue number {issue_meta.get('number')}" + ) continue - issue_chain: Optional[DiscoveryChain] = None # Chain for processing this single issue + issue_chain: Optional[DiscoveryChain] = ( + None # Chain for processing this single issue + ) try: # Create a sub-chain for this specific issue - issue_chain = self.discovery_chain_service.create_child_chain(db, issues_chain, 'PROCESS_ISSUE', {'issue_gh_id': issue_gh_id}) + issue_chain = self.discovery_chain_service.create_child_chain( + db, + issues_chain, + "PROCESS_ISSUE", + {"issue_gh_id": issue_gh_id}, + ) self.discovery_chain_service.start_chain(db, issue_chain) # Get/Create the author (as a Contributor record) - issue_author_db = contrib_repo.get_or_create_by_github_id(github_id=issue_user_data['github_id'], obj_in_data=issue_user_data) - db.flush() # Ensure author has an ID + issue_author_db = contrib_repo.get_or_create_by_github_id( + github_id=issue_user_data["github_id"], + obj_in_data=issue_user_data, + ) + db.flush() # Ensure author has an ID if issue_author_db.id is None: - raise RuntimeError(f"Issue author Contributor ID is None after flush for GH ID {issue_user_data['github_id']}") + raise RuntimeError( + f"Issue author Contributor ID is None after flush for GH ID {issue_user_data['github_id']}" + ) # Associate author with the issue 
chain (indirect discovery) - self.discovery_chain_service.associate_entity(db, issue_chain, issue_author_db, is_direct=False) + self.discovery_chain_service.associate_entity( + db, issue_chain, issue_author_db, is_direct=False + ) # Prepare data for the Issue record issue_input = { "github_id": issue_gh_id, "repository_id": repo_db.id, - "user_id": issue_author_db.id, # Link to Contributor record - "number": issue_meta.get('number'), - "title": issue_meta.get('title'), - "state": issue_meta.get('state'), # e.g., 'open', 'closed' - "gh_created_at": _parse_github_timestamp(issue_meta.get("created_at")), - "gh_updated_at": _parse_github_timestamp(issue_meta.get("updated_at")), - "gh_closed_at": _parse_github_timestamp(issue_meta.get("closed_at")), + "user_id": issue_author_db.id, # Link to Contributor record + "number": issue_meta.get("number"), + "title": issue_meta.get("title"), + "state": issue_meta.get("state"), # e.g., 'open', 'closed' + "gh_created_at": _parse_github_timestamp( + issue_meta.get("created_at") + ), + "gh_updated_at": _parse_github_timestamp( + issue_meta.get("updated_at") + ), + "gh_closed_at": _parse_github_timestamp( + issue_meta.get("closed_at") + ), } # Get or create the Issue record - issue_db = issue_repo.get_or_create_by_github_id(github_id=issue_gh_id, obj_in_data=issue_input) - db.flush() # Ensure issue has an ID + issue_db = issue_repo.get_or_create_by_github_id( + github_id=issue_gh_id, obj_in_data=issue_input + ) + db.flush() # Ensure issue has an ID if issue_db.id is None: - raise RuntimeError(f"Issue ID is None after flush for GH ID {issue_gh_id}") + raise RuntimeError( + f"Issue ID is None after flush for GH ID {issue_gh_id}" + ) # Associate the Issue with its processing chain (direct discovery) - self.discovery_chain_service.associate_entity(db, issue_chain, issue_db, is_direct=True) + self.discovery_chain_service.associate_entity( + db, issue_chain, issue_db, is_direct=True + ) # --- Process Issue Comments --- # Fetch comments for 
this specific issue number - comments_meta = self.github_client.get_issue_comments(owner_login, repo_name, issue_number=issue_db.number) - logger.debug(f"Fetched {len(comments_meta)} comments for Issue #{issue_db.number}") + comments_meta = self.github_client.get_issue_comments( + owner_login, repo_name, issue_number=issue_db.number + ) + logger.debug( + f"Fetched {len(comments_meta)} comments for Issue #{issue_db.number}" + ) for comment_meta in comments_meta: - # Extract key identifiers and user data for the comment - comment_gh_id = comment_meta.get('id') - comment_user_data = self._extract_activity_user_data(comment_meta.get('user')) - # Basic validation - if not comment_gh_id or not comment_user_data or not comment_user_data.get('github_id'): - logger.warning(f"Skipping issue comment due to missing ID or user data on Issue #{issue_db.number}") - continue - - # Get/Create the comment author (as Contributor) - comment_author_db = contrib_repo.get_or_create_by_github_id(github_id=comment_user_data['github_id'], obj_in_data=comment_user_data) - db.flush() # Ensure author has ID - if comment_author_db.id is None: - logger.error(f"Comment author Contributor ID is None for GH ID {comment_user_data['github_id']}") - continue # Skip this comment if author failed - - # Prepare data for IssueComment record - comment_input = { - "github_id": comment_gh_id, - "issue_id": issue_db.id, # Link to the parent Issue - "user_id": comment_author_db.id, # Link to the author Contributor - "body": comment_meta.get('body'), # Comment text - "gh_created_at": _parse_github_timestamp(comment_meta.get("created_at")), - "gh_updated_at": _parse_github_timestamp(comment_meta.get("updated_at")), - } - # Get or create the IssueComment record - comment_db = issue_comment_repo.get_or_create_by_github_id(github_id=comment_gh_id, obj_in_data=comment_input) - # Associate comment with the *issue* chain (indirect discovery via issue) - self.discovery_chain_service.associate_entity(db, issue_chain, 
comment_db, is_direct=False) + # Extract key identifiers and user data for the comment + comment_gh_id = comment_meta.get("id") + comment_user_data = self._extract_activity_user_data( + comment_meta.get("user") + ) + # Basic validation + if ( + not comment_gh_id + or not comment_user_data + or not comment_user_data.get("github_id") + ): + logger.warning( + f"Skipping issue comment due to missing ID or user data on Issue #{issue_db.number}" + ) + continue + + # Get/Create the comment author (as Contributor) + comment_author_db = contrib_repo.get_or_create_by_github_id( + github_id=comment_user_data["github_id"], + obj_in_data=comment_user_data, + ) + db.flush() # Ensure author has ID + if comment_author_db.id is None: + logger.error( + f"Comment author Contributor ID is None for GH ID {comment_user_data['github_id']}" + ) + continue # Skip this comment if author failed + + # Prepare data for IssueComment record + comment_input = { + "github_id": comment_gh_id, + "issue_id": issue_db.id, # Link to the parent Issue + "user_id": comment_author_db.id, # Link to the author Contributor + "body": comment_meta.get("body"), # Comment text + "gh_created_at": _parse_github_timestamp( + comment_meta.get("created_at") + ), + "gh_updated_at": _parse_github_timestamp( + comment_meta.get("updated_at") + ), + } + # Get or create the IssueComment record + comment_db = issue_comment_repo.get_or_create_by_github_id( + github_id=comment_gh_id, obj_in_data=comment_input + ) + # Associate comment with the *issue* chain (indirect discovery via issue) + self.discovery_chain_service.associate_entity( + db, issue_chain, comment_db, is_direct=False + ) # Mark the individual issue processing chain as complete self.discovery_chain_service.complete_chain(db, issue_chain) - except (ApiClientError, IntegrityError, SQLAlchemyError, ValueError, RuntimeError) as issue_err: + except ( + ApiClientError, + IntegrityError, + SQLAlchemyError, + ValueError, + RuntimeError, + ) as issue_err: # Catch errors 
related to processing a single issue or its comments - logger.error(f"Error processing issue GH ID {issue_gh_id} or its comments: {issue_err}", exc_info=False) - issues_processed_successfully = False # Mark overall issue step as having issues + logger.error( + f"Error processing issue GH ID {issue_gh_id} or its comments: {issue_err}", + exc_info=False, + ) + issues_processed_successfully = ( + False # Mark overall issue step as having issues + ) if issue_chain: try: # Attempt to mark the specific issue chain as failed - self.discovery_chain_service.fail_chain(db, issue_chain, error_message=f"Issue/Comment processing error: {str(issue_err)[:100]}") + self.discovery_chain_service.fail_chain( + db, + issue_chain, + error_message=f"Issue/Comment processing error: {str(issue_err)[:100]}", + ) except Exception as chain_fail_err: - logger.error(f"Error failing issue chain {issue_chain.id}: {chain_fail_err}") + logger.error( + f"Error failing issue chain {issue_chain.id}: {chain_fail_err}" + ) except (ApiClientError, Exception) as e: # Catch errors during the initial fetch of the issues list - logger.error(f"Failed fetching issues list for {repo_db.full_name}: {e}", exc_info=True) - issues_processed_successfully = False # Mark step as failed + logger.error( + f"Failed fetching issues list for {repo_db.full_name}: {e}", + exc_info=True, + ) + issues_processed_successfully = False # Mark step as failed finally: # Finalize the main issues processing chain status if issues_chain: if issues_processed_successfully: self.discovery_chain_service.complete_chain(db, issues_chain) else: - self.discovery_chain_service.fail_chain(db, issues_chain, "One or more errors during issue/comment processing.") + self.discovery_chain_service.fail_chain( + db, + issues_chain, + "One or more errors during issue/comment processing.", + ) try: - db.flush() # Persist final chain status + db.flush() # Persist final chain status except Exception as flush_err: - logger.error(f"Error flushing issues 
chain final status: {flush_err}") + logger.error( + f"Error flushing issues chain final status: {flush_err}" + ) # --- Step 8: Process Pull Requests and Comments --- # This section mirrors the structure of Issue processing - self.logger.info(f"Initiating pull request processing for {repo_db.full_name}...") - prs_processed_successfully = True # Track success for this step - prs_chain = self.discovery_chain_service.create_child_chain(db, root_chain, 'FETCH_PULL_REQUESTS', {'repo_id': repo_db.id}) + self.logger.info( + f"Initiating pull request processing for {repo_db.full_name}..." + ) + prs_processed_successfully = True # Track success for this step + prs_chain = self.discovery_chain_service.create_child_chain( + db, root_chain, "FETCH_PULL_REQUESTS", {"repo_id": repo_db.id} + ) self.discovery_chain_service.start_chain(db, prs_chain) try: # Fetch pull requests (potentially paginated) - assumes fetching all states prs_meta = self.github_client.get_pull_requests(owner_login, repo_name) - self.logger.info(f"Fetched {len(prs_meta)} pull requests for {repo_db.full_name}.") + self.logger.info( + f"Fetched {len(prs_meta)} pull requests for {repo_db.full_name}." 
+ ) for pr_meta in prs_meta: # Extract key identifiers and user data - pr_gh_id = pr_meta.get('id') - pr_user_data = self._extract_activity_user_data(pr_meta.get('user')) + pr_gh_id = pr_meta.get("id") + pr_user_data = self._extract_activity_user_data(pr_meta.get("user")) # Basic validation - if not pr_gh_id or not pr_user_data or not pr_user_data.get('github_id'): - logger.warning(f"Skipping PR due to missing ID or user data: PR number {pr_meta.get('number')}") + if ( + not pr_gh_id + or not pr_user_data + or not pr_user_data.get("github_id") + ): + logger.warning( + f"Skipping PR due to missing ID or user data: PR number {pr_meta.get('number')}" + ) continue - pr_chain: Optional[DiscoveryChain] = None # Chain for processing this single PR + pr_chain: Optional[DiscoveryChain] = ( + None # Chain for processing this single PR + ) try: # Create a sub-chain for this specific PR - pr_chain = self.discovery_chain_service.create_child_chain(db, prs_chain, 'PROCESS_PULL_REQUEST', {'pr_gh_id': pr_gh_id}) + pr_chain = self.discovery_chain_service.create_child_chain( + db, + prs_chain, + "PROCESS_PULL_REQUEST", + {"pr_gh_id": pr_gh_id}, + ) self.discovery_chain_service.start_chain(db, pr_chain) # Get/Create the author (as Contributor) - pr_author_db = contrib_repo.get_or_create_by_github_id(github_id=pr_user_data['github_id'], obj_in_data=pr_user_data) - db.flush() # Ensure author has ID + pr_author_db = contrib_repo.get_or_create_by_github_id( + github_id=pr_user_data["github_id"], + obj_in_data=pr_user_data, + ) + db.flush() # Ensure author has ID if pr_author_db.id is None: - raise RuntimeError(f"PR author Contributor ID is None after flush for GH ID {pr_user_data['github_id']}") + raise RuntimeError( + f"PR author Contributor ID is None after flush for GH ID {pr_user_data['github_id']}" + ) # Associate author with the PR chain (indirect discovery) - self.discovery_chain_service.associate_entity(db, pr_chain, pr_author_db, is_direct=False) + 
self.discovery_chain_service.associate_entity( + db, pr_chain, pr_author_db, is_direct=False + ) # Prepare data for PullRequest record pr_input = { "github_id": pr_gh_id, "repository_id": repo_db.id, - "user_id": pr_author_db.id, # Link to author Contributor - "number": pr_meta.get('number'), - "title": pr_meta.get('title'), - "state": pr_meta.get('state'), # e.g., 'open', 'closed', 'merged' - "gh_created_at": _parse_github_timestamp(pr_meta.get("created_at")), - "gh_updated_at": _parse_github_timestamp(pr_meta.get("updated_at")), - "gh_closed_at": _parse_github_timestamp(pr_meta.get("closed_at")), - "gh_merged_at": _parse_github_timestamp(pr_meta.get("merged_at")), # Specific to PRs + "user_id": pr_author_db.id, # Link to author Contributor + "number": pr_meta.get("number"), + "title": pr_meta.get("title"), + "state": pr_meta.get( + "state" + ), # e.g., 'open', 'closed', 'merged' + "gh_created_at": _parse_github_timestamp( + pr_meta.get("created_at") + ), + "gh_updated_at": _parse_github_timestamp( + pr_meta.get("updated_at") + ), + "gh_closed_at": _parse_github_timestamp( + pr_meta.get("closed_at") + ), + "gh_merged_at": _parse_github_timestamp( + pr_meta.get("merged_at") + ), # Specific to PRs } # Get or create the PullRequest record - pr_db = pr_repo.get_or_create_by_github_id(github_id=pr_gh_id, obj_in_data=pr_input) - db.flush() # Ensure PR has ID + pr_db = pr_repo.get_or_create_by_github_id( + github_id=pr_gh_id, obj_in_data=pr_input + ) + db.flush() # Ensure PR has ID if pr_db.id is None: - raise RuntimeError(f"PullRequest ID is None after flush for GH ID {pr_gh_id}") + raise RuntimeError( + f"PullRequest ID is None after flush for GH ID {pr_gh_id}" + ) # Associate PullRequest with its chain (direct discovery) - self.discovery_chain_service.associate_entity(db, pr_chain, pr_db, is_direct=True) + self.discovery_chain_service.associate_entity( + db, pr_chain, pr_db, is_direct=True + ) # --- Process PR Review Comments --- # Fetch review comments specific to 
this PR number - pr_comments_meta = self.github_client.get_pr_review_comments(owner_login, repo_name, pull_number=pr_db.number) - logger.debug(f"Fetched {len(pr_comments_meta)} comments for PR #{pr_db.number}") + pr_comments_meta = self.github_client.get_pr_review_comments( + owner_login, repo_name, pull_number=pr_db.number + ) + logger.debug( + f"Fetched {len(pr_comments_meta)} comments for PR #{pr_db.number}" + ) for pr_comment_meta in pr_comments_meta: - # Extract key identifiers and user data - pr_comment_gh_id = pr_comment_meta.get('id') - pr_comment_user_data = self._extract_activity_user_data(pr_comment_meta.get('user')) - # Basic validation - if not pr_comment_gh_id or not pr_comment_user_data or not pr_comment_user_data.get('github_id'): - logger.warning(f"Skipping PR comment due to missing ID or user data on PR #{pr_db.number}") + # Extract key identifiers and user data + pr_comment_gh_id = pr_comment_meta.get("id") + pr_comment_user_data = self._extract_activity_user_data( + pr_comment_meta.get("user") + ) + # Basic validation + if ( + not pr_comment_gh_id + or not pr_comment_user_data + or not pr_comment_user_data.get("github_id") + ): + logger.warning( + f"Skipping PR comment due to missing ID or user data on PR #{pr_db.number}" + ) continue - # Get/Create comment author (as Contributor) - pr_comment_author_db = contrib_repo.get_or_create_by_github_id(github_id=pr_comment_user_data['github_id'], obj_in_data=pr_comment_user_data) - db.flush() # Ensure author has ID - if pr_comment_author_db.id is None: - logger.error(f"PR Comment author Contributor ID is None for GH ID {pr_comment_user_data['github_id']}") - continue # Skip comment if author failed - - # Prepare data for PRReviewComment record - pr_comment_input = { - "github_id": pr_comment_gh_id, - "pr_id": pr_db.id, # Link to parent PullRequest - "user_id": pr_comment_author_db.id, # Link to author Contributor - "pull_request_review_id": pr_comment_meta.get('pull_request_review_id'), # ID of the 
review it belongs to - "body": pr_comment_meta.get('body'), # Comment text - "gh_created_at": _parse_github_timestamp(pr_comment_meta.get("created_at")), - "gh_updated_at": _parse_github_timestamp(pr_comment_meta.get("updated_at")), - } - # Get or create the PRReviewComment record - pr_comment_db = pr_comment_repo.get_or_create_by_github_id(github_id=pr_comment_gh_id, obj_in_data=pr_comment_input) - # Associate comment with the *PR* chain (indirect discovery via PR) - self.discovery_chain_service.associate_entity(db, pr_chain, pr_comment_db, is_direct=False) + # Get/Create comment author (as Contributor) + pr_comment_author_db = ( + contrib_repo.get_or_create_by_github_id( + github_id=pr_comment_user_data["github_id"], + obj_in_data=pr_comment_user_data, + ) + ) + db.flush() # Ensure author has ID + if pr_comment_author_db.id is None: + logger.error( + f"PR Comment author Contributor ID is None for GH ID {pr_comment_user_data['github_id']}" + ) + continue # Skip comment if author failed + + # Prepare data for PRReviewComment record + pr_comment_input = { + "github_id": pr_comment_gh_id, + "pr_id": pr_db.id, # Link to parent PullRequest + "user_id": pr_comment_author_db.id, # Link to author Contributor + "pull_request_review_id": pr_comment_meta.get( + "pull_request_review_id" + ), # ID of the review it belongs to + "body": pr_comment_meta.get("body"), # Comment text + "gh_created_at": _parse_github_timestamp( + pr_comment_meta.get("created_at") + ), + "gh_updated_at": _parse_github_timestamp( + pr_comment_meta.get("updated_at") + ), + } + # Get or create the PRReviewComment record + pr_comment_db = pr_comment_repo.get_or_create_by_github_id( + github_id=pr_comment_gh_id, obj_in_data=pr_comment_input + ) + # Associate comment with the *PR* chain (indirect discovery via PR) + self.discovery_chain_service.associate_entity( + db, pr_chain, pr_comment_db, is_direct=False + ) # Mark the individual PR processing chain as complete 
self.discovery_chain_service.complete_chain(db, pr_chain) - except (ApiClientError, IntegrityError, SQLAlchemyError, ValueError, RuntimeError) as pr_err: + except ( + ApiClientError, + IntegrityError, + SQLAlchemyError, + ValueError, + RuntimeError, + ) as pr_err: # Catch errors during processing of a single PR or its comments - logger.error(f"Error processing PR GH ID {pr_gh_id} or its comments: {pr_err}", exc_info=False) - prs_processed_successfully = False # Mark overall PR step as having issues + logger.error( + f"Error processing PR GH ID {pr_gh_id} or its comments: {pr_err}", + exc_info=False, + ) + prs_processed_successfully = ( + False # Mark overall PR step as having issues + ) if pr_chain: try: # Attempt to mark the specific PR chain as failed - self.discovery_chain_service.fail_chain(db, pr_chain, error_message=f"PR/Comment processing error: {str(pr_err)[:100]}") + self.discovery_chain_service.fail_chain( + db, + pr_chain, + error_message=f"PR/Comment processing error: {str(pr_err)[:100]}", + ) except Exception as chain_fail_err: - logger.error(f"Error failing PR chain {pr_chain.id}: {chain_fail_err}") + logger.error( + f"Error failing PR chain {pr_chain.id}: {chain_fail_err}" + ) except (ApiClientError, Exception) as e: # Catch errors during the initial fetch of the PR list - logger.error(f"Failed fetching pull requests list for {repo_db.full_name}: {e}", exc_info=True) - prs_processed_successfully = False # Mark step as failed + logger.error( + f"Failed fetching pull requests list for {repo_db.full_name}: {e}", + exc_info=True, + ) + prs_processed_successfully = False # Mark step as failed finally: - # Finalize the main PR processing chain status - if prs_chain: + # Finalize the main PR processing chain status + if prs_chain: if prs_processed_successfully: self.discovery_chain_service.complete_chain(db, prs_chain) else: - self.discovery_chain_service.fail_chain(db, prs_chain, "One or more errors during PR/comment processing.") + 
self.discovery_chain_service.fail_chain( + db, + prs_chain, + "One or more errors during PR/comment processing.", + ) try: - db.flush() # Persist final chain status + db.flush() # Persist final chain status except Exception as flush_err: - logger.error(f"Error flushing PRs chain final status: {flush_err}") - + logger.error( + f"Error flushing PRs chain final status: {flush_err}" + ) # --- Step 9: Finalize Root Chain and Commit --- # If all steps completed or handled errors gracefully, mark root chain complete @@ -1016,44 +1433,71 @@ def ingest_repository_by_url( # ingestion process for the URL itself is considered complete at this point. # The status of the root chain indicates if the *entire* workflow triggered by the URL finished. self.discovery_chain_service.complete_chain(db, root_chain) - self.logger.info(f"Successfully completed all ingestion steps setup for {repo_url}, chain {root_chain.id}") + self.logger.info( + f"Successfully completed all ingestion steps setup for {repo_url}, chain {root_chain.id}" + ) # Commit the entire transaction for this repository ingestion db.commit() self.logger.info("Main ingestion transaction committed successfully.") - logger.info(f"ACTION COMPLETE - Synchronous ingestion steps for URL '{repo_url}' (Chain: {root_chain.id}) finished.") + logger.info( + f"ACTION COMPLETE - Synchronous ingestion steps for URL '{repo_url}' (Chain: {root_chain.id}) finished." 
+ ) - except (ApiClientError, ValueError, IntegrityError, SQLAlchemyError, Exception) as e: + except ( + ApiClientError, + ValueError, + IntegrityError, + SQLAlchemyError, + Exception, + ) as e: # --- Global Error Handling --- # Catch any unhandled exceptions from the steps above - self.logger.error(f"Ingestion failed for URL {repo_url}: {e}", exc_info=True) - db.rollback() # Roll back the entire transaction on any critical failure + self.logger.error( + f"Ingestion failed for URL {repo_url}: {e}", exc_info=True + ) + db.rollback() # Roll back the entire transaction on any critical failure self.logger.warning("Main ingestion transaction rolled back due to error.") # Attempt to mark the root chain as FAILED (best-effort using a separate session) if root_chain and root_chain.id: - try: - # Use a new session to avoid issues with the rolled-back main session state - fail_db = SessionLocal(); - try: - # Re-fetch the chain in the new session - failed_chain = self.discovery_chain_service.get_by_uuid(fail_db, root_chain.id) - # Update status only if it's not already failed - if failed_chain and failed_chain.status != 'FAILED': - self.discovery_chain_service.fail_chain(fail_db, failed_chain, error_message=f"Outer transaction failed: {str(e)[:200]}") - fail_db.commit() # Commit the failure status update - elif not failed_chain: - logger.error(f"Could not find root chain {root_chain.id} to mark as failed after error.") - else: # Chain was already FAILED, possibly from an earlier step - logger.warning(f"Root chain {root_chain.id} was already marked as FAILED.") - except Exception as fail_e: - logger.error(f"Failed to mark root chain {root_chain.id} as FAILED after outer error: {fail_e}", exc_info=True) - fail_db.rollback() # Rollback the attempt to mark as failed - finally: - fail_db.close() # Close the temporary session - except Exception as final_fail_e: - # Log errors occurring during the failure marking process itself - logger.error(f"Further error during root chain 
failure marking: {final_fail_e}") + try: + # Use a new session to avoid issues with the rolled-back main session state + fail_db = SessionLocal() + try: + # Re-fetch the chain in the new session + failed_chain = self.discovery_chain_service.get_by_uuid( + fail_db, root_chain.id + ) + # Update status only if it's not already failed + if failed_chain and failed_chain.status != "FAILED": + self.discovery_chain_service.fail_chain( + fail_db, + failed_chain, + error_message=f"Outer transaction failed: {str(e)[:200]}", + ) + fail_db.commit() # Commit the failure status update + elif not failed_chain: + logger.error( + f"Could not find root chain {root_chain.id} to mark as failed after error." + ) + else: # Chain was already FAILED, possibly from an earlier step + logger.warning( + f"Root chain {root_chain.id} was already marked as FAILED." + ) + except Exception as fail_e: + logger.error( + f"Failed to mark root chain {root_chain.id} as FAILED after outer error: {fail_e}", + exc_info=True, + ) + fail_db.rollback() # Rollback the attempt to mark as failed + finally: + fail_db.close() # Close the temporary session + except Exception as final_fail_e: + # Log errors occurring during the failure marking process itself + logger.error( + f"Further error during root chain failure marking: {final_fail_e}" + ) # Re-raise the exception as a RuntimeError to signal failure to the caller raise RuntimeError(f"Ingestion failed for {repo_url}") from e @@ -1061,4 +1505,4 @@ def ingest_repository_by_url( # The main session 'db' closure is handled by the caller (e.g., the API endpoint or task runner) pass - return root_chain \ No newline at end of file + return root_chain diff --git a/backend/services/keyword_discovery_service.py b/backend/services/keyword_discovery_service.py index 150106e..e4be03a 100644 --- a/backend/services/keyword_discovery_service.py +++ b/backend/services/keyword_discovery_service.py @@ -4,14 +4,16 @@ Handles the discovery of software repositories based on keyword 
searches using the GitHub API and initiates their ingestion into the system. """ + import logging from datetime import datetime, timezone -from typing import Any, Dict, Tuple, Optional, List # Added List +from typing import Any, Dict, Tuple, Optional, List # Added List from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError from backend.data.models import KeywordSearchSession, Repository + # Use SessionLocal for creating isolated sessions for individual repository ingestions from backend.data.database import SessionLocal from backend.data.repositories import ( @@ -20,12 +22,14 @@ RepositoryRepository, ) from backend.external import GitHubClient, ApiClientError + # Import IngestionService for dependency injection and type hinting from backend.services.ingestion_service import IngestionService from .base_service import BaseService logger = logging.getLogger(__name__) + class KeywordDiscoveryService(BaseService): """ Service responsible for discovering repositories via keyword search and managing the process. @@ -43,7 +47,9 @@ class KeywordDiscoveryService(BaseService): 7. Tracks counts of processed repositories, ingestion errors, and association errors. """ - def __init__(self, github_client: GitHubClient, ingestion_service: IngestionService): + def __init__( + self, github_client: GitHubClient, ingestion_service: IngestionService + ): """ Initializes the KeywordDiscoveryService. @@ -58,7 +64,11 @@ def __init__(self, github_client: GitHubClient, ingestion_service: IngestionServ self.ingestion_service = ingestion_service def discover_and_ingest_by_keywords( - self, db: Session, session_id: int, keywords: str, max_repos_to_process: int = 1000 + self, + db: Session, + session_id: int, + keywords: str, + max_repos_to_process: int = 1000, ) -> Tuple[int, int, int]: """ Executes the keyword discovery and ingestion process for a given search session. 
@@ -83,7 +93,7 @@ def discover_and_ingest_by_keywords( processed_count = 0 association_errors = 0 ingestion_errors = 0 - items: List[Dict[str, Any]] = [] # Initialize list for GitHub search results + items: List[Dict[str, Any]] = [] # Initialize list for GitHub search results try: # Instantiate repository access objects using the main task's session @@ -94,14 +104,18 @@ def discover_and_ingest_by_keywords( # --- Step 1: Fetch and Update Search Session Status --- search_session = session_repo.get(id=session_id) if not search_session: - logger.error(f"Service: KeywordSearchSession ID {session_id} not found. Cannot proceed.") + logger.error( + f"Service: KeywordSearchSession ID {session_id} not found. Cannot proceed." + ) # Indicate session not found error; caller handles final status. - return 0, 0, 1 # (processed, ingest_err, assoc_err) + return 0, 0, 1 # (processed, ingest_err, assoc_err) # Avoid reprocessing sessions already in a terminal state if search_session.status in ["COMPLETED", "FAILED"]: - logger.warning(f"Service: KeywordSearchSession {session_id} already in terminal state ({search_session.status}). Exiting.") - return 0, 0, 0 # Nothing to process + logger.warning( + f"Service: KeywordSearchSession {session_id} already in terminal state ({search_session.status}). Exiting." 
+ ) + return 0, 0, 0 # Nothing to process # Update status to RUNNING and record start time if not already set search_session.status = "RUNNING" @@ -113,34 +127,41 @@ def discover_and_ingest_by_keywords( logger.info(f"Service: Session {session_id}: Status set to RUNNING.") # --- Step 2: Perform GitHub Search --- - self.logger.info(f"Service: Session {session_id}: Searching GitHub repos for session {session_id}: '{keywords}', max={max_repos_to_process}") + self.logger.info( + f"Service: Session {session_id}: Searching GitHub repos for session {session_id}: '{keywords}', max={max_repos_to_process}" + ) search_result_tuple = self.github_client.search_repositories( query=keywords, max_results=max_repos_to_process ) # Handle potential failures in the GitHub search itself if search_result_tuple is None: - logger.error(f"Service: Session {session_id}: GitHub search request failed.") - # Indicate search failure; caller handles setting session to FAILED. - return 0, 1, 0 # (processed, ingest_err, assoc_err) + logger.error( + f"Service: Session {session_id}: GitHub search request failed." + ) + # Indicate search failure; caller handles setting session to FAILED. + return 0, 1, 0 # (processed, ingest_err, assoc_err) items, total_count_reported = search_result_tuple - self.logger.info(f"Service: Session {session_id}: GitHub search call returned {len(items)} items (GitHub reported total: {total_count_reported}).") - + self.logger.info( + f"Service: Session {session_id}: GitHub search call returned {len(items)} items (GitHub reported total: {total_count_reported})." 
+ ) # Handle case where search returns no results if not items: - logger.info(f"Service: Session {session_id}: No repositories found/fetched.") - # Update results count immediately if no items found - if search_session: - search_session.results_count = 0 # Explicitly set to zero - db.add(search_session) - db.commit() # Commit the final count using the main session - # Return success, as no processing errors occurred - return 0, 0, 0 + logger.info( + f"Service: Session {session_id}: No repositories found/fetched." + ) + # Update results count immediately if no items found + if search_session: + search_session.results_count = 0 # Explicitly set to zero + db.add(search_session) + db.commit() # Commit the final count using the main session + # Return success, as no processing errors occurred + return 0, 0, 0 else: - # If items were found, but count hasn't been set, mark as in progress (or set actual count later) - if search_session and search_session.results_count is None: + # If items were found, but count hasn't been set, mark as in progress (or set actual count later) + if search_session and search_session.results_count is None: # Optionally set the fetched count here, or wait until the end. # Setting it now might be slightly inaccurate if some items are skipped. # Let's defer setting the final count until the end of processing. @@ -148,151 +169,226 @@ def discover_and_ingest_by_keywords( # db.add(search_session) # db.commit() - # --- Step 3: Iterate Search Results and Process Repositories --- - logger.info(f"Service: Session {session_id}: Starting processing loop for {len(items)} items.") + logger.info( + f"Service: Session {session_id}: Starting processing loop for {len(items)} items." 
+ ) for item_index, item in enumerate(items): # Extract essential info from the GitHub search result item repo_github_id = item.get("id") repo_full_name = item.get("full_name") repo_url = item.get("html_url") # Consistent logging prefix for messages related to this specific item - item_log_prefix = f"Service: Session {session_id}: Item {item_index+1}/{len(items)} ({repo_full_name or 'N/A'})" + item_log_prefix = f"Service: Session {session_id}: Item {item_index + 1}/{len(items)} ({repo_full_name or 'N/A'})" logger.info(f"{item_log_prefix}: --- Processing START ---") # Basic validation of the search result item if not repo_url or not repo_github_id or not repo_full_name: - self.logger.warning(f"{item_log_prefix}: Skipping search item due to missing URL/ID/FullName.") - continue # Skip to the next item + self.logger.warning( + f"{item_log_prefix}: Skipping search item due to missing URL/ID/FullName." + ) + continue # Skip to the next item self.logger.info(f"{item_log_prefix}: Processing search result.") - ingestion_succeeded = False # Track if ingestion was successful for this item - repo_exists_before_ingest = False # Track if repo existed before attempting ingest - repository_db_for_assoc: Optional[Repository] = None # Holds the DB object for association + ingestion_succeeded = ( + False # Track if ingestion was successful for this item + ) + repo_exists_before_ingest = ( + False # Track if repo existed before attempting ingest + ) + repository_db_for_assoc: Optional[Repository] = ( + None # Holds the DB object for association + ) try: # --- Step 3a: Check if Repository Exists Locally --- # Use the main task's session 'db' for this check. - logger.debug(f"{item_log_prefix}: Checking if repo exists (GH ID: {repo_github_id})...") + logger.debug( + f"{item_log_prefix}: Checking if repo exists (GH ID: {repo_github_id})..." 
+ ) existing_repo = repo_repo.get_by_github_id(github_id=repo_github_id) if existing_repo: # Repository already in the database, no need to re-ingest. - logger.info(f"{item_log_prefix}: Repo already exists (DB ID: {existing_repo.id}). Skipping ingestion call.") - repository_db_for_assoc = existing_repo # Use existing object for association - ingestion_succeeded = True # Mark as success for association purposes + logger.info( + f"{item_log_prefix}: Repo already exists (DB ID: {existing_repo.id}). Skipping ingestion call." + ) + repository_db_for_assoc = ( + existing_repo # Use existing object for association + ) + ingestion_succeeded = ( + True # Mark as success for association purposes + ) repo_exists_before_ingest = True else: # --- Step 3b: Ingest New Repository (in Isolated Session) --- - logger.info(f"{item_log_prefix}: Repo not found. Calling ingestion service for URL: {repo_url}") - ingestion_db_session: Optional[Session] = None # Define session variable for this block + logger.info( + f"{item_log_prefix}: Repo not found. Calling ingestion service for URL: {repo_url}" + ) + ingestion_db_session: Optional[Session] = ( + None # Define session variable for this block + ) try: # Create a *new, separate* database session just for this ingestion. ingestion_db_session = SessionLocal() - logger.debug(f"{item_log_prefix}: Created separate session for ingestion.") + logger.debug( + f"{item_log_prefix}: Created separate session for ingestion." + ) # Call the IngestionService, passing the isolated session. chain = self.ingestion_service.ingest_repository_by_url( db=ingestion_db_session, repo_url=repo_url ) # Check the outcome of the ingestion process via the discovery chain status - ingestion_status = chain.status if chain else "FAILED (None returned)" - logger.info(f"{item_log_prefix}: Ingestion service call returned. 
Chain Status: {ingestion_status}") + ingestion_status = ( + chain.status if chain else "FAILED (None returned)" + ) + logger.info( + f"{item_log_prefix}: Ingestion service call returned. Chain Status: {ingestion_status}" + ) - if chain and chain.status == 'COMPLETED': + if chain and chain.status == "COMPLETED": ingestion_succeeded = True - self.logger.info(f"{item_log_prefix}: Successfully ingested.") + self.logger.info( + f"{item_log_prefix}: Successfully ingested." + ) # After successful ingestion in the separate session, # fetch the newly created repository using the *main task's session* # to ensure it's available for association in that context. - repository_db_for_assoc = repo_repo.get_by_github_id(github_id=repo_github_id) + repository_db_for_assoc = repo_repo.get_by_github_id( + github_id=repo_github_id + ) if not repository_db_for_assoc: # This would be unusual but indicates a potential timing or session issue. - logger.error(f"{item_log_prefix}: Ingestion supposedly OK, but repo GH ID {repo_github_id} not found in main session immediately after.") - ingestion_succeeded = False # Treat as failure if repo not found after ingest + logger.error( + f"{item_log_prefix}: Ingestion supposedly OK, but repo GH ID {repo_github_id} not found in main session immediately after." + ) + ingestion_succeeded = False # Treat as failure if repo not found after ingest elif chain: # Ingestion finished but didn't complete successfully (e.g., FAILED, PARTIAL) - self.logger.warning(f"{item_log_prefix}: Ingestion finished with status {chain.status}.") + self.logger.warning( + f"{item_log_prefix}: Ingestion finished with status {chain.status}." + ) ingestion_errors += 1 else: # Ingestion service returned None, indicating an early failure (e.g., bad URL) - self.logger.error(f"{item_log_prefix}: Ingestion call failed (returned None).") + self.logger.error( + f"{item_log_prefix}: Ingestion call failed (returned None)." 
+ ) ingestion_errors += 1 except Exception as ingest_exc: # Catch any unexpected exceptions during the ingestion call itself - logger.error(f"{item_log_prefix}: EXCEPTION during ingestion service call: {ingest_exc}", exc_info=True) + logger.error( + f"{item_log_prefix}: EXCEPTION during ingestion service call: {ingest_exc}", + exc_info=True, + ) ingestion_errors += 1 - ingestion_succeeded = False # Ensure failure is marked + ingestion_succeeded = False # Ensure failure is marked finally: - # Always close the isolated ingestion session - if ingestion_db_session: - logger.debug(f"{item_log_prefix}: Closing separate ingestion session.") - ingestion_db_session.close() + # Always close the isolated ingestion session + if ingestion_db_session: + logger.debug( + f"{item_log_prefix}: Closing separate ingestion session." + ) + ingestion_db_session.close() # --- Step 3c: Create Association (in Main Session) --- - logger.debug(f"{item_log_prefix}: Entering association logic. ingestion_succeeded={ingestion_succeeded}") + logger.debug( + f"{item_log_prefix}: Entering association logic. ingestion_succeeded={ingestion_succeeded}" + ) # Proceed only if ingestion succeeded (or repo existed) and we have a valid repo object and search session. 
- if ingestion_succeeded and repository_db_for_assoc and search_session: - try: - logger.debug(f"{item_log_prefix}: Attempting to create/find association for DB Repo ID {repository_db_for_assoc.id}...") - # Check if this specific association already exists using the main session - existing_assoc = assoc_repo.get_by_session_and_repo_id( - session_id=search_session.id, - repository_id=repository_db_for_assoc.id - ) - if not existing_assoc: - # Create the association link in the main session's context - assoc_repo.create_association( - session_id=search_session.id, - repository_id=repository_db_for_assoc.id, - # Store relevance score from GitHub search if available - match_details={'score': item.get('score')} - ) - # Commit the association immediately using the main task's session 'db' - db.commit() - processed_count += 1 # Increment count of successfully processed/associated repos - logger.info(f"{item_log_prefix}: Association successful (Processed count incremented).") - else: - # Association already existed, no action needed, don't increment processed count again. - logger.debug(f"{item_log_prefix}: Association already exists.") - # If the repo existed before *and* the association existed, it means this search - # rediscovered an already known and associated repo. - # If the repo was ingested *this run* but the association somehow existed, - # that would be an anomaly. The current logic correctly handles avoiding duplicates. 
- - except Exception as assoc_exc: - # Catch errors during association creation/commit - logger.error(f"{item_log_prefix}: EXCEPTION during association: {assoc_exc}", exc_info=True) - association_errors += 1 - try: - # Rollback the main session to undo the failed association attempt - db.rollback() - logger.warning(f"{item_log_prefix}: Rolled back main session after association failure.") - except Exception as rb_err: - logger.error(f"Error rolling back main session after association failure: {rb_err}") + if ( + ingestion_succeeded + and repository_db_for_assoc + and search_session + ): + try: + logger.debug( + f"{item_log_prefix}: Attempting to create/find association for DB Repo ID {repository_db_for_assoc.id}..." + ) + # Check if this specific association already exists using the main session + existing_assoc = assoc_repo.get_by_session_and_repo_id( + session_id=search_session.id, + repository_id=repository_db_for_assoc.id, + ) + if not existing_assoc: + # Create the association link in the main session's context + assoc_repo.create_association( + session_id=search_session.id, + repository_id=repository_db_for_assoc.id, + # Store relevance score from GitHub search if available + match_details={"score": item.get("score")}, + ) + # Commit the association immediately using the main task's session 'db' + db.commit() + processed_count += 1 # Increment count of successfully processed/associated repos + logger.info( + f"{item_log_prefix}: Association successful (Processed count incremented)." + ) + else: + # Association already existed, no action needed, don't increment processed count again. + logger.debug( + f"{item_log_prefix}: Association already exists." + ) + # If the repo existed before *and* the association existed, it means this search + # rediscovered an already known and associated repo. + # If the repo was ingested *this run* but the association somehow existed, + # that would be an anomaly. The current logic correctly handles avoiding duplicates. 
+ + except Exception as assoc_exc: + # Catch errors during association creation/commit + logger.error( + f"{item_log_prefix}: EXCEPTION during association: {assoc_exc}", + exc_info=True, + ) + association_errors += 1 + try: + # Rollback the main session to undo the failed association attempt + db.rollback() + logger.warning( + f"{item_log_prefix}: Rolled back main session after association failure." + ) + except Exception as rb_err: + logger.error( + f"Error rolling back main session after association failure: {rb_err}" + ) elif ingestion_succeeded and not repository_db_for_assoc: # Handle the unusual case where ingestion was marked successful but the repo object wasn't found association_errors += 1 - self.logger.error(f"{item_log_prefix}: Association failed: Repo supposedly ingested/existed but not found in main session (GH ID: {repo_github_id}).") + self.logger.error( + f"{item_log_prefix}: Association failed: Repo supposedly ingested/existed but not found in main session (GH ID: {repo_github_id})." + ) elif not ingestion_succeeded: - # Skip association if ingestion failed - logger.debug(f"{item_log_prefix}: Skipping association due to ingestion failure.") + # Skip association if ingestion failed + logger.debug( + f"{item_log_prefix}: Skipping association due to ingestion failure." 
+ ) except Exception as outer_loop_exc: # Catch unexpected errors in the main loop for this item (e.g., during repo check) - logger.error(f"{item_log_prefix}: EXCEPTION in outer item processing loop: {outer_loop_exc}", exc_info=True) - ingestion_errors += 1 # Count this as an error preventing processing of this item + logger.error( + f"{item_log_prefix}: EXCEPTION in outer item processing loop: {outer_loop_exc}", + exc_info=True, + ) + ingestion_errors += ( + 1 # Count this as an error preventing processing of this item + ) try: # Attempt to rollback the main session if an outer loop error occurred db.rollback() - logger.warning(f"{item_log_prefix}: Rolled back main session after outer loop exception.") - except: pass # Ignore rollback errors during exception handling + logger.warning( + f"{item_log_prefix}: Rolled back main session after outer loop exception." + ) + except: + pass # Ignore rollback errors during exception handling finally: logger.info(f"{item_log_prefix}: --- Processing END ---") # --- End of loop for processing search items --- - logger.info(f"Service: Session {session_id}: Finished processing loop for {len(items)} items.") + logger.info( + f"Service: Session {session_id}: Finished processing loop for {len(items)} items." + ) # --- Step 4: Update Final Session Counts (Optional but recommended) --- # It might be useful to store the final counts back into the search_session record here. 
@@ -314,24 +410,38 @@ def discover_and_ingest_by_keywords( # --- Global Error Handling for the Service Method --- except ApiClientError as api_e: # Errors during the initial setup or the main GitHub search call - logger.error(f"Service: API Client Error during keyword discovery task setup/search for session {session_id}: {api_e}", exc_info=True) - ingestion_errors += 1 # Count as a general failure for the session + logger.error( + f"Service: API Client Error during keyword discovery task setup/search for session {session_id}: {api_e}", + exc_info=True, + ) + ingestion_errors += 1 # Count as a general failure for the session # Let the task runner handle setting the final FAILED status based on return/exception except SQLAlchemyError as db_e: # Database errors during session status updates or initial checks - logger.error(f"Service: Database Error during keyword discovery task setup/search for session {session_id}: {db_e}", exc_info=True) - try: db.rollback() # Rollback the main session - except: pass - association_errors +=1 # Count as DB error likely affecting state + logger.error( + f"Service: Database Error during keyword discovery task setup/search for session {session_id}: {db_e}", + exc_info=True, + ) + try: + db.rollback() # Rollback the main session + except: + pass + association_errors += 1 # Count as DB error likely affecting state # Let the task runner handle final status except Exception as e: # Catch-all for any other unexpected critical errors - logger.exception(f"Service: Unexpected critical error during keyword discovery task for session {session_id}: {e}") - try: db.rollback() # Rollback the main session - except: pass - ingestion_errors += 1 # Count as a general failure + logger.exception( + f"Service: Unexpected critical error during keyword discovery task for session {session_id}: {e}" + ) + try: + db.rollback() # Rollback the main session + except: + pass + ingestion_errors += 1 # Count as a general failure # Let the task runner handle final 
status # The main database session `db` is managed (committed/rolled back/closed) by the caller (Celery task). - logger.info(f"Service: Keyword discovery processing finished for session {session_id}. Returning counts: Processed={processed_count}, IngestErrors={ingestion_errors}, AssocErrors={association_errors}") - return processed_count, ingestion_errors, association_errors \ No newline at end of file + logger.info( + f"Service: Keyword discovery processing finished for session {session_id}. Returning counts: Processed={processed_count}, IngestErrors={ingestion_errors}, AssocErrors={association_errors}" + ) + return processed_count, ingestion_errors, association_errors diff --git a/backend/services/scholarly_processing_service.py b/backend/services/scholarly_processing_service.py index edeabf0..2bbfb6f 100644 --- a/backend/services/scholarly_processing_service.py +++ b/backend/services/scholarly_processing_service.py @@ -8,26 +8,41 @@ import logging import re -import uuid -from typing import Dict, Any, Optional, List, Tuple, Set # Added Set +from typing import Dict, Any, Optional, List, Tuple, Set # Added Set from sqlalchemy.orm import Session -from sqlalchemy.exc import IntegrityError, SQLAlchemyError +from sqlalchemy.exc import SQLAlchemyError from backend.external import OpenAlexClient from backend.data.models import ( - Work, Person, Institution, Authorship, Affiliation, WorkCitation, DiscoveryChain, - Domain, Field, Subfield, Topic, WorkTopic # Topic hierarchy models + Work, + Person, + Institution, + Authorship, + Affiliation, + DiscoveryChain, + Domain, + Field, + Subfield, + Topic, + WorkTopic, # Topic hierarchy models ) from backend.data.repositories import ( - PersonRepository, InstitutionRepository, WorkRepository, - DomainRepository, FieldRepository, SubfieldRepository, TopicRepository # Hierarchy repositories + PersonRepository, + InstitutionRepository, + DomainRepository, + FieldRepository, + SubfieldRepository, + TopicRepository, # Hierarchy 
repositories ) from .base_service import BaseService -from .discovery_chain_service import DiscoveryChainService # Service for managing provenance +from .discovery_chain_service import ( + DiscoveryChainService, +) # Service for managing provenance logger = logging.getLogger(__name__) + class ScholarlyProcessingService(BaseService): """ Handles the detailed processing of scholarly metadata associated with a Work. @@ -69,37 +84,55 @@ def _get_id_from_oa_url(self, url: Optional[str]) -> Optional[str]: or None if parsing fails or the format is unrecognized. """ # --- Logic unchanged from previous version --- - if not url or not isinstance(url, str): return None + if not url or not isinstance(url, str): + return None try: id_part: Optional[str] = None # Determine ID type and extract based on URL prefix or pattern if url.startswith("https://orcid.org/"): - match = re.search(r'(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])', url) - id_part = match.group(1) if match else None + match = re.search(r"(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])", url) + id_part = match.group(1) if match else None elif url.startswith("https://ror.org/"): - id_part = url.split('/')[-1] # ROR ID is the last path segment + id_part = url.split("/")[-1] # ROR ID is the last path segment elif url.startswith("https://openalex.org/"): - id_part = url.split('/')[-1] # OpenAlex ID is the last path segment + id_part = url.split("/")[-1] # OpenAlex ID is the last path segment elif url.startswith("https://doi.org/"): - id_part = url[len("https://doi.org/"):] # Extract DOI string after prefix - elif url and url[0].isalpha() and url[1:].isdigit(): # Check for bare OA ID (e.g., W123) - id_part = url + id_part = url[ + len("https://doi.org/") : + ] # Extract DOI string after prefix + elif ( + url and url[0].isalpha() and url[1:].isdigit() + ): # Check for bare OA ID (e.g., W123) + id_part = url else: - id_part = None # Unrecognized format + id_part = None # Unrecognized format # Basic validation based on expected patterns for the 
extracted ID part is_valid = False if id_part: - if (url.startswith("https://openalex.org/") and id_part[0].isalpha() and id_part[1:].isdigit()): is_valid = True - elif (url.startswith("https://orcid.org/") and match): is_valid = True # ORCID must match regex - elif (url.startswith("https://ror.org/") and id_part.startswith('0') and len(id_part) == 9): is_valid = True - elif url.startswith("https://doi.org/"): is_valid = True # Assume valid DOI string if extracted - elif (id_part == url and url[0].isalpha() and url[1:].isdigit()): is_valid = True # Valid bare OA ID - - return id_part if is_valid else None # Return ID only if considered valid + if ( + url.startswith("https://openalex.org/") + and id_part[0].isalpha() + and id_part[1:].isdigit() + ): + is_valid = True + elif url.startswith("https://orcid.org/") and match: + is_valid = True # ORCID must match regex + elif ( + url.startswith("https://ror.org/") + and id_part.startswith("0") + and len(id_part) == 9 + ): + is_valid = True + elif url.startswith("https://doi.org/"): + is_valid = True # Assume valid DOI string if extracted + elif id_part == url and url[0].isalpha() and url[1:].isdigit(): + is_valid = True # Valid bare OA ID + + return id_part if is_valid else None # Return ID only if considered valid except Exception as e: - # Log errors during parsing but avoid interrupting the flow - logger.error(f"Error parsing ID/URL {url}: {e}", exc_info=False) + # Log errors during parsing but avoid interrupting the flow + logger.error(f"Error parsing ID/URL {url}: {e}", exc_info=False) return None def process_openalex_work_data( @@ -107,7 +140,7 @@ def process_openalex_work_data( db: Session, work_db: Work, work_api_data: Dict[str, Any], - parent_chain: DiscoveryChain + parent_chain: DiscoveryChain, ) -> Tuple[List[str], List[str], Optional[str]]: """ Processes detailed work data from an OpenAlex API response. 
@@ -137,11 +170,15 @@ def process_openalex_work_data( """ # Input validation if not work_db or not parent_chain: - logger.error(f"Work DB object or Parent Chain is None. Aborting scholarly processing.") - # Return empty results indicating failure to process - return [], [], None - - self.logger.info(f"Starting scholarly processing for Work ID: {work_db.id} (OA: {work_db.openalex_id}) under Chain: {parent_chain.id}") + logger.error( + "Work DB object or Parent Chain is None. Aborting scholarly processing." + ) + # Return empty results indicating failure to process + return [], [], None + + self.logger.info( + f"Starting scholarly processing for Work ID: {work_db.id} (OA: {work_db.openalex_id}) under Chain: {parent_chain.id}" + ) # Initialize return values referenced_oa_ids: List[str] = [] related_oa_ids: List[str] = [] @@ -160,19 +197,23 @@ def process_openalex_work_data( # --- 1. Process Authorships and Affiliations --- try: # Retrieve authorship list from the API data - authorships_data = work_api_data.get('authorships', []) - self.logger.debug(f"Processing {len(authorships_data)} authorships for Work ID: {work_db.id}") + authorships_data = work_api_data.get("authorships", []) + self.logger.debug( + f"Processing {len(authorships_data)} authorships for Work ID: {work_db.id}" + ) # Iterate through each authorship entry for the work for authorship_item in authorships_data: # --- 1a. 
Process Author (Person) --- - author_data = authorship_item.get('author', {}) - person_oa_id = self._get_id_from_oa_url(author_data.get('id')) - person_name = author_data.get('display_name') + author_data = authorship_item.get("author", {}) + person_oa_id = self._get_id_from_oa_url(author_data.get("id")) + person_name = author_data.get("display_name") # Basic validation for essential author data if not person_oa_id or not person_name: - logger.warning(f"Skipping authorship due to missing person ID or name: {author_data}") - continue # Skip this authorship entry + logger.warning( + f"Skipping authorship due to missing person ID or name: {author_data}" + ) + continue # Skip this authorship entry person_db: Optional[Person] = None person_chain: Optional[DiscoveryChain] = None @@ -180,40 +221,67 @@ def process_openalex_work_data( # Prepare data for creating/updating the Person record person_input_data = { "openalex_id": person_oa_id, - "orcid": self._get_id_from_oa_url(author_data.get('orcid')), # Extract ORCID if available + "orcid": self._get_id_from_oa_url( + author_data.get("orcid") + ), # Extract ORCID if available "display_name": person_name, - "display_name_alternatives": author_data.get('display_name_alternatives', []) # Store alternative names + "display_name_alternatives": author_data.get( + "display_name_alternatives", [] + ), # Store alternative names } - person_input_data = {k: v for k, v in person_input_data.items() if v is not None} # Clean None values + person_input_data = { + k: v for k, v in person_input_data.items() if v is not None + } # Clean None values # Retrieve existing Person by OpenAlex ID or create a new one - person_db = person_repo.get_or_create_by_openalex_id(openalex_id=person_oa_id, obj_in_data=person_input_data) - db.flush() # Persist changes and ensure Person gets an ID if new + person_db = person_repo.get_or_create_by_openalex_id( + openalex_id=person_oa_id, obj_in_data=person_input_data + ) + db.flush() # Persist changes and ensure 
Person gets an ID if new if person_db.id is None: # If ID is still None after flush, something went wrong - raise RuntimeError(f"Person ID is None after flush for OA ID {person_oa_id}") + raise RuntimeError( + f"Person ID is None after flush for OA ID {person_oa_id}" + ) # Create a discovery chain record for this Person entity person_chain = discovery_chain_service.create_child_chain( - db=db, parent_chain=parent_chain, discovery_type='REL_PERSON_FROM_AUTHORSHIP', - parameters={'work_id': work_db.id, 'person_oa_id': person_oa_id} + db=db, + parent_chain=parent_chain, + discovery_type="REL_PERSON_FROM_AUTHORSHIP", + parameters={ + "work_id": work_db.id, + "person_oa_id": person_oa_id, + }, ) # Link the Person DB record to its discovery chain - discovery_chain_service.associate_entity(db=db, chain=person_chain, entity=person_db) + discovery_chain_service.associate_entity( + db=db, chain=person_chain, entity=person_db + ) # Mark the discovery chain for this person as complete discovery_chain_service.complete_chain(db=db, chain=person_chain) except (SQLAlchemyError, ValueError, RuntimeError) as e_person: # Handle errors specifically during Person processing - logger.error(f"Error processing Person OA ID {person_oa_id} for Work ID {work_db.id}: {e_person}", exc_info=False) + logger.error( + f"Error processing Person OA ID {person_oa_id} for Work ID {work_db.id}: {e_person}", + exc_info=False, + ) if person_chain: # Attempt to mark the associated discovery chain as FAILED (best-effort) - try: discovery_chain_service.fail_chain(db=db, chain=person_chain, error_message=str(e_person)) - except Exception as fail_err: logger.error(f"Failed attempt to mark person_chain {person_chain.id} as FAILED: {fail_err}") - raise e_person # Re-raise critical database or validation errors for transaction rollback by caller + try: + discovery_chain_service.fail_chain( + db=db, chain=person_chain, error_message=str(e_person) + ) + except Exception as fail_err: + logger.error( + f"Failed 
attempt to mark person_chain {person_chain.id} as FAILED: {fail_err}" + ) + raise e_person # Re-raise critical database or validation errors for transaction rollback by caller # If Person processing failed, skip the rest of the steps for this authorship - if not person_db: continue + if not person_db: + continue # --- 1b. Process Authorship Link --- # Create the link between the Work and the Person @@ -223,350 +291,592 @@ def process_openalex_work_data( # Ensure the person discovery chain exists before linking from it if not person_chain: # This state indicates an unexpected issue after successful person processing - raise RuntimeError(f"Person chain is None for Person {person_db.id}, cannot proceed with authorship link.") + raise RuntimeError( + f"Person chain is None for Person {person_db.id}, cannot proceed with authorship link." + ) # Create a discovery chain specifically for the Authorship link itself authorship_chain = discovery_chain_service.create_child_chain( - db=db, parent_chain=person_chain, discovery_type='LINK_AUTHORSHIP', - parameters={'work_id': work_db.id, 'person_id': person_db.id} + db=db, + parent_chain=person_chain, + discovery_type="LINK_AUTHORSHIP", + parameters={"work_id": work_db.id, "person_id": person_db.id}, ) # Check if this specific Work-Person authorship link already exists - existing_authorship = db.query(Authorship).filter_by(work_id=work_db.id, person_id=person_db.id).first() + existing_authorship = ( + db.query(Authorship) + .filter_by(work_id=work_db.id, person_id=person_db.id) + .first() + ) if existing_authorship: authorship_db = existing_authorship - logger.debug(f"Authorship link W:{work_db.id}/P:{person_db.id} already exists.") + logger.debug( + f"Authorship link W:{work_db.id}/P:{person_db.id} already exists." 
+ ) else: # Prepare data for the new Authorship link record authorship_input_data = { "work_id": work_db.id, "person_id": person_db.id, - "author_position": authorship_item.get('author_position'), # e.g., 'first', 'middle', 'last' - "is_corresponding": authorship_item.get('is_corresponding') # Boolean flag + "author_position": authorship_item.get( + "author_position" + ), # e.g., 'first', 'middle', 'last' + "is_corresponding": authorship_item.get( + "is_corresponding" + ), # Boolean flag } authorship_db = Authorship(**authorship_input_data) db.add(authorship_db) - db.flush() # Persist the new Authorship link - self.logger.info(f"Created Authorship W:{work_db.id}/P:{person_db.id}") + db.flush() # Persist the new Authorship link + self.logger.info( + f"Created Authorship W:{work_db.id}/P:{person_db.id}" + ) # Associate the Authorship link record with its discovery chain # Note: Authorship uses a composite primary key; associate_entity handles this. - discovery_chain_service.associate_entity(db=db, chain=authorship_chain, entity=authorship_db, is_direct=True) + discovery_chain_service.associate_entity( + db=db, + chain=authorship_chain, + entity=authorship_db, + is_direct=True, + ) # Mark the authorship link discovery chain as complete - discovery_chain_service.complete_chain(db=db, chain=authorship_chain) + discovery_chain_service.complete_chain( + db=db, chain=authorship_chain + ) except (SQLAlchemyError, ValueError, RuntimeError) as e_author: # Handle errors during Authorship link creation or flush - logger.error(f"Error creating/flushing Authorship W:{work_db.id}/P:{person_db.id}: {e_author}", exc_info=False) + logger.error( + f"Error creating/flushing Authorship W:{work_db.id}/P:{person_db.id}: {e_author}", + exc_info=False, + ) if authorship_chain: # Attempt to mark the chain as failed - try: discovery_chain_service.fail_chain(db=db, chain=authorship_chain, error_message=str(e_author)) - except Exception as fail_err: logger.error(f"Failed attempt to mark 
authorship_chain {authorship_chain.id} as FAILED: {fail_err}") - raise e_author # Re-raise critical errors + try: + discovery_chain_service.fail_chain( + db=db, + chain=authorship_chain, + error_message=str(e_author), + ) + except Exception as fail_err: + logger.error( + f"Failed attempt to mark authorship_chain {authorship_chain.id} as FAILED: {fail_err}" + ) + raise e_author # Re-raise critical errors # If Authorship link creation failed, skip processing affiliations for this author - if not authorship_db: continue + if not authorship_db: + continue # --- 1c. Process Affiliations (Institutions) --- # Iterate through the institutions listed for this specific authorship - institutions_data = authorship_item.get('institutions', []) + institutions_data = authorship_item.get("institutions", []) for institution_item in institutions_data: # Extract institution identifiers and name - inst_oa_id = self._get_id_from_oa_url(institution_item.get('id')) - inst_name = institution_item.get('display_name') + inst_oa_id = self._get_id_from_oa_url(institution_item.get("id")) + inst_name = institution_item.get("display_name") # Basic validation for institution data if not inst_oa_id or not inst_name: - logger.warning(f"Skipping affiliation due to missing institution ID or name: {institution_item}") - continue # Skip this institution entry + logger.warning( + f"Skipping affiliation due to missing institution ID or name: {institution_item}" + ) + continue # Skip this institution entry institution_db: Optional[Institution] = None institution_chain: Optional[DiscoveryChain] = None try: # Ensure the authorship chain exists before linking institution discovery to it if not authorship_chain: - # This indicates an unexpected state after successful authorship processing - raise RuntimeError(f"Authorship chain is None for Auth W:{authorship_db.work_id}/P:{authorship_db.person_id}, cannot process institution.") + # This indicates an unexpected state after successful authorship processing + 
raise RuntimeError( + f"Authorship chain is None for Auth W:{authorship_db.work_id}/P:{authorship_db.person_id}, cannot process institution." + ) # Prepare data for creating/updating the Institution record inst_input_data = { "openalex_id": inst_oa_id, - "ror": self._get_id_from_oa_url(institution_item.get('ror')), # Extract ROR if present + "ror": self._get_id_from_oa_url( + institution_item.get("ror") + ), # Extract ROR if present "display_name": inst_name, - "country_code": institution_item.get('country_code'), - "type": institution_item.get('type') # e.g., 'education', 'company', 'government' + "country_code": institution_item.get("country_code"), + "type": institution_item.get( + "type" + ), # e.g., 'education', 'company', 'government' } - inst_input_data = {k: v for k, v in inst_input_data.items() if v is not None} # Clean None values + inst_input_data = { + k: v for k, v in inst_input_data.items() if v is not None + } # Clean None values # Retrieve existing Institution by OpenAlex ID or create a new one - institution_db = institution_repo.get_or_create_by_openalex_id(openalex_id=inst_oa_id, obj_in_data=inst_input_data) - db.flush() # Persist changes and ensure Institution gets an ID if new + institution_db = institution_repo.get_or_create_by_openalex_id( + openalex_id=inst_oa_id, obj_in_data=inst_input_data + ) + db.flush() # Persist changes and ensure Institution gets an ID if new if institution_db.id is None: - # If ID is still None after flush, something went wrong - raise RuntimeError(f"Institution ID is None after flush for OA ID {inst_oa_id}") + # If ID is still None after flush, something went wrong + raise RuntimeError( + f"Institution ID is None after flush for OA ID {inst_oa_id}" + ) # Create a discovery chain record for this Institution entity institution_chain = discovery_chain_service.create_child_chain( - db=db, parent_chain=authorship_chain, discovery_type='REL_INST_FROM_AFFILIATION', - parameters={'authorship': 
f"W:{work_db.id}/P:{person_db.id}", 'inst_oa_id': inst_oa_id} + db=db, + parent_chain=authorship_chain, + discovery_type="REL_INST_FROM_AFFILIATION", + parameters={ + "authorship": f"W:{work_db.id}/P:{person_db.id}", + "inst_oa_id": inst_oa_id, + }, ) # Link the Institution DB record to its discovery chain - discovery_chain_service.associate_entity(db=db, chain=institution_chain, entity=institution_db) + discovery_chain_service.associate_entity( + db=db, chain=institution_chain, entity=institution_db + ) # Mark the discovery chain for this institution as complete - discovery_chain_service.complete_chain(db=db, chain=institution_chain) + discovery_chain_service.complete_chain( + db=db, chain=institution_chain + ) except (SQLAlchemyError, ValueError, RuntimeError) as e_inst: # Handle errors specifically during Institution processing - logger.error(f"Error processing Inst OA ID {inst_oa_id} for Auth W:{work_db.id}/P:{person_db.id}: {e_inst}", exc_info=False) + logger.error( + f"Error processing Inst OA ID {inst_oa_id} for Auth W:{work_db.id}/P:{person_db.id}: {e_inst}", + exc_info=False, + ) if institution_chain: # Attempt to mark the chain as failed - try: discovery_chain_service.fail_chain(db=db, chain=institution_chain, error_message=str(e_inst)) - except Exception as fail_err: logger.error(f"Failed attempt to mark institution_chain {institution_chain.id} as FAILED: {fail_err}") - raise e_inst # Re-raise critical errors + try: + discovery_chain_service.fail_chain( + db=db, + chain=institution_chain, + error_message=str(e_inst), + ) + except Exception as fail_err: + logger.error( + f"Failed attempt to mark institution_chain {institution_chain.id} as FAILED: {fail_err}" + ) + raise e_inst # Re-raise critical errors # If Institution processing failed, skip creating the affiliation link - if not institution_db: continue + if not institution_db: + continue # --- 1d. 
Process Affiliation Link --- # Create the link between the Authorship (Work-Person) and the Institution affiliation_db: Optional[Affiliation] = None affiliation_chain: Optional[DiscoveryChain] = None try: - # Ensure the institution discovery chain exists before linking from it - if not institution_chain: - # Indicates an unexpected state after successful institution processing - raise RuntimeError(f"Institution chain is None for Inst {institution_db.id}, cannot process affiliation link.") - - # Create a discovery chain specifically for the Affiliation link itself - affiliation_chain = discovery_chain_service.create_child_chain( - db=db, parent_chain=institution_chain, discovery_type='LINK_AFFILIATION', - parameters={'institution_id': institution_db.id} # Link refers back to institution - ) - - # Check if this specific Authorship-Institution affiliation link already exists - existing_affiliation = db.query(Affiliation).filter_by( - authorship_work_id=authorship_db.work_id, - authorship_person_id=authorship_db.person_id, - institution_id=institution_db.id - ).first() - - if existing_affiliation: - affiliation_db = existing_affiliation - logger.debug(f"Affiliation link Auth W:{authorship_db.work_id}/P:{person_db.id}, Inst {institution_db.id} already exists.") - else: - # Prepare data for the new Affiliation link record (uses composite FK) - affiliation_input_data = { - "authorship_work_id": authorship_db.work_id, # Part of composite FK to Authorship - "authorship_person_id": authorship_db.person_id, # Part of composite FK to Authorship - "institution_id": institution_db.id # FK to Institution - } - affiliation_db = Affiliation(**affiliation_input_data) - db.add(affiliation_db) - db.flush() # Persist the new Affiliation link - self.logger.info(f"Created Affiliation Auth W:{authorship_db.work_id}/P:{person_db.id}, Inst {institution_db.id}") - - # Associate the Affiliation link record with its discovery chain - # Note: Affiliation uses a composite primary key; 
associate_entity handles this. - discovery_chain_service.associate_entity(db=db, chain=affiliation_chain, entity=affiliation_db, is_direct=True) - # Mark the affiliation link discovery chain as complete - discovery_chain_service.complete_chain(db=db, chain=affiliation_chain) + # Ensure the institution discovery chain exists before linking from it + if not institution_chain: + # Indicates an unexpected state after successful institution processing + raise RuntimeError( + f"Institution chain is None for Inst {institution_db.id}, cannot process affiliation link." + ) + + # Create a discovery chain specifically for the Affiliation link itself + affiliation_chain = discovery_chain_service.create_child_chain( + db=db, + parent_chain=institution_chain, + discovery_type="LINK_AFFILIATION", + parameters={ + "institution_id": institution_db.id + }, # Link refers back to institution + ) + + # Check if this specific Authorship-Institution affiliation link already exists + existing_affiliation = ( + db.query(Affiliation) + .filter_by( + authorship_work_id=authorship_db.work_id, + authorship_person_id=authorship_db.person_id, + institution_id=institution_db.id, + ) + .first() + ) + + if existing_affiliation: + affiliation_db = existing_affiliation + logger.debug( + f"Affiliation link Auth W:{authorship_db.work_id}/P:{person_db.id}, Inst {institution_db.id} already exists." 
+ ) + else: + # Prepare data for the new Affiliation link record (uses composite FK) + affiliation_input_data = { + "authorship_work_id": authorship_db.work_id, # Part of composite FK to Authorship + "authorship_person_id": authorship_db.person_id, # Part of composite FK to Authorship + "institution_id": institution_db.id, # FK to Institution + } + affiliation_db = Affiliation(**affiliation_input_data) + db.add(affiliation_db) + db.flush() # Persist the new Affiliation link + self.logger.info( + f"Created Affiliation Auth W:{authorship_db.work_id}/P:{person_db.id}, Inst {institution_db.id}" + ) + + # Associate the Affiliation link record with its discovery chain + # Note: Affiliation uses a composite primary key; associate_entity handles this. + discovery_chain_service.associate_entity( + db=db, + chain=affiliation_chain, + entity=affiliation_db, + is_direct=True, + ) + # Mark the affiliation link discovery chain as complete + discovery_chain_service.complete_chain( + db=db, chain=affiliation_chain + ) except (SQLAlchemyError, ValueError, RuntimeError) as e_affil: - # Handle errors during Affiliation link creation or flush - logger.error(f"Error creating/flushing Affiliation Auth W:{authorship_db.work_id}/P:{person_db.id}, Inst {institution_db.id}: {e_affil}", exc_info=False); - if affiliation_chain: - # Attempt to mark the chain as failed - try: discovery_chain_service.fail_chain(db=db, chain=affiliation_chain, error_message=str(e_affil)) - except Exception as fail_err: logger.error(f"Failed attempt to mark affiliation_chain {affiliation_chain.id} as FAILED: {fail_err}") - raise e_affil # Re-raise critical errors + # Handle errors during Affiliation link creation or flush + logger.error( + f"Error creating/flushing Affiliation Auth W:{authorship_db.work_id}/P:{person_db.id}, Inst {institution_db.id}: {e_affil}", + exc_info=False, + ) + if affiliation_chain: + # Attempt to mark the chain as failed + try: + discovery_chain_service.fail_chain( + db=db, + 
chain=affiliation_chain, + error_message=str(e_affil), + ) + except Exception as fail_err: + logger.error( + f"Failed attempt to mark affiliation_chain {affiliation_chain.id} as FAILED: {fail_err}" + ) + raise e_affil # Re-raise critical errors # Catch potential errors in the setup or iteration of the main authorships loop itself except Exception as e_auth_outer: - logger.error(f"Critical error during authorship/affiliation processing loop for Work ID {work_db.id}: {e_auth_outer}", exc_info=True) + logger.error( + f"Critical error during authorship/affiliation processing loop for Work ID {work_db.id}: {e_auth_outer}", + exc_info=True, + ) # Re-raise to indicate a failure in this major processing block, likely requiring transaction rollback raise e_auth_outer # --- 2. Process Topics and Hierarchy --- try: # Retrieve primary topic and list of other topics from the API data - primary_topic_data = work_api_data.get('primary_topic') - topics_data = work_api_data.get('topics', []) - all_topic_entries = [] # Combined list to process, ensuring uniqueness - processed_topic_oa_ids: Set[str] = set() # Track OpenAlex IDs to avoid duplicates + primary_topic_data = work_api_data.get("primary_topic") + topics_data = work_api_data.get("topics", []) + all_topic_entries = [] # Combined list to process, ensuring uniqueness + processed_topic_oa_ids: Set[str] = ( + set() + ) # Track OpenAlex IDs to avoid duplicates # Add the primary topic if it's valid and provided as a dictionary if primary_topic_data and isinstance(primary_topic_data, dict): - primary_topic_data['is_primary'] = True # Mark this entry as the primary topic + primary_topic_data["is_primary"] = ( + True # Mark this entry as the primary topic + ) all_topic_entries.append(primary_topic_data) - primary_topic_oa_id = self._get_id_from_oa_url(primary_topic_data.get('id')) + primary_topic_oa_id = self._get_id_from_oa_url( + primary_topic_data.get("id") + ) if primary_topic_oa_id: - 
processed_topic_oa_ids.add(primary_topic_oa_id) # Track its ID + processed_topic_oa_ids.add(primary_topic_oa_id) # Track its ID elif primary_topic_data: - # Log if primary topic data is present but not in the expected dictionary format - logger.warning(f"Primary topic data for work {work_db.id} is not a dictionary: {type(primary_topic_data)}") - + # Log if primary topic data is present but not in the expected dictionary format + logger.warning( + f"Primary topic data for work {work_db.id} is not a dictionary: {type(primary_topic_data)}" + ) # Add other topics from the list if valid and not already added as the primary topic if isinstance(topics_data, list): - for topic_item in topics_data: - # Ensure each item in the list is a dictionary - if not isinstance(topic_item, dict): - logger.warning(f"Skipping non-dictionary item in topics list for work {work_db.id}: {topic_item}") - continue - topic_oa_id = self._get_id_from_oa_url(topic_item.get('id')) - # Add only if it has a valid ID and wasn't the primary topic already processed - if topic_oa_id and topic_oa_id not in processed_topic_oa_ids: - topic_item['is_primary'] = False # Mark as not the primary topic - all_topic_entries.append(topic_item) - processed_topic_oa_ids.add(topic_oa_id) # Track its ID + for topic_item in topics_data: + # Ensure each item in the list is a dictionary + if not isinstance(topic_item, dict): + logger.warning( + f"Skipping non-dictionary item in topics list for work {work_db.id}: {topic_item}" + ) + continue + topic_oa_id = self._get_id_from_oa_url(topic_item.get("id")) + # Add only if it has a valid ID and wasn't the primary topic already processed + if topic_oa_id and topic_oa_id not in processed_topic_oa_ids: + topic_item["is_primary"] = ( + False # Mark as not the primary topic + ) + all_topic_entries.append(topic_item) + processed_topic_oa_ids.add(topic_oa_id) # Track its ID elif topics_data: - # Log if topics data is present but not in the expected list format - 
logger.warning(f"Topics data for work {work_db.id} is not a list: {type(topics_data)}") + # Log if topics data is present but not in the expected list format + logger.warning( + f"Topics data for work {work_db.id} is not a list: {type(topics_data)}" + ) - self.logger.debug(f"Processing {len(all_topic_entries)} unique topic entries for Work ID: {work_db.id}") + self.logger.debug( + f"Processing {len(all_topic_entries)} unique topic entries for Work ID: {work_db.id}" + ) # Process each unique topic entry found for the work for topic_entry in all_topic_entries: - topic_oa_id = self._get_id_from_oa_url(topic_entry.get('id')) - topic_name = topic_entry.get('display_name') + topic_oa_id = self._get_id_from_oa_url(topic_entry.get("id")) + topic_name = topic_entry.get("display_name") # Basic validation for the topic entry itself if not topic_oa_id or not topic_name: - logger.warning(f"Skipping topic entry due to missing ID or name: {topic_entry}") - continue # Skip this topic entry + logger.warning( + f"Skipping topic entry due to missing ID or name: {topic_entry}" + ) + continue # Skip this topic entry # Variables to hold the database objects for the topic and its hierarchy domain_db: Optional[Domain] = None field_db: Optional[Field] = None subfield_db: Optional[Subfield] = None topic_db: Optional[Topic] = None - work_topic_db: Optional[WorkTopic] = None # The Work <-> Topic link object - topic_entry_chain: Optional[DiscoveryChain] = None # Provenance chain for this entry + work_topic_db: Optional[WorkTopic] = ( + None # The Work <-> Topic link object + ) + topic_entry_chain: Optional[DiscoveryChain] = ( + None # Provenance chain for this entry + ) try: # Create a discovery chain for processing this specific topic entry and its hierarchy topic_entry_chain = discovery_chain_service.create_child_chain( - db=db, parent_chain=parent_chain, discovery_type='REL_TOPIC_ENTRY', - parameters={'work_id': work_db.id, 'topic_oa_id': topic_oa_id} + db=db, + parent_chain=parent_chain, + 
discovery_type="REL_TOPIC_ENTRY", + parameters={"work_id": work_db.id, "topic_oa_id": topic_oa_id}, ) # --- Process Hierarchy (Domain -> Field -> Subfield -> Topic) --- # Traverse the hierarchy provided within the topic entry data # 2a. Domain (Top Level) - domain_data = topic_entry.get('domain', {}) - domain_id_url = domain_data.get('id') - domain_oa_id = self._get_id_from_oa_url(domain_id_url) if domain_id_url else None + domain_data = topic_entry.get("domain", {}) + domain_id_url = domain_data.get("id") + domain_oa_id = ( + self._get_id_from_oa_url(domain_id_url) + if domain_id_url + else None + ) # Domain is essential for the hierarchy; skip if missing if not domain_oa_id: - logger.warning(f"Missing Domain ID/URL for Topic {topic_oa_id}, skipping hierarchy processing for this entry.") - # Fail the chain for this topic entry if essential hierarchy is missing - discovery_chain_service.fail_chain(db, topic_entry_chain, "Missing Domain ID") - continue # Move to the next topic entry - domain_input = {"openalex_id": domain_oa_id, "display_name": domain_data.get('display_name', 'Unknown Domain')} - domain_db = domain_repo.get_or_create_by_openalex_id(openalex_id=domain_oa_id, obj_in_data=domain_input) - db.flush(); # Ensure Domain object has an ID - if domain_db.id is None: raise RuntimeError(f"Domain ID is None after flush for OA ID {domain_oa_id}") + logger.warning( + f"Missing Domain ID/URL for Topic {topic_oa_id}, skipping hierarchy processing for this entry." 
+ ) + # Fail the chain for this topic entry if essential hierarchy is missing + discovery_chain_service.fail_chain( + db, topic_entry_chain, "Missing Domain ID" + ) + continue # Move to the next topic entry + domain_input = { + "openalex_id": domain_oa_id, + "display_name": domain_data.get( + "display_name", "Unknown Domain" + ), + } + domain_db = domain_repo.get_or_create_by_openalex_id( + openalex_id=domain_oa_id, obj_in_data=domain_input + ) + db.flush() # Ensure Domain object has an ID + if domain_db.id is None: + raise RuntimeError( + f"Domain ID is None after flush for OA ID {domain_oa_id}" + ) # Associate the Domain with the topic entry chain (indirect discovery) - discovery_chain_service.associate_entity(db=db, chain=topic_entry_chain, entity=domain_db, is_direct=False) + discovery_chain_service.associate_entity( + db=db, + chain=topic_entry_chain, + entity=domain_db, + is_direct=False, + ) # 2b. Field (Child of Domain) - field_data = topic_entry.get('field', {}) - field_id_url = field_data.get('id') - field_oa_id = self._get_id_from_oa_url(field_id_url) if field_id_url else None + field_data = topic_entry.get("field", {}) + field_id_url = field_data.get("id") + field_oa_id = ( + self._get_id_from_oa_url(field_id_url) if field_id_url else None + ) # Proceed only if Field ID is present and the parent Domain was processed successfully if not field_oa_id or not (domain_db and domain_db.id): - logger.warning(f"Missing Field ID/URL or Domain DB/ID for Topic {topic_oa_id}, skipping Field/Subfield/Topic.") - discovery_chain_service.fail_chain(db, topic_entry_chain, "Missing Field ID or Domain") - continue # Move to the next topic entry - field_input = {"openalex_id": field_oa_id, "display_name": field_data.get('display_name', 'Unknown Field'), "domain_id": domain_db.id} - field_db = field_repo.get_or_create_by_openalex_id(openalex_id=field_oa_id, obj_in_data=field_input) - db.flush(); # Ensure Field object has an ID - if field_db.id is None: raise 
RuntimeError(f"Field ID is None after flush for OA ID {field_oa_id}") + logger.warning( + f"Missing Field ID/URL or Domain DB/ID for Topic {topic_oa_id}, skipping Field/Subfield/Topic." + ) + discovery_chain_service.fail_chain( + db, topic_entry_chain, "Missing Field ID or Domain" + ) + continue # Move to the next topic entry + field_input = { + "openalex_id": field_oa_id, + "display_name": field_data.get("display_name", "Unknown Field"), + "domain_id": domain_db.id, + } + field_db = field_repo.get_or_create_by_openalex_id( + openalex_id=field_oa_id, obj_in_data=field_input + ) + db.flush() # Ensure Field object has an ID + if field_db.id is None: + raise RuntimeError( + f"Field ID is None after flush for OA ID {field_oa_id}" + ) # Associate the Field (indirect discovery) - discovery_chain_service.associate_entity(db=db, chain=topic_entry_chain, entity=field_db, is_direct=False) + discovery_chain_service.associate_entity( + db=db, chain=topic_entry_chain, entity=field_db, is_direct=False + ) # 2c. 
Subfield (Child of Field) - subfield_data = topic_entry.get('subfield', {}) - subfield_id_url = subfield_data.get('id') - subfield_oa_id = self._get_id_from_oa_url(subfield_id_url) if subfield_id_url else None + subfield_data = topic_entry.get("subfield", {}) + subfield_id_url = subfield_data.get("id") + subfield_oa_id = ( + self._get_id_from_oa_url(subfield_id_url) + if subfield_id_url + else None + ) # Proceed only if Subfield ID is present and the parent Field was processed successfully if not subfield_oa_id or not (field_db and field_db.id): - logger.warning(f"Missing Subfield ID/URL or Field DB/ID for Topic {topic_oa_id}, skipping Subfield/Topic.") - discovery_chain_service.fail_chain(db, topic_entry_chain, "Missing Subfield ID or Field") - continue # Move to the next topic entry - subfield_input = {"openalex_id": subfield_oa_id, "display_name": subfield_data.get('display_name', 'Unknown Subfield'), "field_id": field_db.id} - subfield_db = subfield_repo.get_or_create_by_openalex_id(openalex_id=subfield_oa_id, obj_in_data=subfield_input) - db.flush(); # Ensure Subfield object has an ID - if subfield_db.id is None: raise RuntimeError(f"Subfield ID is None after flush for OA ID {subfield_oa_id}") + logger.warning( + f"Missing Subfield ID/URL or Field DB/ID for Topic {topic_oa_id}, skipping Subfield/Topic." 
+ ) + discovery_chain_service.fail_chain( + db, topic_entry_chain, "Missing Subfield ID or Field" + ) + continue # Move to the next topic entry + subfield_input = { + "openalex_id": subfield_oa_id, + "display_name": subfield_data.get( + "display_name", "Unknown Subfield" + ), + "field_id": field_db.id, + } + subfield_db = subfield_repo.get_or_create_by_openalex_id( + openalex_id=subfield_oa_id, obj_in_data=subfield_input + ) + db.flush() # Ensure Subfield object has an ID + if subfield_db.id is None: + raise RuntimeError( + f"Subfield ID is None after flush for OA ID {subfield_oa_id}" + ) # Associate the Subfield (indirect discovery) - discovery_chain_service.associate_entity(db=db, chain=topic_entry_chain, entity=subfield_db, is_direct=False) + discovery_chain_service.associate_entity( + db=db, + chain=topic_entry_chain, + entity=subfield_db, + is_direct=False, + ) # 2d. Topic (Child of Subfield - Leaf Level) # Proceed only if the Topic ID itself is valid and the parent Subfield was processed successfully if not topic_oa_id or not (subfield_db and subfield_db.id): - logger.warning(f"Missing Topic ID or Subfield DB/ID for Topic OA ID {topic_oa_id}.") - discovery_chain_service.fail_chain(db, topic_entry_chain, "Missing Topic ID or Subfield") - continue # Move to the next topic entry + logger.warning( + f"Missing Topic ID or Subfield DB/ID for Topic OA ID {topic_oa_id}." 
+ ) + discovery_chain_service.fail_chain( + db, topic_entry_chain, "Missing Topic ID or Subfield" + ) + continue # Move to the next topic entry topic_input = { "openalex_id": topic_oa_id, "display_name": topic_name, - "description": topic_entry.get('description'), # Optional description from OpenAlex - "subfield_id": subfield_db.id # Link to parent Subfield + "description": topic_entry.get( + "description" + ), # Optional description from OpenAlex + "subfield_id": subfield_db.id, # Link to parent Subfield } - topic_input = {k: v for k, v in topic_input.items() if v is not None} # Clean None values - topic_db = topic_repo.get_or_create_by_openalex_id(openalex_id=topic_oa_id, obj_in_data=topic_input) - db.flush(); # Ensure Topic object has an ID - if topic_db.id is None: raise RuntimeError(f"Topic ID is None after flush for OA ID {topic_oa_id}") + topic_input = { + k: v for k, v in topic_input.items() if v is not None + } # Clean None values + topic_db = topic_repo.get_or_create_by_openalex_id( + openalex_id=topic_oa_id, obj_in_data=topic_input + ) + db.flush() # Ensure Topic object has an ID + if topic_db.id is None: + raise RuntimeError( + f"Topic ID is None after flush for OA ID {topic_oa_id}" + ) # Associate the Topic (direct discovery for this topic entry) - discovery_chain_service.associate_entity(db=db, chain=topic_entry_chain, entity=topic_db, is_direct=True) + discovery_chain_service.associate_entity( + db=db, chain=topic_entry_chain, entity=topic_db, is_direct=True + ) # 2e. 
WorkTopic Association (Link the Work to the processed Topic) # Proceed only if the Topic object was successfully processed if not (topic_db and topic_db.id): - logger.warning(f"Missing Topic DB/ID for Topic {topic_oa_id}, cannot create WorkTopic link.") - discovery_chain_service.fail_chain(db, topic_entry_chain, "Missing Topic DB/ID for association") - continue # Move to the next topic entry + logger.warning( + f"Missing Topic DB/ID for Topic {topic_oa_id}, cannot create WorkTopic link." + ) + discovery_chain_service.fail_chain( + db, topic_entry_chain, "Missing Topic DB/ID for association" + ) + continue # Move to the next topic entry # Check if the specific Work-Topic link already exists in the database - existing_work_topic = db.query(WorkTopic).filter_by(work_id=work_db.id, topic_id=topic_db.id).first() + existing_work_topic = ( + db.query(WorkTopic) + .filter_by(work_id=work_db.id, topic_id=topic_db.id) + .first() + ) if not existing_work_topic: # Create the association record linking the Work and Topic work_topic_input = { "work_id": work_db.id, "topic_id": topic_db.id, - "score": topic_entry.get('score'), # Store the relevance score from OpenAlex - "is_primary": topic_entry.get('is_primary', False) # Store whether this was the primary topic + "score": topic_entry.get( + "score" + ), # Store the relevance score from OpenAlex + "is_primary": topic_entry.get( + "is_primary", False + ), # Store whether this was the primary topic } work_topic_db = WorkTopic(**work_topic_input) db.add(work_topic_db) - db.flush() # Persist the link - self.logger.info(f"Created WorkTopic link W:{work_db.id} <-> T:{topic_db.id}") + db.flush() # Persist the link + self.logger.info( + f"Created WorkTopic link W:{work_db.id} <-> T:{topic_db.id}" + ) # Associate the WorkTopic link record itself with the discovery chain # Note: WorkTopic uses a composite primary key; associate_entity handles this. 
- discovery_chain_service.associate_entity(db=db, chain=topic_entry_chain, entity=work_topic_db, is_direct=True) + discovery_chain_service.associate_entity( + db=db, + chain=topic_entry_chain, + entity=work_topic_db, + is_direct=True, + ) else: # Link already exists, no action needed for creation - self.logger.debug(f"WorkTopic link W:{work_db.id} <-> T:{topic_db.id} already exists.") - work_topic_db = existing_work_topic # Assign if needed for potential future use + self.logger.debug( + f"WorkTopic link W:{work_db.id} <-> T:{topic_db.id} already exists." + ) + work_topic_db = existing_work_topic # Assign if needed for potential future use # Mark the discovery chain for this entire topic entry (including hierarchy) as complete - discovery_chain_service.complete_chain(db=db, chain=topic_entry_chain) + discovery_chain_service.complete_chain( + db=db, chain=topic_entry_chain + ) except (SQLAlchemyError, ValueError, RuntimeError) as e_topic_hierarchy: # Catch errors occurring during the processing of a SINGLE topic entry's hierarchy or link - logger.error(f"Error processing hierarchy/link for Topic OA ID {topic_oa_id} for Work ID {work_db.id}: {e_topic_hierarchy}", exc_info=False) # Keep log concise for production + logger.error( + f"Error processing hierarchy/link for Topic OA ID {topic_oa_id} for Work ID {work_db.id}: {e_topic_hierarchy}", + exc_info=False, + ) # Keep log concise for production if topic_entry_chain: # Attempt to mark the specific topic entry chain as failed try: - discovery_chain_service.fail_chain(db=db, chain=topic_entry_chain, error_message=str(e_topic_hierarchy)) + discovery_chain_service.fail_chain( + db=db, + chain=topic_entry_chain, + error_message=str(e_topic_hierarchy), + ) except Exception as fail_err: # Log error during failure handling itself - logger.error(f"Failed attempt to mark topic_entry_chain {topic_entry_chain.id} as FAILED: {fail_err}") + logger.error( + f"Failed attempt to mark topic_entry_chain {topic_entry_chain.id} as FAILED: 
{fail_err}" + ) # Re-raise critical database or validation errors to allow transaction rollback by caller raise e_topic_hierarchy # Catch potential errors in the setup or iteration of the main topics loop itself except Exception as e_topic_outer: - logger.error(f"Critical error during topic processing setup/loop for Work ID {work_db.id}: {e_topic_outer}", exc_info=True) + logger.error( + f"Critical error during topic processing setup/loop for Work ID {work_db.id}: {e_topic_outer}", + exc_info=True, + ) # Re-raise to indicate a failure in this major processing block raise e_topic_outer @@ -576,32 +886,56 @@ def process_openalex_work_data( # primarily for enqueueing further background processing tasks. try: # Get relevant fields from the OpenAlex API data dictionary - referenced_work_urls = work_api_data.get('referenced_works', []) # Works cited BY this work - related_work_urls = work_api_data.get('related_works', []) # Semantically related works - cited_by_api_url = work_api_data.get('cited_by_api_url') # API endpoint to get works CITING this work + referenced_work_urls = work_api_data.get( + "referenced_works", [] + ) # Works cited BY this work + related_work_urls = work_api_data.get( + "related_works", [] + ) # Semantically related works + cited_by_api_url = work_api_data.get( + "cited_by_api_url" + ) # API endpoint to get works CITING this work # Extract the OpenAlex IDs from the provided URLs using the helper function # Use list comprehensions for concise extraction and filtering - referenced_oa_ids = [oa_id for url in referenced_work_urls if isinstance(url, str) and (oa_id := self._get_id_from_oa_url(url))] - related_oa_ids = [oa_id for url in related_work_urls if isinstance(url, str) and (oa_id := self._get_id_from_oa_url(url))] + referenced_oa_ids = [ + oa_id + for url in referenced_work_urls + if isinstance(url, str) and (oa_id := self._get_id_from_oa_url(url)) + ] + related_oa_ids = [ + oa_id + for url in related_work_urls + if isinstance(url, str) and 
(oa_id := self._get_id_from_oa_url(url)) + ] # Filter out any None values that might result from failed ID parsing referenced_oa_ids = [id for id in referenced_oa_ids if id is not None] related_oa_ids = [id for id in related_oa_ids if id is not None] - - self.logger.debug(f"Extracted {len(referenced_oa_ids)} referenced work IDs for Work ID: {work_db.id}") - self.logger.debug(f"Extracted {len(related_oa_ids)} related work IDs for Work ID: {work_db.id}") - self.logger.debug(f"Extracted cited_by_api_url: {'Present' if cited_by_api_url else 'Absent'}") + self.logger.debug( + f"Extracted {len(referenced_oa_ids)} referenced work IDs for Work ID: {work_db.id}" + ) + self.logger.debug( + f"Extracted {len(related_oa_ids)} related work IDs for Work ID: {work_db.id}" + ) + self.logger.debug( + f"Extracted cited_by_api_url: {'Present' if cited_by_api_url else 'Absent'}" + ) except Exception as e_ref_extract: - # Handle potential errors during the extraction of these lists/URL - logger.error(f"Error extracting referenced/related works lists or cited_by_url for Work ID {work_db.id}: {e_ref_extract}", exc_info=True) - # Reset lists/URL to safe defaults if extraction fails - referenced_oa_ids = [] - related_oa_ids = [] - cited_by_api_url = None - - self.logger.info(f"Finished scholarly processing for Work ID: {work_db.id} (OA: {work_db.openalex_id})") + # Handle potential errors during the extraction of these lists/URL + logger.error( + f"Error extracting referenced/related works lists or cited_by_url for Work ID {work_db.id}: {e_ref_extract}", + exc_info=True, + ) + # Reset lists/URL to safe defaults if extraction fails + referenced_oa_ids = [] + related_oa_ids = [] + cited_by_api_url = None + + self.logger.info( + f"Finished scholarly processing for Work ID: {work_db.id} (OA: {work_db.openalex_id})" + ) # Return the extracted IDs and URL needed by the caller - return referenced_oa_ids, related_oa_ids, cited_by_api_url \ No newline at end of file + return referenced_oa_ids, 
related_oa_ids, cited_by_api_url diff --git a/backend/services/surfacing_service.py b/backend/services/surfacing_service.py index 75a6f71..a05a6cb 100644 --- a/backend/services/surfacing_service.py +++ b/backend/services/surfacing_service.py @@ -7,26 +7,37 @@ """ import logging -from typing import List, Optional, Dict, Any # Add Optional, Dict, Any +from typing import List, Dict, Any # Add Optional, Dict, Any -from sqlalchemy.orm import Session, aliased, joinedload, contains_eager -from sqlalchemy import func, distinct, select, and_ # Add and_ +from sqlalchemy.orm import Session, aliased, joinedload +from sqlalchemy import func, distinct, select, and_ # Add and_ # Import necessary models representing graph entities and relationships from backend.data.models import ( - Work, Repository, WorkCitation, DOIReference, Contributor, Person, Institution, - RepositoryContributorAssociation, Authorship, Affiliation, - RepositoryInstitutionAffiliation, # Model for stored affiliation predictions - SoftwareDependency # Model for dependencies + Work, + Repository, + WorkCitation, + DOIReference, + Contributor, + Person, + Institution, + RepositoryContributorAssociation, + Authorship, + Affiliation, + RepositoryInstitutionAffiliation, # Model for stored affiliation predictions + SoftwareDependency, # Model for dependencies ) + # Import Repositories for direct data access where needed from backend.data.repositories import ( - DOIReferenceRepository, SoftwareDependencyRepository + DOIReferenceRepository, + SoftwareDependencyRepository, ) from .base_service import BaseService logger = logging.getLogger(__name__) + class SurfacingService(BaseService): """ Service layer for retrieving connected information from the MOSS knowledge graph. 
@@ -76,10 +87,11 @@ def get_works_for_repository(self, db: Session, repository_id: int) -> List[Work works.append(ref.work) unique_work_ids.add(ref.work.id) - logger.info(f"Found {len(works)} unique Works for Repository ID: {repository_id}") + logger.info( + f"Found {len(works)} unique Works for Repository ID: {repository_id}" + ) return works - def get_repositories_for_work(self, db: Session, work_id: int) -> List[Repository]: """ Retrieves all unique Repositories where a reference to a given Work ID @@ -101,12 +113,14 @@ def get_repositories_for_work(self, db: Session, work_id: int) -> List[Repositor unique_repo_ids = set() repositories = [] for ref in references: - # Ensure the reference links to a repository and it hasn't been added already + # Ensure the reference links to a repository and it hasn't been added already if ref.repository and ref.repository.id not in unique_repo_ids: repositories.append(ref.repository) unique_repo_ids.add(ref.repository.id) - logger.info(f"Found {len(repositories)} unique Repositories for Work ID: {work_id}") + logger.info( + f"Found {len(repositories)} unique Repositories for Work ID: {work_id}" + ) return repositories # --- Methods for Work <-> Work Citation Connections --- @@ -129,17 +143,22 @@ def get_works_cited_by(self, db: Session, work_id: int) -> List[Work]: # Query the WorkCitation link table, filtering by the 'cited_work_id' # Eager load the 'citing_work' relationship to avoid N+1 queries if accessing citing work details later. 
- citations = db.query(WorkCitation)\ - .filter(WorkCitation.cited_work_id == work_id)\ - .options(joinedload(WorkCitation.citing_work))\ - .all() + citations = ( + db.query(WorkCitation) + .filter(WorkCitation.cited_work_id == work_id) + .options(joinedload(WorkCitation.citing_work)) + .all() + ) if citations: - for citation_link in citations: - # Add the citing work if it exists and hasn't been added yet - if citation_link.citing_work and citation_link.citing_work.id not in unique_citing_work_ids: - citing_works.append(citation_link.citing_work) - unique_citing_work_ids.add(citation_link.citing_work.id) + for citation_link in citations: + # Add the citing work if it exists and hasn't been added yet + if ( + citation_link.citing_work + and citation_link.citing_work.id not in unique_citing_work_ids + ): + citing_works.append(citation_link.citing_work) + unique_citing_work_ids.add(citation_link.citing_work.id) logger.info(f"Found {len(citing_works)} unique Works citing Work ID: {work_id}") return citing_works @@ -162,24 +181,33 @@ def get_works_citing(self, db: Session, work_id: int) -> List[Work]: # Query the WorkCitation link table, filtering by the 'citing_work_id' # Eager load the 'cited_work' relationship. 
- references = db.query(WorkCitation)\ - .filter(WorkCitation.citing_work_id == work_id)\ - .options(joinedload(WorkCitation.cited_work))\ - .all() + references = ( + db.query(WorkCitation) + .filter(WorkCitation.citing_work_id == work_id) + .options(joinedload(WorkCitation.cited_work)) + .all() + ) if references: - for reference_link in references: - # Add the cited work if it exists and hasn't been added yet - if reference_link.cited_work and reference_link.cited_work.id not in unique_cited_work_ids: - cited_works.append(reference_link.cited_work) - unique_cited_work_ids.add(reference_link.cited_work.id) - - logger.info(f"Found {len(cited_works)} unique Works cited by Work ID: {work_id}") + for reference_link in references: + # Add the cited work if it exists and hasn't been added yet + if ( + reference_link.cited_work + and reference_link.cited_work.id not in unique_cited_work_ids + ): + cited_works.append(reference_link.cited_work) + unique_cited_work_ids.add(reference_link.cited_work.id) + + logger.info( + f"Found {len(cited_works)} unique Works cited by Work ID: {work_id}" + ) return cited_works # --- Methods for Aggregated Data --- - def get_repository_aggregated_citations(self, db: Session, repository_id: int) -> Dict[str, int]: + def get_repository_aggregated_citations( + self, db: Session, repository_id: int + ) -> Dict[str, int]: """ Calculates citation counts for a repository based on its linked works. @@ -198,16 +226,20 @@ def get_repository_aggregated_citations(self, db: Session, repository_id: int) - and `moss_discovered_citations`. Returns counts of 0 if the repository is not found or has no linked works. """ - logger.info(f"Calculating aggregated and discovered citations for Repository ID: {repository_id}") + logger.info( + f"Calculating aggregated and discovered citations for Repository ID: {repository_id}" + ) # Step 1: Find all unique Work IDs linked to this repository via DOI references. 
linked_work_ids_query = ( select(distinct(DOIReference.work_id)) .where(DOIReference.repository_id == repository_id) - .where(DOIReference.work_id.isnot(None)) # Exclude references not linked to a work + .where( + DOIReference.work_id.isnot(None) + ) # Exclude references not linked to a work ) linked_work_ids_result = db.execute(linked_work_ids_query).scalars().all() - linked_work_ids = set(linked_work_ids_result) # Use a set for efficient lookup + linked_work_ids = set(linked_work_ids_result) # Use a set for efficient lookup # Handle case where repository has no linked works if not linked_work_ids: @@ -215,39 +247,53 @@ def get_repository_aggregated_citations(self, db: Session, repository_id: int) - return { "repository_id": repository_id, "openalex_aggregated_citations": 0, - "moss_discovered_citations": 0 + "moss_discovered_citations": 0, } # Step 2: Calculate OpenAlex aggregated citations. # Sum the 'cited_by_count' field from the Work records linked to the repository. openalex_citations_query = ( - select(func.sum(Work.cited_by_count)) # Sum the counts - .where(Work.id.in_(linked_work_ids)) # Filter for linked works + select(func.sum(Work.cited_by_count)).where( # Sum the counts + Work.id.in_(linked_work_ids) + ) # Filter for linked works ) openalex_citations_result = db.execute(openalex_citations_query).scalar() # Handle potential None result if sum is over zero rows or contains nulls - openalex_aggregated_citations = openalex_citations_result if openalex_citations_result is not None else 0 - logger.info(f"OpenAlex Aggregated Citations for Repo {repository_id}: {openalex_aggregated_citations}") + openalex_aggregated_citations = ( + openalex_citations_result if openalex_citations_result is not None else 0 + ) + logger.info( + f"OpenAlex Aggregated Citations for Repo {repository_id}: {openalex_aggregated_citations}" + ) # Step 3: Calculate MOSS discovered citations. 
# Count distinct citing works found in the WorkCitation table where the cited work is one linked to the repository. moss_citations_query = ( - select(func.count(distinct(WorkCitation.citing_work_id))) # Count unique citing work IDs - .where(WorkCitation.cited_work_id.in_(linked_work_ids)) # Where the cited work is linked to our repo + select( + func.count(distinct(WorkCitation.citing_work_id)) + ).where( # Count unique citing work IDs + WorkCitation.cited_work_id.in_(linked_work_ids) + ) # Where the cited work is linked to our repo ) moss_citations_result = db.execute(moss_citations_query).scalar() - moss_discovered_citations = moss_citations_result if moss_citations_result is not None else 0 - logger.info(f"MOSS Discovered Citations for Repo {repository_id}: {moss_discovered_citations}") + moss_discovered_citations = ( + moss_citations_result if moss_citations_result is not None else 0 + ) + logger.info( + f"MOSS Discovered Citations for Repo {repository_id}: {moss_discovered_citations}" + ) return { "repository_id": repository_id, "openalex_aggregated_citations": openalex_aggregated_citations, - "moss_discovered_citations": moss_discovered_citations + "moss_discovered_citations": moss_discovered_citations, } # --- Methods for Repository <-> Repository Connections --- - def get_repositories_sharing_contributors(self, db: Session, repository_id: int) -> List[Repository]: + def get_repositories_sharing_contributors( + self, db: Session, repository_id: int + ) -> List[Repository]: """ Finds other repositories that share at least one contributor with the target repository. @@ -258,37 +304,48 @@ def get_repositories_sharing_contributors(self, db: Session, repository_id: int) Returns: A list of unique Repository objects that share contributors, excluding the target repository itself. 
""" - logger.info(f"Finding repositories sharing contributors with Repository ID: {repository_id}") + logger.info( + f"Finding repositories sharing contributors with Repository ID: {repository_id}" + ) # Step 1: Get IDs of all contributors associated with the target repository. target_contributor_ids = ( select(RepositoryContributorAssociation.contributor_id) .where(RepositoryContributorAssociation.repository_id == repository_id) - .subquery() # Use as a subquery for efficient filtering + .subquery() # Use as a subquery for efficient filtering ) # Step 2: Find distinct repositories associated with any of those contributors, # excluding the original target repository. - RepoAlias = aliased(Repository) # Use alias to avoid ambiguity if joining Repository multiple times + RepoAlias = aliased( + Repository + ) # Use alias to avoid ambiguity if joining Repository multiple times shared_repos_query = ( - select(RepoAlias).distinct() # Select distinct repositories + select(RepoAlias) + .distinct() # Select distinct repositories .join( - RepositoryContributorAssociation, # Join Repository to the association table - RepoAlias.id == RepositoryContributorAssociation.repository_id + RepositoryContributorAssociation, # Join Repository to the association table + RepoAlias.id == RepositoryContributorAssociation.repository_id, ) .where( # Filter for associations involving contributors from the target repo - RepositoryContributorAssociation.contributor_id.in_(target_contributor_ids) + RepositoryContributorAssociation.contributor_id.in_( + target_contributor_ids + ) ) .where( - RepoAlias.id != repository_id # Exclude the target repository itself + RepoAlias.id != repository_id # Exclude the target repository itself ) ) results = db.execute(shared_repos_query).scalars().all() - logger.info(f"Found {len(results)} repositories sharing contributors with Repository ID: {repository_id}") + logger.info( + f"Found {len(results)} repositories sharing contributors with Repository ID: 
{repository_id}" + ) return list(results) - def get_repositories_sharing_works(self, db: Session, repository_id: int) -> List[Repository]: + def get_repositories_sharing_works( + self, db: Session, repository_id: int + ) -> List[Repository]: """ Finds other repositories that have references to at least one of the same Works as the target repository. @@ -300,35 +357,42 @@ def get_repositories_sharing_works(self, db: Session, repository_id: int) -> Lis Returns: A list of unique Repository objects that share linked works, excluding the target repository itself. """ - logger.info(f"Finding repositories sharing works with Repository ID: {repository_id}") + logger.info( + f"Finding repositories sharing works with Repository ID: {repository_id}" + ) # Step 1: Get IDs of all Works linked to the target repository via DOIReferences. target_work_ids = ( select(DOIReference.work_id) .where(DOIReference.repository_id == repository_id) - .where(DOIReference.work_id.isnot(None)) # Ensure the reference is linked to a work - .subquery() # Use as a subquery + .where( + DOIReference.work_id.isnot(None) + ) # Ensure the reference is linked to a work + .subquery() # Use as a subquery ) # Step 2: Find distinct repositories that also have DOIReferences pointing to any of those Works, # excluding the original target repository. 
- RepoAlias = aliased(Repository) # Use alias + RepoAlias = aliased(Repository) # Use alias shared_repos_query = ( - select(RepoAlias).distinct() # Select distinct repositories + select(RepoAlias) + .distinct() # Select distinct repositories .join( - DOIReference, # Join Repository to DOIReference table - RepoAlias.id == DOIReference.repository_id + DOIReference, # Join Repository to DOIReference table + RepoAlias.id == DOIReference.repository_id, ) .where( # Filter for references involving works linked to the target repo DOIReference.work_id.in_(target_work_ids) ) .where( - RepoAlias.id != repository_id # Exclude the target repository itself + RepoAlias.id != repository_id # Exclude the target repository itself ) ) results = db.execute(shared_repos_query).scalars().all() - logger.info(f"Found {len(results)} repositories sharing works with Repository ID: {repository_id}") + logger.info( + f"Found {len(results)} repositories sharing works with Repository ID: {repository_id}" + ) return list(results) # --- Methods involving Persons and Institutions --- @@ -352,17 +416,26 @@ def get_people_citing_work(self, db: Session, work_id: int) -> List[Person]: CitingWork = aliased(Work) # Construct the query joining through the citation and authorship links people_query = ( - select(Person).distinct() # Select distinct Person objects - .join(Authorship, Person.id == Authorship.person_id) # Person -> Authorship - .join(CitingWork, Authorship.work_id == CitingWork.id) # Authorship -> Citing Work - .join(WorkCitation, CitingWork.id == WorkCitation.citing_work_id) # Citing Work -> Citation Link - .where(WorkCitation.cited_work_id == work_id) # Filter for citations of the target work + select(Person) + .distinct() # Select distinct Person objects + .join(Authorship, Person.id == Authorship.person_id) # Person -> Authorship + .join( + CitingWork, Authorship.work_id == CitingWork.id + ) # Authorship -> Citing Work + .join( + WorkCitation, CitingWork.id == WorkCitation.citing_work_id + 
) # Citing Work -> Citation Link + .where( + WorkCitation.cited_work_id == work_id + ) # Filter for citations of the target work ) results = db.execute(people_query).scalars().all() logger.info(f"Found {len(results)} unique people citing Work ID: {work_id}") return list(results) - def get_institutions_citing_work(self, db: Session, work_id: int) -> List[Institution]: + def get_institutions_citing_work( + self, db: Session, work_id: int + ) -> List[Institution]: """ Finds unique Institutions affiliated with authors of Works that cite the target Work ID. @@ -375,26 +448,44 @@ def get_institutions_citing_work(self, db: Session, work_id: int) -> List[Instit Returns: A list of unique Institution objects affiliated with authors of citing works. """ - logger.info(f"Finding institutions affiliated with authors citing Work ID: {work_id}") + logger.info( + f"Finding institutions affiliated with authors citing Work ID: {work_id}" + ) - CitingWork = aliased(Work) # Alias for clarity + CitingWork = aliased(Work) # Alias for clarity # Construct the query joining through citations, authorships, and affiliations institution_query = ( - select(Institution).distinct() # Select distinct Institution objects + select(Institution) + .distinct() # Select distinct Institution objects # Join Institution -> Affiliation -> Authorship -> CitingWork -> WorkCitation .join(Affiliation, Institution.id == Affiliation.institution_id) # Join Affiliation to Authorship using the composite foreign key - .join(Authorship, and_(Affiliation.authorship_work_id == Authorship.work_id, - Affiliation.authorship_person_id == Authorship.person_id)) - .join(CitingWork, Authorship.work_id == CitingWork.id) # Link Authorship to the Citing Work - .join(WorkCitation, CitingWork.id == WorkCitation.citing_work_id) # Link Citing Work via citation - .where(WorkCitation.cited_work_id == work_id) # Filter for citations of the target work + .join( + Authorship, + and_( + Affiliation.authorship_work_id == Authorship.work_id, 
+ Affiliation.authorship_person_id == Authorship.person_id, + ), + ) + .join( + CitingWork, Authorship.work_id == CitingWork.id + ) # Link Authorship to the Citing Work + .join( + WorkCitation, CitingWork.id == WorkCitation.citing_work_id + ) # Link Citing Work via citation + .where( + WorkCitation.cited_work_id == work_id + ) # Filter for citations of the target work ) results = db.execute(institution_query).scalars().all() - logger.info(f"Found {len(results)} unique institutions citing Work ID: {work_id}") + logger.info( + f"Found {len(results)} unique institutions citing Work ID: {work_id}" + ) return list(results) - def get_repositories_by_institution(self, db: Session, institution_id: int) -> List[Repository]: + def get_repositories_by_institution( + self, db: Session, institution_id: int + ) -> List[Repository]: """ Finds unique Repositories linked (via DOIReferences) to Works authored by people affiliated with the given Institution ID at the time of authorship. @@ -408,21 +499,33 @@ def get_repositories_by_institution(self, db: Session, institution_id: int) -> L Returns: A list of unique Repository objects linked to the institution. 
""" - logger.info(f"Finding repositories associated with Institution ID: {institution_id}") + logger.info( + f"Finding repositories associated with Institution ID: {institution_id}" + ) # Construct the query joining through affiliations, authorships, works, and references repo_query = ( - select(Repository).distinct() # Select distinct Repository objects + select(Repository) + .distinct() # Select distinct Repository objects # Join Repository -> DOIReference -> Work -> Authorship -> Affiliation .join(DOIReference, Repository.id == DOIReference.repository_id) .join(Work, DOIReference.work_id == Work.id) .join(Authorship, Work.id == Authorship.work_id) # Join Authorship to Affiliation using composite key - .join(Affiliation, and_(Authorship.work_id == Affiliation.authorship_work_id, - Authorship.person_id == Affiliation.authorship_person_id)) - .where(Affiliation.institution_id == institution_id) # Filter by the target institution + .join( + Affiliation, + and_( + Authorship.work_id == Affiliation.authorship_work_id, + Authorship.person_id == Affiliation.authorship_person_id, + ), + ) + .where( + Affiliation.institution_id == institution_id + ) # Filter by the target institution ) results = db.execute(repo_query).scalars().all() - logger.info(f"Found {len(results)} unique repositories linked to Institution ID: {institution_id}") + logger.info( + f"Found {len(results)} unique repositories linked to Institution ID: {institution_id}" + ) return list(results) def get_works_by_person(self, db: Session, person_id: int) -> List[Work]: @@ -441,12 +544,15 @@ def get_works_by_person(self, db: Session, person_id: int) -> List[Work]: logger.info(f"Finding works associated with Person ID: {person_id}") # Construct the query joining Work to Authorship work_query = ( - select(Work).distinct() # Select distinct Work objects - .join(Authorship, Work.id == Authorship.work_id) # Join Work -> Authorship - .where(Authorship.person_id == person_id) # Filter by the target person + 
select(Work) + .distinct() # Select distinct Work objects + .join(Authorship, Work.id == Authorship.work_id) # Join Work -> Authorship + .where(Authorship.person_id == person_id) # Filter by the target person ) results = db.execute(work_query).scalars().all() - logger.info(f"Found {len(results)} unique works linked to Person ID: {person_id}") + logger.info( + f"Found {len(results)} unique works linked to Person ID: {person_id}" + ) return list(results) # --- Methods for Stored Affiliation Predictions --- @@ -470,41 +576,63 @@ def get_affiliations_for_repository( A list of dictionaries, each representing an affiliation record, including resolved institution and repository names. """ - logger.info(f"Getting affiliations for Repository ID: {repository_id} (min_confidence: {min_confidence})") + logger.info( + f"Getting affiliations for Repository ID: {repository_id} (min_confidence: {min_confidence})" + ) # Query the affiliation prediction table, joining to get names query = ( select( - RepositoryInstitutionAffiliation, # Select the main affiliation model object - Institution.display_name.label("institution_name"), # Get institution name - Repository.full_name.label("repository_name") # Get repository name + RepositoryInstitutionAffiliation, # Select the main affiliation model object + Institution.display_name.label( + "institution_name" + ), # Get institution name + Repository.full_name.label("repository_name"), # Get repository name ) - .join(Institution, RepositoryInstitutionAffiliation.institution_id == Institution.id) - .join(Repository, RepositoryInstitutionAffiliation.repository_id == Repository.id) - .where(RepositoryInstitutionAffiliation.repository_id == repository_id) # Filter by repo ID - .where(RepositoryInstitutionAffiliation.confidence_score >= min_confidence) # Filter by confidence - .order_by(RepositoryInstitutionAffiliation.confidence_score.desc()) # Order by confidence + .join( + Institution, + RepositoryInstitutionAffiliation.institution_id == 
Institution.id, + ) + .join( + Repository, + RepositoryInstitutionAffiliation.repository_id == Repository.id, + ) + .where( + RepositoryInstitutionAffiliation.repository_id == repository_id + ) # Filter by repo ID + .where( + RepositoryInstitutionAffiliation.confidence_score >= min_confidence + ) # Filter by confidence + .order_by( + RepositoryInstitutionAffiliation.confidence_score.desc() + ) # Order by confidence ) - results = db.execute(query).all() # Fetch all matching rows + results = db.execute(query).all() # Fetch all matching rows # Format results into dictionaries for API response or further use affiliation_responses = [] for row in results: - affil_model: RepositoryInstitutionAffiliation = row.RepositoryInstitutionAffiliation + affil_model: RepositoryInstitutionAffiliation = ( + row.RepositoryInstitutionAffiliation + ) inst_name = row.institution_name repo_name = row.repository_name - affiliation_responses.append({ - "repository_id": affil_model.repository_id, - "institution_id": affil_model.institution_id, - "algorithm_name": affil_model.algorithm_name, - "algorithm_version": affil_model.algorithm_version, - "confidence_score": affil_model.confidence_score, - "evidence": affil_model.evidence, # Raw evidence data stored by algorithm - "parameters_used": affil_model.parameters_used, # Parameters used by algorithm run - "calculated_at": affil_model.calculated_at, - "repository_name": repo_name, # Included for convenience - "institution_name": inst_name, # Included for convenience - }) - logger.info(f"Found {len(affiliation_responses)} affiliations for Repository ID {repository_id} meeting criteria.") + affiliation_responses.append( + { + "repository_id": affil_model.repository_id, + "institution_id": affil_model.institution_id, + "algorithm_name": affil_model.algorithm_name, + "algorithm_version": affil_model.algorithm_version, + "confidence_score": affil_model.confidence_score, + "evidence": affil_model.evidence, # Raw evidence data stored by algorithm + 
"parameters_used": affil_model.parameters_used, # Parameters used by algorithm run + "calculated_at": affil_model.calculated_at, + "repository_name": repo_name, # Included for convenience + "institution_name": inst_name, # Included for convenience + } + ) + logger.info( + f"Found {len(affiliation_responses)} affiliations for Repository ID {repository_id} meeting criteria." + ) return affiliation_responses def get_affiliations_for_institution( @@ -525,41 +653,63 @@ def get_affiliations_for_institution( A list of dictionaries, each representing an affiliation record, including resolved institution and repository names. """ - logger.info(f"Getting affiliations for Institution ID: {institution_id} (min_confidence: {min_confidence})") - # Query the affiliation prediction table, joining to get names + logger.info( + f"Getting affiliations for Institution ID: {institution_id} (min_confidence: {min_confidence})" + ) + # Query the affiliation prediction table, joining to get names query = ( - select( - RepositoryInstitutionAffiliation, # Select the main affiliation model object - Repository.full_name.label("repository_name"), # Get repository name - Institution.display_name.label("institution_name") # Get institution name (might seem redundant but good practice) + select( + RepositoryInstitutionAffiliation, # Select the main affiliation model object + Repository.full_name.label("repository_name"), # Get repository name + Institution.display_name.label( + "institution_name" + ), # Get institution name (might seem redundant but good practice) + ) + .join( + Repository, + RepositoryInstitutionAffiliation.repository_id == Repository.id, ) - .join(Repository, RepositoryInstitutionAffiliation.repository_id == Repository.id) - .join(Institution, RepositoryInstitutionAffiliation.institution_id == Institution.id) - .where(RepositoryInstitutionAffiliation.institution_id == institution_id) # Filter by institution ID - .where(RepositoryInstitutionAffiliation.confidence_score >= 
min_confidence) # Filter by confidence - .order_by(RepositoryInstitutionAffiliation.confidence_score.desc()) # Order by confidence + .join( + Institution, + RepositoryInstitutionAffiliation.institution_id == Institution.id, + ) + .where( + RepositoryInstitutionAffiliation.institution_id == institution_id + ) # Filter by institution ID + .where( + RepositoryInstitutionAffiliation.confidence_score >= min_confidence + ) # Filter by confidence + .order_by( + RepositoryInstitutionAffiliation.confidence_score.desc() + ) # Order by confidence ) - results = db.execute(query).all() # Fetch all matching rows + results = db.execute(query).all() # Fetch all matching rows # Format results into dictionaries affiliation_responses = [] for row in results: - affil_model: RepositoryInstitutionAffiliation = row.RepositoryInstitutionAffiliation + affil_model: RepositoryInstitutionAffiliation = ( + row.RepositoryInstitutionAffiliation + ) repo_name = row.repository_name inst_name = row.institution_name - affiliation_responses.append({ - "repository_id": affil_model.repository_id, - "institution_id": affil_model.institution_id, - "algorithm_name": affil_model.algorithm_name, - "algorithm_version": affil_model.algorithm_version, - "confidence_score": affil_model.confidence_score, - "evidence": affil_model.evidence, - "parameters_used": affil_model.parameters_used, - "calculated_at": affil_model.calculated_at, - "repository_name": repo_name, # Included for convenience - "institution_name": inst_name, # Included for convenience - }) - logger.info(f"Found {len(affiliation_responses)} affiliations for Institution ID {institution_id} meeting criteria.") + affiliation_responses.append( + { + "repository_id": affil_model.repository_id, + "institution_id": affil_model.institution_id, + "algorithm_name": affil_model.algorithm_name, + "algorithm_version": affil_model.algorithm_version, + "confidence_score": affil_model.confidence_score, + "evidence": affil_model.evidence, + "parameters_used": 
affil_model.parameters_used, + "calculated_at": affil_model.calculated_at, + "repository_name": repo_name, # Included for convenience + "institution_name": inst_name, # Included for convenience + } + ) + logger.info( + f"Found {len(affiliation_responses)} affiliations for Institution ID {institution_id} meeting criteria." + ) return affiliation_responses # --- Methods for Contributor Connections --- @@ -578,31 +728,42 @@ def get_shared_contributors_details( Returns: A list of Contributor objects associated with *both* repo_id_1 and repo_id_2. """ - logger.info(f"Finding shared contributor details between Repository ID {repo_id_1} and {repo_id_2}") + logger.info( + f"Finding shared contributor details between Repository ID {repo_id_1} and {repo_id_2}" + ) # Efficiently find shared contributors using subqueries and joins shared_contributors_query = ( - select(Contributor) # Select the Contributor object + select(Contributor) # Select the Contributor object # Join Contributor to the association table - .join(RepositoryContributorAssociation, Contributor.id == RepositoryContributorAssociation.contributor_id) + .join( + RepositoryContributorAssociation, + Contributor.id == RepositoryContributorAssociation.contributor_id, + ) .where( # Filter for contributors associated with the first repository... RepositoryContributorAssociation.repository_id == repo_id_1, # ...AND whose ID exists in the set of contributors associated with the second repository. 
Contributor.id.in_( - select(RepositoryContributorAssociation.contributor_id) # Subquery: Get contributor IDs for repo_id_2 - .where(RepositoryContributorAssociation.repository_id == repo_id_2) - ) + select( + RepositoryContributorAssociation.contributor_id + ).where( # Subquery: Get contributor IDs for repo_id_2 + RepositoryContributorAssociation.repository_id == repo_id_2 + ) + ), ) - .distinct() # Ensure each shared contributor is returned only once - .order_by(Contributor.login) # Optional: Order by login name + .distinct() # Ensure each shared contributor is returned only once + .order_by(Contributor.login) # Optional: Order by login name ) shared_contributors = db.execute(shared_contributors_query).scalars().all() - logger.info(f"Retrieved details for {len(shared_contributors)} shared contributors.") + logger.info( + f"Retrieved details for {len(shared_contributors)} shared contributors." + ) return list(shared_contributors) - - def get_repositories_by_contributor(self, db: Session, contributor_id: int) -> List[Repository]: + def get_repositories_by_contributor( + self, db: Session, contributor_id: int + ) -> List[Repository]: """ Finds all repositories associated with a specific contributor ID. @@ -613,25 +774,36 @@ def get_repositories_by_contributor(self, db: Session, contributor_id: int) -> L Returns: A list of Repository objects the contributor is associated with. 
""" - logger.info(f"Finding repositories associated with Contributor ID: {contributor_id}") + logger.info( + f"Finding repositories associated with Contributor ID: {contributor_id}" + ) # Query the Repository table, joining through the association table repo_query = ( select(Repository) - .join(RepositoryContributorAssociation, Repository.id == RepositoryContributorAssociation.repository_id) # Join Repo -> Association - .where(RepositoryContributorAssociation.contributor_id == contributor_id) # Filter by contributor ID - .order_by(Repository.full_name) # Optional: Order results for consistency + .join( + RepositoryContributorAssociation, + Repository.id == RepositoryContributorAssociation.repository_id, + ) # Join Repo -> Association + .where( + RepositoryContributorAssociation.contributor_id == contributor_id + ) # Filter by contributor ID + .order_by(Repository.full_name) # Optional: Order results for consistency # Example of eager loading the owner if needed often (can impact performance): # .options(joinedload(Repository.owner)) ) repositories = db.execute(repo_query).scalars().all() - logger.info(f"Found {len(repositories)} repositories for Contributor ID {contributor_id}.") + logger.info( + f"Found {len(repositories)} repositories for Contributor ID {contributor_id}." + ) return list(repositories) # --- Methods for Software Dependencies --- - def get_dependencies_for_repository(self, db: Session, repository_id: int) -> List[SoftwareDependency]: + def get_dependencies_for_repository( + self, db: Session, repository_id: int + ) -> List[SoftwareDependency]: """ Retrieves stored software dependencies recorded for a given repository ID. 
@@ -646,5 +818,7 @@ def get_dependencies_for_repository(self, db: Session, repository_id: int) -> Li # Use the dedicated repository for SoftwareDependency for optimized access dep_repo = SoftwareDependencyRepository(db) dependencies = dep_repo.find_by_repository(repository_id=repository_id) - logger.info(f"Found {len(dependencies)} dependencies for Repository ID {repository_id}.") - return dependencies \ No newline at end of file + logger.info( + f"Found {len(dependencies)} dependencies for Repository ID {repository_id}." + ) + return dependencies diff --git a/backend/tasks/__init__.py b/backend/tasks/__init__.py index 1741fce..d3b006f 100644 --- a/backend/tasks/__init__.py +++ b/backend/tasks/__init__.py @@ -1 +1 @@ -# Makes 'tasks' a Python package \ No newline at end of file +# Makes 'tasks' a Python package diff --git a/backend/tasks/discovery_tasks.py b/backend/tasks/discovery_tasks.py index 0921bb3..f754573 100644 --- a/backend/tasks/discovery_tasks.py +++ b/backend/tasks/discovery_tasks.py @@ -12,15 +12,17 @@ from typing import Optional from sqlalchemy.orm import Session -from celery.exceptions import Ignore # Used to gracefully stop task processing without failure. # Import the configured Celery application instance. from backend.celery_app import celery_app + # Import the database session factory for creating task-specific sessions. from backend.data.database import SessionLocal + # Import data models and repository classes required for database operations. from backend.data.models import KeywordSearchSession from backend.data.repositories import KeywordSearchSessionRepository + # Import application services and external API clients. from backend.services import KeywordDiscoveryService, IngestionService from backend.external import GitHubClient, ApiClientError @@ -28,13 +30,17 @@ # Setup logger for this module. logger = logging.getLogger(__name__) + @celery_app.task( - bind=True, # Makes 'self' (the task instance) available inside the function. 
- autoretry_for=(ApiClientError, Exception), # Automatically retry on GitHub API errors or unexpected exceptions. - retry_backoff=True, # Apply exponential backoff between retries. - max_retries=3, # Limit the number of automatic retries. - acks_late=True, # Acknowledge task message only after task success/failure (ensures retry if worker crashes). - task_reject_on_worker_lost=True # Requeue task if the worker process executing it is lost. + bind=True, # Makes 'self' (the task instance) available inside the function. + autoretry_for=( + ApiClientError, + Exception, + ), # Automatically retry on GitHub API errors or unexpected exceptions. + retry_backoff=True, # Apply exponential backoff between retries. + max_retries=3, # Limit the number of automatic retries. + acks_late=True, # Acknowledge task message only after task success/failure (ensures retry if worker crashes). + task_reject_on_worker_lost=True, # Requeue task if the worker process executing it is lost. ) def keyword_discovery_celery_task(self, session_id: int, keywords: str): """ @@ -53,16 +59,26 @@ def keyword_discovery_celery_task(self, session_id: int, keywords: str): keywords: The string of keywords used for the GitHub search. """ # Extract task ID for correlated logging. - task_id = self.request.id if hasattr(self, 'request') and self.request.id else 'UNKNOWN_TASK_ID' + task_id = ( + self.request.id + if hasattr(self, "request") and self.request.id + else "UNKNOWN_TASK_ID" + ) log_prefix = f"CELERY TASK {task_id} (Session: {session_id})" - logger.info(f"{log_prefix}: STARTING Keyword Discovery Task for keywords: '{keywords}'.") + logger.info( + f"{log_prefix}: STARTING Keyword Discovery Task for keywords: '{keywords}'." + ) - db: Session | None = None # Database session for this task run. - search_session: KeywordSearchSession | None = None # The session record being processed. - processed_count = 0 # Counter for successfully processed items. 
- ingestion_errors = 0 # Counter for errors during data ingestion. - association_errors = 0 # Counter for errors during association logic. - task_exception: Optional[Exception] = None # Stores any exception caught in the main try block. + db: Session | None = None # Database session for this task run. + search_session: KeywordSearchSession | None = ( + None # The session record being processed. + ) + processed_count = 0 # Counter for successfully processed items. + ingestion_errors = 0 # Counter for errors during data ingestion. + association_errors = 0 # Counter for errors during association logic. + task_exception: Optional[Exception] = ( + None # Stores any exception caught in the main try block. + ) try: # Create a new database session for this task invocation. @@ -73,37 +89,49 @@ def keyword_discovery_celery_task(self, session_id: int, keywords: str): # Catch configuration errors (e.g., missing API keys) during initialization. try: github_client = GitHubClient() - ingestion_service = IngestionService() # Assumes DB session not needed at init. + ingestion_service = ( + IngestionService() + ) # Assumes DB session not needed at init. keyword_discovery_service = KeywordDiscoveryService( - github_client=github_client, - ingestion_service=ingestion_service + github_client=github_client, ingestion_service=ingestion_service ) logger.info(f"{log_prefix}: Core services initialized.") - except ValueError as config_err: # Catch potential issues like missing API keys. - logger.error(f"{log_prefix}: CONFIGURATION ERROR during service initialization: {config_err}", exc_info=True) + except ( + ValueError + ) as config_err: # Catch potential issues like missing API keys. + logger.error( + f"{log_prefix}: CONFIGURATION ERROR during service initialization: {config_err}", + exc_info=True, + ) task_exception = config_err # Re-raise to let Celery handle retries or mark as failed based on task config. 
raise task_exception - logger.info(f"{log_prefix}: Invoking keyword_discovery_service.discover_and_ingest_by_keywords...") + logger.info( + f"{log_prefix}: Invoking keyword_discovery_service.discover_and_ingest_by_keywords..." + ) # --- Execute the core discovery and ingestion logic --- # The service method is responsible for: # 1. Updating the KeywordSearchSession status to 'RUNNING'. # 2. Performing the GitHub search and processing results. # 3. Ingesting data for discovered repositories. # 4. Returning counts of processed items and any errors encountered. - processed_count, ingestion_errors, association_errors = keyword_discovery_service.discover_and_ingest_by_keywords( - db=db, # Pass the task-managed database session. - session_id=session_id, - keywords=keywords + processed_count, ingestion_errors, association_errors = ( + keyword_discovery_service.discover_and_ingest_by_keywords( + db=db, # Pass the task-managed database session. + session_id=session_id, + keywords=keywords, + ) + ) + logger.info( + f"{log_prefix}: Service call completed. Results: Processed={processed_count}, IngestErrors={ingestion_errors}, AssocErrors={association_errors}" ) - logger.info(f"{log_prefix}: Service call completed. Results: Processed={processed_count}, IngestErrors={ingestion_errors}, AssocErrors={association_errors}") except Exception as e: # Catch exceptions occurring *before* or *during* the main service call. # This includes configuration errors raised above or errors within the service itself. logger.exception(f"{log_prefix}: EXCEPTION caught during task execution.") - task_exception = e # Store the exception for the finally block. + task_exception = e # Store the exception for the finally block. # Re-raise the exception to trigger Celery's retry/failure mechanisms # as defined in the task decorator (`autoretry_for`). 
@@ -122,27 +150,33 @@ def keyword_discovery_celery_task(self, session_id: int, keywords: str): # except Exception as retry_e: # logger.error(f"{log_prefix}: Error during explicit retry attempt: {retry_e}. Raising original exception.") # raise e - raise e # Let Celery handle the retry based on `autoretry_for` + raise e # Let Celery handle the retry based on `autoretry_for` finally: # This block executes regardless of whether an exception occurred or not. # Its primary purpose is to ensure the final status of the KeywordSearchSession # is correctly updated in the database. logger.info(f"{log_prefix}: Entering FINALLY block for final status update.") - final_status = "UNKNOWN" # Default status if logic fails. + final_status = "UNKNOWN" # Default status if logic fails. # Determine the final status based on exceptions or reported errors. if task_exception: # An exception was caught in the main try block. - logger.warning(f"{log_prefix}: FINALLY: Task exception detected ({type(task_exception).__name__}). Setting final status to FAILED.") + logger.warning( + f"{log_prefix}: FINALLY: Task exception detected ({type(task_exception).__name__}). Setting final status to FAILED." + ) final_status = "FAILED" elif ingestion_errors > 0 or association_errors > 0: # The service call completed but reported errors during processing. - logger.warning(f"{log_prefix}: FINALLY: Service reported errors (Ingest:{ingestion_errors}, Assoc:{association_errors}). Setting final status to FAILED.") + logger.warning( + f"{log_prefix}: FINALLY: Service reported errors (Ingest:{ingestion_errors}, Assoc:{association_errors}). Setting final status to FAILED." + ) final_status = "FAILED" else: # No exceptions occurred, and the service reported no errors. - logger.info(f"{log_prefix}: FINALLY: Task completed without exceptions or reported errors. Setting final status to COMPLETED.") + logger.info( + f"{log_prefix}: FINALLY: Task completed without exceptions or reported errors. 
Setting final status to COMPLETED." + ) final_status = "COMPLETED" # --- Safely update the database record --- @@ -151,7 +185,9 @@ def keyword_discovery_celery_task(self, session_id: int, keywords: str): # (which might be rolled back or in an error state). update_db: Session | None = None try: - logger.info(f"{log_prefix}: FINALLY: Attempting to establish NEW session for final status update.") + logger.info( + f"{log_prefix}: FINALLY: Attempting to establish NEW session for final status update." + ) update_db = SessionLocal() session_repo = KeywordSearchSessionRepository(update_db) @@ -159,46 +195,68 @@ def keyword_discovery_celery_task(self, session_id: int, keywords: str): session_to_update = session_repo.get(id=session_id) if session_to_update: - logger.info(f"{log_prefix}: FINALLY: Found session {session_id}. Current status: '{session_to_update.status}'. Attempting update to '{final_status}'.") + logger.info( + f"{log_prefix}: FINALLY: Found session {session_id}. Current status: '{session_to_update.status}'. Attempting update to '{final_status}'." + ) # Update status, completion timestamp, and results count. session_to_update.status = final_status session_to_update.completed_at = datetime.now(timezone.utc) - session_to_update.results_count = processed_count # Reflects count from service. + session_to_update.results_count = ( + processed_count # Reflects count from service. + ) update_db.add(session_to_update) logger.info(f"{log_prefix}: FINALLY: Committing final status update...") update_db.commit() - logger.info(f"{log_prefix}: FINALLY: Final status commit successful. DB status should now be '{final_status}'.") + logger.info( + f"{log_prefix}: FINALLY: Final status commit successful. DB status should now be '{final_status}'." + ) else: # This scenario is unlikely but possible if the initial record creation failed. 
- logger.error(f"{log_prefix}: FINALLY: CRITICAL - KeywordSearchSession record ID {session_id} not found in database for final status update.") + logger.error( + f"{log_prefix}: FINALLY: CRITICAL - KeywordSearchSession record ID {session_id} not found in database for final status update." + ) except Exception as final_upd_err: # Log critical errors during the final update but prevent crashing the finally block. - logger.exception(f"{log_prefix}: FINALLY: CRITICAL - Exception during final status update commit: {final_upd_err}") + logger.exception( + f"{log_prefix}: FINALLY: CRITICAL - Exception during final status update commit: {final_upd_err}" + ) if update_db: try: # Attempt to rollback any changes made in the failed update transaction. update_db.rollback() - logger.warning(f"{log_prefix}: FINALLY: Rolled back final status update transaction due to error.") + logger.warning( + f"{log_prefix}: FINALLY: Rolled back final status update transaction due to error." + ) except Exception as rb_err: - logger.error(f"{log_prefix}: FINALLY: Exception during rollback of failed status update: {rb_err}") + logger.error( + f"{log_prefix}: FINALLY: Exception during rollback of failed status update: {rb_err}" + ) finally: # Ensure the database session used for the final update is closed. if update_db: - logger.info(f"{log_prefix}: FINALLY: Closing the DB session used for final status update.") + logger.info( + f"{log_prefix}: FINALLY: Closing the DB session used for final status update." + ) try: update_db.close() except Exception as close_err: - logger.error(f"{log_prefix}: FINALLY: Exception closing final update DB session: {close_err}") + logger.error( + f"{log_prefix}: FINALLY: Exception closing final update DB session: {close_err}" + ) # Also ensure the original task session ('db') is closed if it was created. # Avoid double-closing if 'update_db' somehow ended up being the same instance. 
if db and (update_db is None or db is not update_db): - logger.info(f"{log_prefix}: FINALLY: Closing original task DB session.") - try: - db.close() - except Exception as close_err: - logger.error(f"{log_prefix}: FINALLY: Exception closing original task DB session: {close_err}") + logger.info(f"{log_prefix}: FINALLY: Closing original task DB session.") + try: + db.close() + except Exception as close_err: + logger.error( + f"{log_prefix}: FINALLY: Exception closing original task DB session: {close_err}" + ) logger.info(f"{log_prefix}: ENDING Keyword Discovery Task.") -# --- END OF FILE discovery_tasks.py --- \ No newline at end of file + + +# --- END OF FILE discovery_tasks.py --- diff --git a/backend/tasks/scholarly_tasks.py b/backend/tasks/scholarly_tasks.py index a85f5dd..6468128 100644 --- a/backend/tasks/scholarly_tasks.py +++ b/backend/tasks/scholarly_tasks.py @@ -17,31 +17,39 @@ import logging import time # For implementing delays in retry logic. -import uuid -import re # For parsing IDs from URLs. -from typing import Set, Optional, List, Dict, Any, Tuple +import re # For parsing IDs from URLs. +from typing import Set, Optional, List, Dict, Any from sqlalchemy.orm import Session + # Import specific database and ORM exceptions for targeted handling. from sqlalchemy.exc import IntegrityError, SQLAlchemyError, OperationalError + # Import Celery-specific exceptions for flow control (Ignore) and retries (Retry). -from celery.exceptions import Ignore, Retry +from celery.exceptions import Ignore # Import the configured Celery application instance. from backend.celery_app import celery_app + # Import the database session factory. from backend.data.database import SessionLocal + # Import external API clients (OpenAlex). from backend.external import OpenAlexClient, ApiClientError + # Import database models relevant to scholarly data. from backend.data.models import Work, WorkCitation, DiscoveryChain + # Import repository classes for database interactions. 
from backend.data.repositories import WorkRepository + # Import application services used by the tasks. from backend.services import ScholarlyProcessingService, DiscoveryChainService # Configuration: Maximum depth for recursive processing of references/citations. -MAX_RECURSION_DEPTH = 1 # Limits processing to direct citations/references only (depth 0 and 1). +MAX_RECURSION_DEPTH = ( + 1 # Limits processing to direct citations/references only (depth 0 and 1). +) # Setup logger for this module. logger = logging.getLogger(__name__) @@ -49,6 +57,7 @@ # --- Helper Functions --- + def _get_id_from_oa_url(url: Optional[str]) -> Optional[str]: """ Extracts a unique identifier from various scholarly ID URLs. @@ -65,33 +74,53 @@ def _get_id_from_oa_url(url: Optional[str]) -> Optional[str]: '01ggx4157', '10.1000/xyz123') if parsing and validation succeed, otherwise None. """ - if not url or not isinstance(url, str): return None + if not url or not isinstance(url, str): + return None try: id_part: Optional[str] = None # Extract ID based on URL prefix or structure. 
- if url.startswith("https://orcid.org/"): match = re.search(r'(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])', url); id_part = match.group(1) if match else None - elif url.startswith("https://ror.org/"): id_part = url.split('/')[-1] - elif url.startswith("https://openalex.org/"): id_part = url.split('/')[-1] - elif url.startswith("https://doi.org/"): id_part = url[len("https://doi.org/"):] # Get part after prefix + if url.startswith("https://orcid.org/"): + match = re.search(r"(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])", url) + id_part = match.group(1) if match else None + elif url.startswith("https://ror.org/"): + id_part = url.split("/")[-1] + elif url.startswith("https://openalex.org/"): + id_part = url.split("/")[-1] + elif url.startswith("https://doi.org/"): + id_part = url[len("https://doi.org/") :] # Get part after prefix # Check for bare OpenAlex ID format (e.g., W followed by digits) - elif url and url[0].isalpha() and url[1:].isdigit(): id_part = url - else: id_part = None # Does not match known patterns + elif url and url[0].isalpha() and url[1:].isdigit(): + id_part = url + else: + id_part = None # Does not match known patterns # Basic format validation for extracted IDs. if id_part: # OpenAlex ID: Starts with letter, followed by digits. - is_oa = url.startswith("https://openalex.org/") or (id_part == url and url[0].isalpha() and url[1:].isdigit()) - if is_oa and id_part[0].isalpha() and id_part[1:].isdigit(): return id_part + is_oa = url.startswith("https://openalex.org/") or ( + id_part == url and url[0].isalpha() and url[1:].isdigit() + ) + if is_oa and id_part[0].isalpha() and id_part[1:].isdigit(): + return id_part # ORCID: Matched the regex pattern. - if url.startswith("https://orcid.org/") and match: return id_part + if url.startswith("https://orcid.org/") and match: + return id_part # ROR ID: Starts with '0', has 9 characters total. 
- if url.startswith("https://ror.org/") and id_part.startswith('0') and len(id_part) == 9: return id_part + if ( + url.startswith("https://ror.org/") + and id_part.startswith("0") + and len(id_part) == 9 + ): + return id_part # DOI: Check if extracted part is non-empty (basic check). - if url.startswith("https://doi.org/") and id_part: return id_part + if url.startswith("https://doi.org/") and id_part: + return id_part except Exception as e: # Log errors during parsing but don't crash the calling function. - logger.error(f"Error parsing identifier from URL/string '{url}': {e}", exc_info=False) + logger.error( + f"Error parsing identifier from URL/string '{url}': {e}", exc_info=False + ) # Return None if no valid ID could be extracted and validated. return None @@ -117,43 +146,56 @@ def get_work_with_retry( Returns: The retrieved Work object if found within the allowed retries, otherwise None. """ - logger.debug(f"Attempting to retrieve Work OA ID {openalex_id} with {retries} retries (delay ~{delay}s).") + logger.debug( + f"Attempting to retrieve Work OA ID {openalex_id} with {retries} retries (delay ~{delay}s)." + ) for attempt in range(retries): - logger.debug(f"get_work_with_retry: Attempt {attempt + 1}/{retries} for OA ID {openalex_id}") + logger.debug( + f"get_work_with_retry: Attempt {attempt + 1}/{retries} for OA ID {openalex_id}" + ) work = work_repo.get_by_openalex_id(openalex_id=openalex_id) if work: - logger.debug(f"get_work_with_retry: Found Work OA ID {openalex_id} (DB ID: {work.id}) on attempt {attempt + 1}.") + logger.debug( + f"get_work_with_retry: Found Work OA ID {openalex_id} (DB ID: {work.id}) on attempt {attempt + 1}." + ) return work # Log a warning and wait before the next attempt. 
- wait_time = delay * (attempt + 1) # Simple linear backoff for logging clarity + wait_time = delay * (attempt + 1) # Simple linear backoff for logging clarity logger.warning( f"get_work_with_retry: Work OA ID {openalex_id} not found (Attempt {attempt + 1}/{retries}). " f"Waiting {wait_time:.1f}s before next attempt..." ) time.sleep(wait_time) # If the loop completes without finding the work. - logger.error(f"get_work_with_retry: Failed to find Work OA ID {openalex_id} after {retries} attempts.") + logger.error( + f"get_work_with_retry: Failed to find Work OA ID {openalex_id} after {retries} attempts." + ) return None # --- Celery Tasks --- + @celery_app.task( - bind=True, # Make 'self' (task instance) available. - autoretry_for=(ApiClientError,), # Automatically retry OpenAlex API client errors. - retry_backoff=True, # Use exponential backoff for retries. - max_retries=5, # Limit automatic retries for API errors. Retries for deadlocks are handled manually. - acks_late=True, # Acknowledge task only after completion/failure (for reliability). - task_reject_on_worker_lost=True # Requeue task if the worker processing it dies. + bind=True, # Make 'self' (task instance) available. + autoretry_for=(ApiClientError,), # Automatically retry OpenAlex API client errors. + retry_backoff=True, # Use exponential backoff for retries. + max_retries=5, # Limit automatic retries for API errors. Retries for deadlocks are handled manually. + acks_late=True, # Acknowledge task only after completion/failure (for reliability). + task_reject_on_worker_lost=True, # Requeue task if the worker processing it dies. ) def process_work_deeply_task( self, - openalex_id: str, # The OpenAlex ID of the Work to process in this task. - primary_work_oa_id: str, # The OpenAlex ID of the 'parent' work that led to this one. - relationship_type: str, # How this work relates to the primary ('citation' or 'reference'). 
- initiating_doi_ref_id: Optional[int] = None, # Optional DB ID of the initiating DoiReference. - depth: int = 0, # Current recursion depth (0 is the initial work). - visited_ids: Optional[List[str]] = None, # List of OA IDs already processed in this chain to prevent cycles. + openalex_id: str, # The OpenAlex ID of the Work to process in this task. + primary_work_oa_id: str, # The OpenAlex ID of the 'parent' work that led to this one. + relationship_type: str, # How this work relates to the primary ('citation' or 'reference'). + initiating_doi_ref_id: Optional[ + int + ] = None, # Optional DB ID of the initiating DoiReference. + depth: int = 0, # Current recursion depth (0 is the initial work). + visited_ids: Optional[ + List[str] + ] = None, # List of OA IDs already processed in this chain to prevent cycles. ): """ Celery task to fetch, process, and store details for a specific scholarly work @@ -183,7 +225,11 @@ def process_work_deeply_task( depth: The current recursion depth. visited_ids: A list of OpenAlex IDs already visited in the current processing chain. """ - task_id = self.request.id if hasattr(self, 'request') and self.request.id else 'UNKNOWN_TASK_ID' + task_id = ( + self.request.id + if hasattr(self, "request") and self.request.id + else "UNKNOWN_TASK_ID" + ) # Use a set for efficient 'in' checks for visited IDs. visited_set: Set[str] = set(visited_ids) if visited_ids is not None else set() log_prefix = f"Task {task_id} (Work OA:{openalex_id}, Depth:{depth}, Rel:{relationship_type}, PrimOA:{primary_work_oa_id})" @@ -199,18 +245,26 @@ def process_work_deeply_task( # Check if the maximum recursion depth has been exceeded. if depth > MAX_RECURSION_DEPTH: - logger.warning(f"{log_prefix}: Skipping, maximum recursion depth ({MAX_RECURSION_DEPTH}) reached.") + logger.warning( + f"{log_prefix}: Skipping, maximum recursion depth ({MAX_RECURSION_DEPTH}) reached." + ) raise Ignore() # Add the current work ID to the set for this task and potential children. 
visited_set.add(openalex_id) # --- Initialization --- - db: Session | None = None # Database session for this task. - root_chain: Optional[DiscoveryChain] = None # Discovery chain tracker for this task. - current_work_db: Optional[Work] = None # DB record for the work being processed (openalex_id). - primary_work_db: Optional[Work] = None # DB record for the parent work (primary_work_oa_id). - discovery_chain_service: DiscoveryChainService | None = None # Service instance. + db: Session | None = None # Database session for this task. + root_chain: Optional[DiscoveryChain] = ( + None # Discovery chain tracker for this task. + ) + current_work_db: Optional[Work] = ( + None # DB record for the work being processed (openalex_id). + ) + primary_work_db: Optional[Work] = ( + None # DB record for the parent work (primary_work_oa_id). + ) + discovery_chain_service: DiscoveryChainService | None = None # Service instance. try: # --- Setup Database Session and Services --- @@ -224,45 +278,73 @@ def process_work_deeply_task( # --- Track Progress with DiscoveryChain --- chain_params = { - "task_name": self.name, "openalex_id": openalex_id, "primary_oa_id": primary_work_oa_id, - "type": relationship_type, "depth": depth, "initiating_doi_ref_id": initiating_doi_ref_id, + "task_name": self.name, + "openalex_id": openalex_id, + "primary_oa_id": primary_work_oa_id, + "type": relationship_type, + "depth": depth, + "initiating_doi_ref_id": initiating_doi_ref_id, } - root_chain = discovery_chain_service.create_root_chain(db, "CELERY_LINKED_WORK_PROCESS", chain_params) + root_chain = discovery_chain_service.create_root_chain( + db, "CELERY_LINKED_WORK_PROCESS", chain_params + ) discovery_chain_service.start_chain(db, root_chain) - logger.info(f"{log_prefix}: Discovery chain {root_chain.id} created and started.") + logger.info( + f"{log_prefix}: Discovery chain {root_chain.id} created and started." 
+ ) # --- Get or Create the Database Record for the Current Work --- - logger.debug(f"{log_prefix}: Retrieving/creating database record for current work...") + logger.debug( + f"{log_prefix}: Retrieving/creating database record for current work..." + ) # Use repository method that handles potential race conditions during creation. current_work_db = work_repo.get_or_create_by_openalex_id( openalex_id=openalex_id, - obj_in_data={"openalex_id": openalex_id} # Provide minimal data for creation if needed. + obj_in_data={ + "openalex_id": openalex_id + }, # Provide minimal data for creation if needed. ) # The repo method ensures the object has an ID after returning. if current_work_db.id is None: - # This case should ideally not happen if get_or_create works correctly. - error_msg = f"Critical: Work ID is None after get_or_create for OA ID {openalex_id}" - logger.error(f"{log_prefix}: {error_msg}") - discovery_chain_service.fail_chain(db, root_chain, error_msg) - db.commit() - raise RuntimeError(error_msg) # Fail the task deterministically. - logger.debug(f"{log_prefix}: Current work DB record obtained/created (ID: {current_work_db.id}).") + # This case should ideally not happen if get_or_create works correctly. + error_msg = ( + f"Critical: Work ID is None after get_or_create for OA ID {openalex_id}" + ) + logger.error(f"{log_prefix}: {error_msg}") + discovery_chain_service.fail_chain(db, root_chain, error_msg) + db.commit() + raise RuntimeError(error_msg) # Fail the task deterministically. + logger.debug( + f"{log_prefix}: Current work DB record obtained/created (ID: {current_work_db.id})." + ) # Associate the work record with the discovery chain. 
- discovery_chain_service.associate_entity(db, root_chain, current_work_db, is_direct=True) + discovery_chain_service.associate_entity( + db, root_chain, current_work_db, is_direct=True + ) # --- Retrieve the Database Record for the Primary Work --- - logger.debug(f"{log_prefix}: Retrieving primary work DB record ({primary_work_oa_id}) with retry...") + logger.debug( + f"{log_prefix}: Retrieving primary work DB record ({primary_work_oa_id}) with retry..." + ) # Use the helper function to handle potential delays in visibility. - primary_work_db = get_work_with_retry(work_repo, primary_work_oa_id, retries=5, delay=5.0) + primary_work_db = get_work_with_retry( + work_repo, primary_work_oa_id, retries=5, delay=5.0 + ) if not primary_work_db: # If the primary work cannot be found after retries, the task cannot proceed. - error_msg = f"Primary work {primary_work_oa_id} not found in DB after retries." + error_msg = ( + f"Primary work {primary_work_oa_id} not found in DB after retries." + ) logger.error(f"{log_prefix}: {error_msg}") discovery_chain_service.fail_chain(db, root_chain, error_msg) - db.commit() # Commit the failure status of the chain. - raise Ignore() # Ignore the task; retrying won't help if the primary is missing. - logger.debug(f"{log_prefix}: Primary work DB record found (ID: {primary_work_db.id}).") - discovery_chain_service.associate_entity(db, root_chain, primary_work_db, is_direct=False) + db.commit() # Commit the failure status of the chain. + raise Ignore() # Ignore the task; retrying won't help if the primary is missing. + logger.debug( + f"{log_prefix}: Primary work DB record found (ID: {primary_work_db.id})." + ) + discovery_chain_service.associate_entity( + db, root_chain, primary_work_db, is_direct=False + ) # --- Create Citation Link if Applicable --- # Ensure both work records have database IDs before creating the relationship. 
@@ -279,36 +361,60 @@ def process_work_deeply_task( citing_id, cited_id = current_work_db.id, primary_work_db.id rel_desc = f"Current(ID:{citing_id}) cites Primary(ID:{cited_id})" else: - logger.warning(f"{log_prefix}: Invalid relationship_type '{relationship_type}'. Cannot create citation link.") + logger.warning( + f"{log_prefix}: Invalid relationship_type '{relationship_type}'. Cannot create citation link." + ) # If IDs were determined, attempt to create the WorkCitation record. if citing_id is not None and cited_id is not None: - logger.debug(f"{log_prefix}: Checking/creating citation link: {rel_desc}") - try: - # Check if the citation relationship already exists. - existing_citation = db.query(WorkCitation).filter_by(citing_work_id=citing_id, cited_work_id=cited_id).first() - if not existing_citation: - # Create and add the new citation record. - citation_input_data = {"citing_work_id": citing_id, "cited_work_id": cited_id} - citation_db = WorkCitation(**citation_input_data) - db.add(citation_db) - # Flush to assign an ID to citation_db, required for association. - db.flush() - logger.info(f"{log_prefix}: Created WorkCitation link: {rel_desc} (ID: {citation_db.id})") - discovery_chain_service.associate_entity(db, root_chain, citation_db, is_direct=False) - else: - logger.debug(f"{log_prefix}: WorkCitation link already exists: {rel_desc}") - except IntegrityError as ie: - # Catch potential unique constraint violations if created concurrently. - logger.warning(f"{log_prefix}: IntegrityError creating WorkCitation ({rel_desc}), likely created concurrently. Rolling back flush and proceeding. Details: {ie}") - db.rollback() # Rollback the flush attempt. - except Exception as e_citation: - # Log other errors during citation creation but proceed with work processing. - logger.error(f"{log_prefix}: Error creating/flushing WorkCitation ({rel_desc}): {e_citation}", exc_info=True) - db.rollback() # Rollback potential partial changes. 
+ logger.debug( + f"{log_prefix}: Checking/creating citation link: {rel_desc}" + ) + try: + # Check if the citation relationship already exists. + existing_citation = ( + db.query(WorkCitation) + .filter_by(citing_work_id=citing_id, cited_work_id=cited_id) + .first() + ) + if not existing_citation: + # Create and add the new citation record. + citation_input_data = { + "citing_work_id": citing_id, + "cited_work_id": cited_id, + } + citation_db = WorkCitation(**citation_input_data) + db.add(citation_db) + # Flush to assign an ID to citation_db, required for association. + db.flush() + logger.info( + f"{log_prefix}: Created WorkCitation link: {rel_desc} (ID: {citation_db.id})" + ) + discovery_chain_service.associate_entity( + db, root_chain, citation_db, is_direct=False + ) + else: + logger.debug( + f"{log_prefix}: WorkCitation link already exists: {rel_desc}" + ) + except IntegrityError as ie: + # Catch potential unique constraint violations if created concurrently. + logger.warning( + f"{log_prefix}: IntegrityError creating WorkCitation ({rel_desc}), likely created concurrently. Rolling back flush and proceeding. Details: {ie}" + ) + db.rollback() # Rollback the flush attempt. + except Exception as e_citation: + # Log other errors during citation creation but proceed with work processing. + logger.error( + f"{log_prefix}: Error creating/flushing WorkCitation ({rel_desc}): {e_citation}", + exc_info=True, + ) + db.rollback() # Rollback potential partial changes. else: # This should not happen if previous checks passed. - logger.error(f"{log_prefix}: Missing DB ID for current ({current_work_db.id}) or primary ({primary_work_db.id}) work. Cannot create citation link.") + logger.error( + f"{log_prefix}: Missing DB ID for current ({current_work_db.id}) or primary ({primary_work_db.id}) work. Cannot create citation link." 
+ ) # --- Fetch and Process Full Work Details from OpenAlex --- logger.debug(f"{log_prefix}: Fetching full work details from OpenAlex API...") @@ -317,84 +423,121 @@ def process_work_deeply_task( # Call the OpenAlex client to get detailed work data. full_work_data = openalex_client.get_work_details(openalex_id) if full_work_data: - logger.debug(f"{log_prefix}: Successfully fetched full details from OpenAlex.") + logger.debug( + f"{log_prefix}: Successfully fetched full details from OpenAlex." + ) else: - logger.warning(f"{log_prefix}: No detailed data returned from OpenAlex API.") + logger.warning( + f"{log_prefix}: No detailed data returned from OpenAlex API." + ) except ApiClientError as api_details_err: # Let Celery's autoretry handle API client errors. - logger.warning(f"{log_prefix}: API error fetching details: {api_details_err}. Task will retry.") + logger.warning( + f"{log_prefix}: API error fetching details: {api_details_err}. Task will retry." + ) raise api_details_err except Exception as api_err: - # Catch other unexpected errors during API call. - logger.error(f"{log_prefix}: Unexpected error fetching details from OpenAlex: {api_err}", exc_info=True) - # Raise to allow potential Celery retry based on general Exception handling, or fail. - raise api_err + # Catch other unexpected errors during API call. + logger.error( + f"{log_prefix}: Unexpected error fetching details from OpenAlex: {api_err}", + exc_info=True, + ) + # Raise to allow potential Celery retry based on general Exception handling, or fail. + raise api_err # If no data was fetched (even after potential retries), stop processing this work. if not full_work_data: - logger.warning(f"{log_prefix}: Could not fetch full details for work. Stopping further processing for this work.") - discovery_chain_service.complete_chain(db, root_chain, status_message="Completed - No detailed data from API") + logger.warning( + f"{log_prefix}: Could not fetch full details for work. 
Stopping further processing for this work." + ) + discovery_chain_service.complete_chain( + db, root_chain, status_message="Completed - No detailed data from API" + ) db.commit() - raise Ignore() # Stop processing this task instance. + raise Ignore() # Stop processing this task instance. # --- Process the Fetched Data using ScholarlyProcessingService --- - logger.debug(f"{log_prefix}: Calling scholarly_processor.process_openalex_work_data...") + logger.debug( + f"{log_prefix}: Calling scholarly_processor.process_openalex_work_data..." + ) try: # Pass the DB session, the existing Work DB record, the fetched API data, and the parent chain. # The service will update the work_db object with details and handle related entities. - referenced_oa_ids, _, cited_by_url_for_tasks = scholarly_processor.process_openalex_work_data( - db=db, - work_db=current_work_db, # Pass the existing DB object to be updated. - work_api_data=full_work_data, - parent_chain=root_chain # Pass the chain for detailed tracking within the service. + referenced_oa_ids, _, cited_by_url_for_tasks = ( + scholarly_processor.process_openalex_work_data( + db=db, + work_db=current_work_db, # Pass the existing DB object to be updated. + work_api_data=full_work_data, + parent_chain=root_chain, # Pass the chain for detailed tracking within the service. + ) + ) + logger.debug( + f"{log_prefix}: scholarly_processor.process_openalex_work_data completed." ) - logger.debug(f"{log_prefix}: scholarly_processor.process_openalex_work_data completed.") except OperationalError as op_err: - # Specifically check for deadlocks (PostgreSQL error code '40P01'). - pgcode = getattr(op_err.orig, 'pgcode', None) - if pgcode == '40P01': - logger.warning(f"{log_prefix}: DEADLOCK detected during scholarly processing. Raising OperationalError for Celery retry.") - # Re-raise the OperationalError; manual retry logic is below in the main except block. - raise op_err - else: - # Handle other database operational errors. 
- logger.error(f"{log_prefix}: Database OperationalError during scholarly processing (Code: {pgcode}): {op_err}", exc_info=True) - discovery_chain_service.fail_chain(db, root_chain, f"DB OperationalError: {str(op_err)[:150]}") - db.commit() - raise Ignore() # Do not retry non-deadlock operational errors automatically. + # Specifically check for deadlocks (PostgreSQL error code '40P01'). + pgcode = getattr(op_err.orig, "pgcode", None) + if pgcode == "40P01": + logger.warning( + f"{log_prefix}: DEADLOCK detected during scholarly processing. Raising OperationalError for Celery retry." + ) + # Re-raise the OperationalError; manual retry logic is below in the main except block. + raise op_err + else: + # Handle other database operational errors. + logger.error( + f"{log_prefix}: Database OperationalError during scholarly processing (Code: {pgcode}): {op_err}", + exc_info=True, + ) + discovery_chain_service.fail_chain( + db, root_chain, f"DB OperationalError: {str(op_err)[:150]}" + ) + db.commit() + raise Ignore() # Do not retry non-deadlock operational errors automatically. except Exception as scholarly_err: - # Catch other unexpected errors during the processing service call. - logger.error(f"{log_prefix}: EXCEPTION during scholarly processing: {scholarly_err}", exc_info=True) - # Fail the chain and ignore the task for most processing errors. - error_msg = f"Scholarly processing error: {str(scholarly_err)[:150]}" - discovery_chain_service.fail_chain(db, root_chain, error_msg) - db.commit() - raise Ignore() + # Catch other unexpected errors during the processing service call. + logger.error( + f"{log_prefix}: EXCEPTION during scholarly processing: {scholarly_err}", + exc_info=True, + ) + # Fail the chain and ignore the task for most processing errors. + error_msg = f"Scholarly processing error: {str(scholarly_err)[:150]}" + discovery_chain_service.fail_chain(db, root_chain, error_msg) + db.commit() + raise Ignore() - logger.info(f"{log_prefix}: Scholarly data processed. 
Found {len(referenced_oa_ids)} referenced works to potentially enqueue.") + logger.info( + f"{log_prefix}: Scholarly data processed. Found {len(referenced_oa_ids)} referenced works to potentially enqueue." + ) # --- Commit Main Transaction and Finalize Chain --- # Commit all changes made so far (work creation/update, citation link, associated entities via service). discovery_chain_service.complete_chain(db, root_chain) db.commit() - logger.info(f"{log_prefix}: Main transaction committed. Discovery chain {root_chain.id} completed.") + logger.info( + f"{log_prefix}: Main transaction committed. Discovery chain {root_chain.id} completed." + ) # --- Enqueue Child Tasks for Related Works --- next_depth = depth + 1 # Pass the updated list of visited IDs to children. next_visited_list = list(visited_set) if next_depth <= MAX_RECURSION_DEPTH: - logger.debug(f"{log_prefix}: Enqueuing child tasks for referenced works at depth {next_depth}") + logger.debug( + f"{log_prefix}: Enqueuing child tasks for referenced works at depth {next_depth}" + ) # Enqueue tasks for works referenced by the current work. for ref_oa_id in referenced_oa_ids: - if ref_oa_id not in visited_set: # Avoid re-enqueuing visited works. - logger.debug(f"{log_prefix}: Enqueueing child task for referenced OA ID: {ref_oa_id}") + if ref_oa_id not in visited_set: # Avoid re-enqueuing visited works. + logger.debug( + f"{log_prefix}: Enqueueing child task for referenced OA ID: {ref_oa_id}" + ) # Note: The 'primary' work for this child task is the *current* work. # The relationship is 'citation' because the current work cited the ref_oa_id. process_work_deeply_task.delay( openalex_id=ref_oa_id, - primary_work_oa_id=openalex_id, # Current work is the primary for the child. - relationship_type="citation", # Current work CITED ref_oa_id. + primary_work_oa_id=openalex_id, # Current work is the primary for the child. + relationship_type="citation", # Current work CITED ref_oa_id. 
initiating_doi_ref_id=initiating_doi_ref_id, depth=next_depth, visited_ids=next_visited_list, @@ -411,7 +554,9 @@ def process_work_deeply_task( # ) else: - logger.info(f"{log_prefix}: Maximum depth reached, not enqueuing further child tasks.") + logger.info( + f"{log_prefix}: Maximum depth reached, not enqueuing further child tasks." + ) logger.info(f"{log_prefix}: Task completed successfully.") @@ -423,70 +568,127 @@ def process_work_deeply_task( except ApiClientError as e: # Handled by Celery autoretry based on task decorator. # Logged here for context, but re-raised implicitly by autoretry. - logger.error(f"{log_prefix}: API Client Error occurred: {e}. Autoretry mechanism active.") + logger.error( + f"{log_prefix}: API Client Error occurred: {e}. Autoretry mechanism active." + ) # Attempt to mark chain as FAILED in case retries are exhausted. if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() # Ensure session is active for update. - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"API Error (final attempt?): {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after API error: {e_fail}", exc_info=False); db.rollback() + try: + if not db.is_active: + db = SessionLocal() # Ensure session is active for update. + chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"API Error (final attempt?): {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after API error: {e_fail}", + exc_info=False, + ) + db.rollback() # Autoretry decorator handles raising the retry exception. 
except OperationalError as e: # Catch database operational errors, specifically deadlocks. - pgcode = getattr(e.orig, 'pgcode', None) - if pgcode == '40P01': + pgcode = getattr(e.orig, "pgcode", None) + if pgcode == "40P01": # Handle deadlock: Manually trigger a retry with a backoff. retry_count = self.request.retries # Increase countdown significantly for deadlocks. countdown = int((retry_count + 1) * 10) + 10 - logger.warning(f"{log_prefix}: DEADLOCK detected (Retry {retry_count + 1}/{self.max_retries}). Retrying task in {countdown}s.") + logger.warning( + f"{log_prefix}: DEADLOCK detected (Retry {retry_count + 1}/{self.max_retries}). Retrying task in {countdown}s." + ) # Manually raise the Retry exception. raise self.retry(exc=e, countdown=countdown) else: # Handle other operational errors (e.g., connection issues not covered by retry). - logger.error(f"{log_prefix}: DATABASE OperationalError (non-deadlock, Code: {pgcode}): {e}", exc_info=True) + logger.error( + f"{log_prefix}: DATABASE OperationalError (non-deadlock, Code: {pgcode}): {e}", + exc_info=True, + ) if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"DB OperationalError: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after DB OperationalError: {e_fail}", exc_info=False); db.rollback() - raise Ignore() # Do not retry other operational errors automatically. 
+ try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid( + db, root_chain.id + ) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"DB OperationalError: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after DB OperationalError: {e_fail}", + exc_info=False, + ) + db.rollback() + raise Ignore() # Do not retry other operational errors automatically. except (SQLAlchemyError, ValueError, RuntimeError) as e: # Catch other specific database, value, or runtime errors. logger.error(f"{log_prefix}: DATABASE/VALUE/RUNTIME Error: {e}", exc_info=True) if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"DB/Value/Runtime Error: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after DB/Value/Runtime error: {e_fail}", exc_info=False); db.rollback() - logger.warning(f"{log_prefix}: Task will be ignored due to encountered DB/Value/Runtime error.") - raise Ignore() # Stop processing for these types of errors. 
+ try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"DB/Value/Runtime Error: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after DB/Value/Runtime error: {e_fail}", + exc_info=False, + ) + db.rollback() + logger.warning( + f"{log_prefix}: Task will be ignored due to encountered DB/Value/Runtime error." + ) + raise Ignore() # Stop processing for these types of errors. except Exception as e: # Catch any other unexpected errors. logger.exception(f"{log_prefix}: Unexpected critical error: {e}") if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"Unexpected Error: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after critical error: {e_fail}", exc_info=False); db.rollback() + try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"Unexpected Error: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after critical error: {e_fail}", + exc_info=False, + ) + db.rollback() # Attempt a generic retry for unexpected errors. 
try: - raise self.retry(exc=e, countdown=int(self.request.retries * 5) + 5) + raise self.retry(exc=e, countdown=int(self.request.retries * 5) + 5) except Exception as retry_err: - logger.error(f"{log_prefix}: Failed to initiate retry after unexpected error: {retry_err}. Ignoring task.") - raise Ignore() + logger.error( + f"{log_prefix}: Failed to initiate retry after unexpected error: {retry_err}. Ignoring task." + ) + raise Ignore() finally: # --- Cleanup --- # Ensure the database session is always closed. @@ -495,22 +697,26 @@ def process_work_deeply_task( db.close() logger.debug(f"{log_prefix}: Database session closed.") except Exception as close_err: - logger.error(f"{log_prefix}: Error closing database session: {close_err}") + logger.error( + f"{log_prefix}: Error closing database session: {close_err}" + ) @celery_app.task( bind=True, - autoretry_for=(ApiClientError,), # Retry on API client errors. + autoretry_for=(ApiClientError,), # Retry on API client errors. retry_backoff=True, - max_retries=5, # Increased retries for API/deadlock potential. + max_retries=5, # Increased retries for API/deadlock potential. acks_late=True, task_reject_on_worker_lost=True, ) def process_citing_works_list_task( self, - primary_work_oa_id: str, # The OpenAlex ID of the work *being cited*. - cited_by_api_url: str, # The OpenAlex API URL to fetch the list of citing works. - initiating_doi_ref_id: Optional[int] = None # Optional DB ID of the initiating DoiReference. + primary_work_oa_id: str, # The OpenAlex ID of the work *being cited*. + cited_by_api_url: str, # The OpenAlex API URL to fetch the list of citing works. + initiating_doi_ref_id: Optional[ + int + ] = None, # Optional DB ID of the initiating DoiReference. ): """ Celery task to fetch and process a list of works that cite a given primary work. @@ -530,9 +736,15 @@ def process_citing_works_list_task( cited_by_api_url: The specific OpenAlex API endpoint URL to fetch the citing works list. 
initiating_doi_ref_id: Optional DB ID of the DoiReference that started the chain. """ - task_id = self.request.id if hasattr(self, 'request') and self.request.id else 'UNKNOWN_TASK_ID' + task_id = ( + self.request.id + if hasattr(self, "request") and self.request.id + else "UNKNOWN_TASK_ID" + ) log_prefix = f"Task {task_id} (CitedBy List for PrimOA:{primary_work_oa_id})" - logger.info(f"{log_prefix}: Starting processing of citing works list from URL: {cited_by_api_url}") + logger.info( + f"{log_prefix}: Starting processing of citing works list from URL: {cited_by_api_url}" + ) # --- Initialization --- db: Session | None = None @@ -550,38 +762,61 @@ def process_citing_works_list_task( # --- Create Root Discovery Chain --- chain_params = { - "task_name": self.name, "primary_oa_id": primary_work_oa_id, - "cited_by_url": cited_by_api_url, "initiating_doi_ref_id": initiating_doi_ref_id + "task_name": self.name, + "primary_oa_id": primary_work_oa_id, + "cited_by_url": cited_by_api_url, + "initiating_doi_ref_id": initiating_doi_ref_id, } - root_chain = discovery_chain_service.create_root_chain(db, "CELERY_CITING_WORKS_LIST", chain_params) + root_chain = discovery_chain_service.create_root_chain( + db, "CELERY_CITING_WORKS_LIST", chain_params + ) discovery_chain_service.start_chain(db, root_chain) - logger.info(f"{log_prefix}: Discovery chain {root_chain.id} created and started.") + logger.info( + f"{log_prefix}: Discovery chain {root_chain.id} created and started." + ) # --- Get Primary Work (the one being cited) --- - logger.debug(f"{log_prefix}: Retrieving primary work DB record ({primary_work_oa_id}) with retry...") - primary_work_db = get_work_with_retry(work_repo, primary_work_oa_id, retries=5, delay=5.0) + logger.debug( + f"{log_prefix}: Retrieving primary work DB record ({primary_work_oa_id}) with retry..." 
+ ) + primary_work_db = get_work_with_retry( + work_repo, primary_work_oa_id, retries=5, delay=5.0 + ) if not primary_work_db: error_msg = f"Primary work {primary_work_oa_id} (being cited) not found after retries." logger.error(f"{log_prefix}: {error_msg}") discovery_chain_service.fail_chain(db, root_chain, error_msg) db.commit() - raise Ignore() # Cannot proceed without the primary work record. - logger.debug(f"{log_prefix}: Primary work DB record found (ID: {primary_work_db.id}).") - discovery_chain_service.associate_entity(db, root_chain, primary_work_db, is_direct=False) + raise Ignore() # Cannot proceed without the primary work record. + logger.debug( + f"{log_prefix}: Primary work DB record found (ID: {primary_work_db.id})." + ) + discovery_chain_service.associate_entity( + db, root_chain, primary_work_db, is_direct=False + ) # --- Fetch Citing Works List from OpenAlex API --- logger.debug(f"{log_prefix}: Fetching citing works list from API...") citing_works_data: Optional[List[Dict[str, Any]]] = None try: # This likely involves pagination handling within the client. - citing_works_data = openalex_client.get_citing_works(citing_works_url=cited_by_api_url) - logger.debug(f"{log_prefix}: API call for citing works completed. Received {len(citing_works_data) if citing_works_data is not None else 'None'} items.") + citing_works_data = openalex_client.get_citing_works( + citing_works_url=cited_by_api_url + ) + logger.debug( + f"{log_prefix}: API call for citing works completed. Received {len(citing_works_data) if citing_works_data is not None else 'None'} items." + ) except ApiClientError as api_citing_err: - logger.warning(f"{log_prefix}: API error fetching citing works list: {api_citing_err}. Task will retry.") - raise api_citing_err # Let Celery autoretry handle this. + logger.warning( + f"{log_prefix}: API error fetching citing works list: {api_citing_err}. Task will retry." + ) + raise api_citing_err # Let Celery autoretry handle this. 
except Exception as api_err: - logger.error(f"{log_prefix}: Unexpected error fetching citing works from OpenAlex: {api_err}", exc_info=True) - raise api_err # Raise for potential generic retry or failure. + logger.error( + f"{log_prefix}: Unexpected error fetching citing works from OpenAlex: {api_err}", + exc_info=True, + ) + raise api_err # Raise for potential generic retry or failure. # Handle case where API call succeeded but returned None (e.g., client internal error). if citing_works_data is None: @@ -590,17 +825,25 @@ def process_citing_works_list_task( discovery_chain_service.fail_chain(db, root_chain, error_msg) db.commit() # Use RuntimeError to indicate a failure state that shouldn't be retried by API handler. - raise RuntimeError(f"API failed to return citing works data from {cited_by_api_url}") + raise RuntimeError( + f"API failed to return citing works data from {cited_by_api_url}" + ) # Handle case where API returned an empty list. if not citing_works_data: - logger.info(f"{log_prefix}: No citing works found for primary work {primary_work_oa_id}.") - discovery_chain_service.complete_chain(db, root_chain, status_message="Completed - No citing works found") + logger.info( + f"{log_prefix}: No citing works found for primary work {primary_work_oa_id}." + ) + discovery_chain_service.complete_chain( + db, root_chain, status_message="Completed - No citing works found" + ) db.commit() - return # Task is successfully completed. + return # Task is successfully completed. # --- Process Each Citing Work Item --- - logger.info(f"{log_prefix}: Found {len(citing_works_data)} citing works. Processing each...") + logger.info( + f"{log_prefix}: Found {len(citing_works_data)} citing works. Processing each..." + ) processed_count = 0 error_count = 0 @@ -611,104 +854,180 @@ def process_citing_works_list_task( # Skip if essential ID is missing. 
if not citing_work_oa_id: - logger.warning(f"{log_prefix}: Skipping citing item due to missing/invalid OpenAlex ID: {citing_work_item.get('id')}") - error_count += 1 # Count as an error for reporting. + logger.warning( + f"{log_prefix}: Skipping citing item due to missing/invalid OpenAlex ID: {citing_work_item.get('id')}" + ) + error_count += 1 # Count as an error for reporting. continue - logger.debug(f"{log_prefix}: Processing citing work OA ID: {citing_work_oa_id}") + logger.debug( + f"{log_prefix}: Processing citing work OA ID: {citing_work_oa_id}" + ) # Use a database savepoint for processing each citing work individually. # This allows committing successful items even if others fail. nested_transaction = db.begin_nested() - citing_work_chain: Optional[DiscoveryChain] = None # Chain for this specific citing work. - wc_db: Optional[Work] = None # DB record for the citing work. + citing_work_chain: Optional[DiscoveryChain] = ( + None # Chain for this specific citing work. + ) + wc_db: Optional[Work] = None # DB record for the citing work. try: - # Create a child chain for this specific citing work. - citing_work_chain = discovery_chain_service.create_child_chain( - db, root_chain, "REL_CITING_WORK_FROM_LIST", {"citing_oa_id": citing_work_oa_id} - ) - # Prepare minimal data for creating the citing work record if it doesn't exist. - wc_input_data: Dict[str, Any] = {"openalex_id": citing_work_oa_id} - if citing_work_doi: wc_input_data["doi"] = citing_work_doi - if citing_work_item.get("title"): wc_input_data["title"] = citing_work_item.get("title")[:1024] # Truncate title if needed - if citing_work_item.get("publication_year"): wc_input_data["publication_year"] = citing_work_item.get("publication_year") - - logger.debug(f"{log_prefix}: Getting/creating citing work OA ID {citing_work_oa_id}...") - # Get or create the citing work record. 
- wc_db = work_repo.get_or_create_by_openalex_id(openalex_id=citing_work_oa_id, obj_in_data=wc_input_data) - if wc_db.id is None: - raise RuntimeError(f"Citing Work ID is None after get_or_create for OA ID {citing_work_oa_id}") - logger.debug(f"{log_prefix}: Got/created citing work DB record (ID: {wc_db.id}).") - discovery_chain_service.associate_entity(db, citing_work_chain, wc_db, is_direct=True) - - # Create the citation link (Citing Work -> Primary Work). - if wc_db.id is not None and primary_work_db.id is not None: - citing_id, cited_id = wc_db.id, primary_work_db.id # Wc cites W1 - rel_desc = f"CitingWork(ID:{citing_id}) cites PrimaryWork(ID:{cited_id})" - logger.debug(f"{log_prefix}: Checking/creating citation link: {rel_desc}") - try: - existing_citation = db.query(WorkCitation).filter_by(citing_work_id=citing_id, cited_work_id=cited_id).first() - if not existing_citation: - citation_db = WorkCitation(citing_work_id=citing_id, cited_work_id=cited_id) - db.add(citation_db) - db.flush() # Flush to get ID for association. - logger.info(f"{log_prefix}: Created WorkCitation link: {rel_desc} (ID: {citation_db.id})") - discovery_chain_service.associate_entity(db, citing_work_chain, citation_db, is_direct=False) - else: - logger.debug(f"{log_prefix}: WorkCitation link already exists: {rel_desc}") - except IntegrityError as ie_cite: - logger.warning(f"{log_prefix}: IntegrityError creating WorkCitation ({rel_desc}), likely created concurrently. Rolling back flush. Details: {ie_cite}") - db.rollback() # Rollback the specific flush. - except Exception as e_citation: - logger.error(f"{log_prefix}: Error creating/flushing WorkCitation ({rel_desc}): {e_citation}", exc_info=True) - db.rollback() # Rollback the specific flush. - - # Mark the child chain as complete and commit the savepoint. - discovery_chain_service.complete_chain(db, citing_work_chain) - nested_transaction.commit() # Commit changes for *this* citing work. 
- processed_count += 1 - logger.debug(f"{log_prefix}: Successfully processed and committed citing work {citing_work_oa_id}") + # Create a child chain for this specific citing work. + citing_work_chain = discovery_chain_service.create_child_chain( + db, + root_chain, + "REL_CITING_WORK_FROM_LIST", + {"citing_oa_id": citing_work_oa_id}, + ) + # Prepare minimal data for creating the citing work record if it doesn't exist. + wc_input_data: Dict[str, Any] = {"openalex_id": citing_work_oa_id} + if citing_work_doi: + wc_input_data["doi"] = citing_work_doi + if citing_work_item.get("title"): + wc_input_data["title"] = citing_work_item.get("title")[ + :1024 + ] # Truncate title if needed + if citing_work_item.get("publication_year"): + wc_input_data["publication_year"] = citing_work_item.get( + "publication_year" + ) + + logger.debug( + f"{log_prefix}: Getting/creating citing work OA ID {citing_work_oa_id}..." + ) + # Get or create the citing work record. + wc_db = work_repo.get_or_create_by_openalex_id( + openalex_id=citing_work_oa_id, obj_in_data=wc_input_data + ) + if wc_db.id is None: + raise RuntimeError( + f"Citing Work ID is None after get_or_create for OA ID {citing_work_oa_id}" + ) + logger.debug( + f"{log_prefix}: Got/created citing work DB record (ID: {wc_db.id})." + ) + discovery_chain_service.associate_entity( + db, citing_work_chain, wc_db, is_direct=True + ) + + # Create the citation link (Citing Work -> Primary Work). 
+ if wc_db.id is not None and primary_work_db.id is not None: + citing_id, cited_id = wc_db.id, primary_work_db.id # Wc cites W1 + rel_desc = ( + f"CitingWork(ID:{citing_id}) cites PrimaryWork(ID:{cited_id})" + ) + logger.debug( + f"{log_prefix}: Checking/creating citation link: {rel_desc}" + ) + try: + existing_citation = ( + db.query(WorkCitation) + .filter_by(citing_work_id=citing_id, cited_work_id=cited_id) + .first() + ) + if not existing_citation: + citation_db = WorkCitation( + citing_work_id=citing_id, cited_work_id=cited_id + ) + db.add(citation_db) + db.flush() # Flush to get ID for association. + logger.info( + f"{log_prefix}: Created WorkCitation link: {rel_desc} (ID: {citation_db.id})" + ) + discovery_chain_service.associate_entity( + db, citing_work_chain, citation_db, is_direct=False + ) + else: + logger.debug( + f"{log_prefix}: WorkCitation link already exists: {rel_desc}" + ) + except IntegrityError as ie_cite: + logger.warning( + f"{log_prefix}: IntegrityError creating WorkCitation ({rel_desc}), likely created concurrently. Rolling back flush. Details: {ie_cite}" + ) + db.rollback() # Rollback the specific flush. + except Exception as e_citation: + logger.error( + f"{log_prefix}: Error creating/flushing WorkCitation ({rel_desc}): {e_citation}", + exc_info=True, + ) + db.rollback() # Rollback the specific flush. + + # Mark the child chain as complete and commit the savepoint. + discovery_chain_service.complete_chain(db, citing_work_chain) + nested_transaction.commit() # Commit changes for *this* citing work. + processed_count += 1 + logger.debug( + f"{log_prefix}: Successfully processed and committed citing work {citing_work_oa_id}" + ) except Exception as e_wc: - # An error occurred processing this specific citing work. - error_count += 1 - logger.error(f"{log_prefix}: Failed processing citing work OA ID {citing_work_oa_id}: {e_wc}", exc_info=True) - # Rollback the savepoint for the failed item. 
- try: - logger.warning(f"{log_prefix}: Rolling back savepoint for failed citing work {citing_work_oa_id}.") - nested_transaction.rollback() - except Exception as rb_err: - # Log error during rollback itself, but continue. - logger.error(f"{log_prefix}: Error rolling back savepoint for failed citing work {citing_work_oa_id}: {rb_err}") - - # Attempt to mark the specific child chain as FAILED in a separate session/transaction. - if citing_work_chain: - try: - # Use a temporary session to avoid interference with main session state. - temp_db_fail = SessionLocal() - try: - # Re-fetch the chain in the new session. - chain_to_fail = discovery_chain_service.get_by_uuid(temp_db_fail, citing_work_chain.id) - if chain_to_fail: - discovery_chain_service.fail_chain(temp_db_fail, chain_to_fail, error_message=f"Savepoint failed: {str(e_wc)[:100]}") - temp_db_fail.commit() - logger.info(f"{log_prefix}: Marked child chain {citing_work_chain.id} as FAILED.") - else: - logger.error(f"{log_prefix}: Could not find child chain {citing_work_chain.id} in temp session to mark as FAILED.") - except Exception as fail_e: - logger.error(f"{log_prefix}: Failed to mark citing work chain {citing_work_chain.id} as FAILED: {fail_e}", exc_info=False) - temp_db_fail.rollback() - finally: - temp_db_fail.close() - except Exception as session_err: - logger.error(f"{log_prefix}: Failed to create temp session for child chain failure update: {session_err}") - - # Re-raise specific exceptions that should trigger a task retry (like deadlocks). - if isinstance(e_wc, OperationalError) and getattr(e_wc.orig, 'pgcode', None) == '40P01': - logger.warning(f"{log_prefix}: Deadlock detected within savepoint for {citing_work_oa_id}. Re-raising for task retry.") - # Re-raise the deadlock error to be caught by the main task exception handler. - raise e_wc - # Otherwise, the loop continues to the next citing work. + # An error occurred processing this specific citing work. 
+ error_count += 1 + logger.error( + f"{log_prefix}: Failed processing citing work OA ID {citing_work_oa_id}: {e_wc}", + exc_info=True, + ) + # Rollback the savepoint for the failed item. + try: + logger.warning( + f"{log_prefix}: Rolling back savepoint for failed citing work {citing_work_oa_id}." + ) + nested_transaction.rollback() + except Exception as rb_err: + # Log error during rollback itself, but continue. + logger.error( + f"{log_prefix}: Error rolling back savepoint for failed citing work {citing_work_oa_id}: {rb_err}" + ) + + # Attempt to mark the specific child chain as FAILED in a separate session/transaction. + if citing_work_chain: + try: + # Use a temporary session to avoid interference with main session state. + temp_db_fail = SessionLocal() + try: + # Re-fetch the chain in the new session. + chain_to_fail = discovery_chain_service.get_by_uuid( + temp_db_fail, citing_work_chain.id + ) + if chain_to_fail: + discovery_chain_service.fail_chain( + temp_db_fail, + chain_to_fail, + error_message=f"Savepoint failed: {str(e_wc)[:100]}", + ) + temp_db_fail.commit() + logger.info( + f"{log_prefix}: Marked child chain {citing_work_chain.id} as FAILED." + ) + else: + logger.error( + f"{log_prefix}: Could not find child chain {citing_work_chain.id} in temp session to mark as FAILED." + ) + except Exception as fail_e: + logger.error( + f"{log_prefix}: Failed to mark citing work chain {citing_work_chain.id} as FAILED: {fail_e}", + exc_info=False, + ) + temp_db_fail.rollback() + finally: + temp_db_fail.close() + except Exception as session_err: + logger.error( + f"{log_prefix}: Failed to create temp session for child chain failure update: {session_err}" + ) + + # Re-raise specific exceptions that should trigger a task retry (like deadlocks). + if ( + isinstance(e_wc, OperationalError) + and getattr(e_wc.orig, "pgcode", None) == "40P01" + ): + logger.warning( + f"{log_prefix}: Deadlock detected within savepoint for {citing_work_oa_id}. Re-raising for task retry." 
+ ) + # Re-raise the deadlock error to be caught by the main task exception handler. + raise e_wc + # Otherwise, the loop continues to the next citing work. # --- Finalize Root Chain Status --- # After processing all items, set the final status of the root chain based on errors. @@ -723,90 +1042,164 @@ def process_citing_works_list_task( # Commit the main transaction (including successful savepoints and final root chain status). db.commit() - logger.info(f"{log_prefix}: Main transaction committed. Processed: {processed_count}, Errors: {error_count}.") + logger.info( + f"{log_prefix}: Main transaction committed. Processed: {processed_count}, Errors: {error_count}." + ) # --- Exception Handling for the Entire Task --- except Ignore: - logger.info(f"{log_prefix}: Task processing ignored (e.g., primary work missing).") - # Attempt to mark chain as COMPLETED if it was left PROCESSING during an Ignore scenario. - if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_update = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_update and chain_to_update.status == 'PROCESSING': - logger.info(f"{log_prefix}: Marking root chain {chain_to_update.id} as COMPLETED (due to Ignore).") - discovery_chain_service.complete_chain(db, chain_to_update, status_message="Ignored") - db.commit() - except Exception as e_complete: logger.error(f"{log_prefix}: Error updating chain status after Ignore: {e_complete}", exc_info=False); db.rollback() + logger.info( + f"{log_prefix}: Task processing ignored (e.g., primary work missing)." + ) + # Attempt to mark chain as COMPLETED if it was left PROCESSING during an Ignore scenario. 
+ if db and root_chain and discovery_chain_service: + try: + if not db.is_active: + db = SessionLocal() + chain_to_update = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_update and chain_to_update.status == "PROCESSING": + logger.info( + f"{log_prefix}: Marking root chain {chain_to_update.id} as COMPLETED (due to Ignore)." + ) + discovery_chain_service.complete_chain( + db, chain_to_update, status_message="Ignored" + ) + db.commit() + except Exception as e_complete: + logger.error( + f"{log_prefix}: Error updating chain status after Ignore: {e_complete}", + exc_info=False, + ) + db.rollback() except (ApiClientError, RuntimeError) as e: - # Handle API errors (caught by autoretry) or RuntimeErrors (e.g., failed API fetch). - logger.error(f"{log_prefix}: API Client or Runtime Error during task execution: {e}", exc_info=isinstance(e, RuntimeError)) - if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"API/Runtime Error: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after API/Runtime error: {e_fail}", exc_info=False); db.rollback() - # Re-raise API errors for autoretry; treat RuntimeErrors as non-retryable here. - if isinstance(e, ApiClientError): - logger.info(f"{log_prefix}: Raising ApiClientError for Celery autoretry.") - raise e # Let autoretry handle it. - else: # For RuntimeError - logger.warning(f"{log_prefix}: Encountered RuntimeError, task will be ignored.") - raise Ignore() + # Handle API errors (caught by autoretry) or RuntimeErrors (e.g., failed API fetch). 
+ logger.error( + f"{log_prefix}: API Client or Runtime Error during task execution: {e}", + exc_info=isinstance(e, RuntimeError), + ) + if db and root_chain and discovery_chain_service: + try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"API/Runtime Error: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after API/Runtime error: {e_fail}", + exc_info=False, + ) + db.rollback() + # Re-raise API errors for autoretry; treat RuntimeErrors as non-retryable here. + if isinstance(e, ApiClientError): + logger.info(f"{log_prefix}: Raising ApiClientError for Celery autoretry.") + raise e # Let autoretry handle it. + else: # For RuntimeError + logger.warning( + f"{log_prefix}: Encountered RuntimeError, task will be ignored." + ) + raise Ignore() except OperationalError as e: # Catch deadlocks or other operational errors occurring outside the item loop. - pgcode = getattr(e.orig, 'pgcode', None) - if pgcode == '40P01': + pgcode = getattr(e.orig, "pgcode", None) + if pgcode == "40P01": # Handle deadlock: Manually trigger a retry. retry_count = self.request.retries countdown = int((retry_count + 1) * 10) + 10 - logger.warning(f"{log_prefix}: DEADLOCK detected (Retry {retry_count + 1}/{self.max_retries}). Retrying task in {countdown}s.") + logger.warning( + f"{log_prefix}: DEADLOCK detected (Retry {retry_count + 1}/{self.max_retries}). Retrying task in {countdown}s." + ) raise self.retry(exc=e, countdown=countdown) else: # Handle other operational errors. 
- logger.error(f"{log_prefix}: DATABASE OperationalError (non-deadlock, Code: {pgcode}): {e}", exc_info=True) + logger.error( + f"{log_prefix}: DATABASE OperationalError (non-deadlock, Code: {pgcode}): {e}", + exc_info=True, + ) if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"DB OperationalError: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after DB OperationalError: {e_fail}", exc_info=False); db.rollback() - raise Ignore() # Do not retry other operational errors. + try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid( + db, root_chain.id + ) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"DB OperationalError: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after DB OperationalError: {e_fail}", + exc_info=False, + ) + db.rollback() + raise Ignore() # Do not retry other operational errors. except (SQLAlchemyError, ValueError) as e: # Catch other specific database or value errors. 
logger.error(f"{log_prefix}: DATABASE/VALUE Error: {e}", exc_info=True) if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"DB/Value Error: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after DB/Value error: {e_fail}", exc_info=False); db.rollback() - logger.warning(f"{log_prefix}: Task will be ignored due to encountered DB/Value error.") + try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"DB/Value Error: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after DB/Value error: {e_fail}", + exc_info=False, + ) + db.rollback() + logger.warning( + f"{log_prefix}: Task will be ignored due to encountered DB/Value error." + ) raise Ignore() except Exception as e: - # Catch any other unexpected errors. - logger.exception(f"{log_prefix}: Unexpected critical error: {e}") - if db and root_chain and discovery_chain_service: - try: - if not db.is_active: db = SessionLocal() - chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) - if chain_to_fail and chain_to_fail.status not in ['COMPLETED', 'FAILED']: - discovery_chain_service.fail_chain(db, chain_to_fail, f"Unexpected Error: {str(e)[:150]}") - db.commit() - except Exception as e_fail: logger.error(f"{log_prefix}: Error marking chain failed after critical error: {e_fail}", exc_info=False); db.rollback() - # Attempt a generic retry. 
- try: - raise self.retry(exc=e, countdown=int(self.request.retries * 5) + 5) - except Exception as retry_err: - logger.error(f"{log_prefix}: Failed to initiate retry after unexpected error: {retry_err}. Ignoring task.") - raise Ignore() + # Catch any other unexpected errors. + logger.exception(f"{log_prefix}: Unexpected critical error: {e}") + if db and root_chain and discovery_chain_service: + try: + if not db.is_active: + db = SessionLocal() + chain_to_fail = discovery_chain_service.get_by_uuid(db, root_chain.id) + if chain_to_fail and chain_to_fail.status not in [ + "COMPLETED", + "FAILED", + ]: + discovery_chain_service.fail_chain( + db, chain_to_fail, f"Unexpected Error: {str(e)[:150]}" + ) + db.commit() + except Exception as e_fail: + logger.error( + f"{log_prefix}: Error marking chain failed after critical error: {e_fail}", + exc_info=False, + ) + db.rollback() + # Attempt a generic retry. + try: + raise self.retry(exc=e, countdown=int(self.request.retries * 5) + 5) + except Exception as retry_err: + logger.error( + f"{log_prefix}: Failed to initiate retry after unexpected error: {retry_err}. Ignoring task." + ) + raise Ignore() finally: # --- Cleanup --- # Ensure the database session is always closed. 
@@ -815,5 +1208,9 @@ def process_citing_works_list_task( db.close() logger.debug(f"{log_prefix}: Database session closed.") except Exception as close_err: - logger.error(f"{log_prefix}: Error closing database session: {close_err}") -# --- END OF FILE scholarly_tasks.py --- \ No newline at end of file + logger.error( + f"{log_prefix}: Error closing database session: {close_err}" + ) + + +# --- END OF FILE scholarly_tasks.py --- diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py index 1a29fdb..b63fa9d 100644 --- a/backend/utils/__init__.py +++ b/backend/utils/__init__.py @@ -1 +1 @@ -# Makes 'utils' a Python package \ No newline at end of file +# Makes 'utils' a Python package diff --git a/backend/utils/doi_utils.py b/backend/utils/doi_utils.py index 9c79807..126b575 100644 --- a/backend/utils/doi_utils.py +++ b/backend/utils/doi_utils.py @@ -43,7 +43,7 @@ # is used below to refine the results, as lookaheads can become overly complex # and might still miss edge cases or exclude valid characters at the end of a DOI. """, - re.VERBOSE | re.IGNORECASE + re.VERBOSE | re.IGNORECASE, ) # SIMPLE_DOI_FORMAT_CHECK: A simpler regex for basic format validation. @@ -55,6 +55,7 @@ # --- DOI Utility Functions --- + def extract_dois_from_text(text: str) -> List[str]: """ Extracts potential DOI strings from a given block of text using DOI_REGEX. @@ -88,16 +89,16 @@ def extract_dois_from_text(text: str) -> List[str]: # Parentheses, brackets, and angle brackets are sometimes part of DOIs, # but often they are part of the surrounding text (e.g., citations). # This cleanup favors removing them if they appear at the very end. - chars_to_strip = '.,;)]}>' + chars_to_strip = ".,;)]}>" while cleaned and cleaned[-1] in chars_to_strip: cleaned = cleaned[:-1] # Add the cleaned DOI to the set if it's not empty after stripping. if cleaned: - # Optional enhancement: Validate format using is_valid_doi_format here? 
- # if is_valid_doi_format(cleaned): - # cleaned_dois.add(cleaned) - cleaned_dois.add(cleaned) # Add regardless of strict format for now + # Optional enhancement: Validate format using is_valid_doi_format here? + # if is_valid_doi_format(cleaned): + # cleaned_dois.add(cleaned) + cleaned_dois.add(cleaned) # Add regardless of strict format for now # Return the unique DOIs as a sorted list. return sorted(list(cleaned_dois)) @@ -158,12 +159,12 @@ def is_valid_doi_format(doi: str) -> bool: "10.1000/xyz123", "10.123456789/suffix", "10.1016/j.cell.2020.01.014", - "10.123/abc", # Invalid prefix length - "9.9999/abc", # Invalid start - "doi:10.1101/12345", # Should be False as it checks the string itself + "10.123/abc", # Invalid prefix length + "9.9999/abc", # Invalid start + "doi:10.1101/12345", # Should be False as it checks the string itself "", None, ] for doi_str in test_dois: print(f"'{doi_str}': {is_valid_doi_format(str(doi_str))}") -# --- END OF FILE doi_utils.py --- \ No newline at end of file +# --- END OF FILE doi_utils.py --- diff --git a/backend/utils/github_utils.py b/backend/utils/github_utils.py index 5cf0767..f18c737 100644 --- a/backend/utils/github_utils.py +++ b/backend/utils/github_utils.py @@ -16,6 +16,7 @@ # Setup logger for this module. logger = logging.getLogger(__name__) + def parse_github_url(url: str) -> Optional[Tuple[str, str]]: """ Parses a given URL string to extract GitHub owner and repository names. @@ -51,24 +52,30 @@ def parse_github_url(url: str) -> Optional[Tuple[str, str]]: # Validate the network location (domain). Must be 'github.com'. # Use case-insensitive comparison for robustness. - if parsed_url.netloc.lower() != 'github.com': - logger.warning(f"URL rejected: domain is not github.com ('{parsed_url.netloc}'). URL: {url}") + if parsed_url.netloc.lower() != "github.com": + logger.warning( + f"URL rejected: domain is not github.com ('{parsed_url.netloc}'). URL: {url}" + ) return None # Process the path component of the URL. # 1. 
Remove leading/trailing slashes for consistent processing. - path = parsed_url.path.strip('/') + path = parsed_url.path.strip("/") # 2. Remove the '.git' suffix if present (case-insensitive). - if path.lower().endswith('.git'): - path = path[:-4] # Slice off the last 4 characters ('.git'). + if path.lower().endswith(".git"): + path = path[:-4] # Slice off the last 4 characters ('.git'). # Split the cleaned path into segments using '/' as the delimiter. - parts = path.split('/') + parts = path.split("/") # Expect exactly two non-empty segments: the owner and the repository name. - if len(parts) == 2 and all(parts): # `all(parts)` checks for empty strings (e.g., 'owner//repo'). + if len(parts) == 2 and all( + parts + ): # `all(parts)` checks for empty strings (e.g., 'owner//repo'). owner, repo = parts[0], parts[1] - logger.debug(f"Successfully parsed GitHub URL '{url}' -> owner='{owner}', repo='{repo}'") + logger.debug( + f"Successfully parsed GitHub URL '{url}' -> owner='{owner}', repo='{repo}'" + ) return owner, repo else: # Log a warning if the path structure doesn't match owner/repo. @@ -84,23 +91,24 @@ def parse_github_url(url: str) -> Optional[Tuple[str, str]]: logger.error(f"Unexpected error parsing GitHub URL '{url}': {e}", exc_info=True) return None + # --- Example Usage & Basic Tests --- # This block executes only when the script is run directly. # It serves as a basic verification of the parse_github_url function. 
if __name__ == "__main__": urls_to_test = [ - "https://github.com/pallets/flask", # Standard case - "https://github.com/pallets/flask/", # Trailing slash - "https://github.com/pallets/flask.git", # .git suffix - "http://github.com/pallets/flask", # HTTP scheme - "HTTPS://GITHUB.COM/USER/REPO", # Case variation - "https://github.com/django/django/tree/main", # Invalid structure (too many parts) - "https://gitlab.com/user/repo", # Invalid domain - "https://github.com/just_owner", # Invalid structure (too few parts) - "https://github.com//repo", # Invalid structure (empty owner part) - "invalid-url", # Not a URL - "", # Empty string - None, # None input + "https://github.com/pallets/flask", # Standard case + "https://github.com/pallets/flask/", # Trailing slash + "https://github.com/pallets/flask.git", # .git suffix + "http://github.com/pallets/flask", # HTTP scheme + "HTTPS://GITHUB.COM/USER/REPO", # Case variation + "https://github.com/django/django/tree/main", # Invalid structure (too many parts) + "https://gitlab.com/user/repo", # Invalid domain + "https://github.com/just_owner", # Invalid structure (too few parts) + "https://github.com//repo", # Invalid structure (empty owner part) + "invalid-url", # Not a URL + "", # Empty string + None, # None input ] print("--- Testing GitHub URL Parsing ---") @@ -110,4 +118,4 @@ def parse_github_url(url: str) -> Optional[Tuple[str, str]]: print(f"'{test_url}' -> Owner: {result[0]}, Repo: {result[1]} (Success)") else: print(f"'{test_url}' -> FAILED to parse") -# --- END OF FILE github_utils.py --- \ No newline at end of file +# --- END OF FILE github_utils.py --- diff --git a/backend/utils/recipe_executor.py b/backend/utils/recipe_executor.py index d918461..e7df91b 100644 --- a/backend/utils/recipe_executor.py +++ b/backend/utils/recipe_executor.py @@ -12,6 +12,7 @@ database connection strings, and secrets securely, captures output, manages timeouts, and returns structured results or error information. 
""" + import sys import subprocess import json @@ -34,6 +35,7 @@ # This ensures the subprocess uses the same Python environment. _python_executable = sys.executable + def execute_recipe( recipe_path_relative: str, recipe_params: Dict[str, Any], @@ -41,7 +43,7 @@ def execute_recipe( timeout: int = 60, script_type: str = "analysis", function_name: str = "run_analysis", - secrets: Optional[Dict[str, str]] = None + secrets: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: """ Executes a specified recipe Python script in an isolated subprocess. @@ -79,9 +81,17 @@ def execute_recipe( # Validate that the recipe script file exists. if not absolute_recipe_path.is_file(): - error_msg = f"Recipe script file not found at resolved path: {absolute_recipe_path}" + error_msg = ( + f"Recipe script file not found at resolved path: {absolute_recipe_path}" + ) logger.error(error_msg) - return {"success": False, "error": {"error": "FileNotFoundError", "message": f"Recipe script not found: {recipe_path_relative}"}} + return { + "success": False, + "error": { + "error": "FileNotFoundError", + "message": f"Recipe script not found: {recipe_path_relative}", + }, + } # Serialize the parameters dictionary into a JSON string. try: @@ -90,7 +100,13 @@ def execute_recipe( # Handle potential errors during JSON serialization (e.g., non-serializable types). 
error_msg = f"Could not serialize recipe parameters to JSON: {e}" logger.error(error_msg) - return {"success": False, "error": {"error": "ParameterSerializationError", "message": f"Could not serialize parameters: {e}"}} + return { + "success": False, + "error": { + "error": "ParameterSerializationError", + "message": f"Could not serialize parameters: {e}", + }, + } # --- Construct the Subprocess Command --- # Base command includes the Python interpreter, the runner script path, @@ -98,22 +114,29 @@ def execute_recipe( command = [ _python_executable, str(_run_script_path), - "--module-path", str(absolute_recipe_path), - "--params-json", params_json, - "--db-conn-str", db_conn_str, - "--script-type", script_type, - "--function-name", function_name, + "--module-path", + str(absolute_recipe_path), + "--params-json", + params_json, + "--db-conn-str", + db_conn_str, + "--script-type", + script_type, + "--function-name", + function_name, ] # Append secret arguments securely if provided. # Each key and value is passed as a separate argument pair. - log_secrets_display = [] # Used for constructing a masked version for logging. + log_secrets_display = [] # Used for constructing a masked version for logging. if secrets: for key, value in secrets.items(): # Append actual key and value to the command list. - command.extend([f"--secret-key", key, f"--secret-value", value]) + command.extend(["--secret-key", key, "--secret-value", value]) # Append key and masked value for logging purposes. 
- log_secrets_display.extend([f"--secret-key", key, f"--secret-value", "[SECRET]"]) + log_secrets_display.extend( + ["--secret-key", key, "--secret-value", "[SECRET]"] + ) # --- Log Execution Attempt (Masking Sensitive Data) --- # Create a version of the command for logging where sensitive information @@ -121,14 +144,19 @@ def execute_recipe( log_command_display = [ _python_executable, str(_run_script_path), - "--module-path", str(absolute_recipe_path), - "--params-json", "[PARAMS_JSON]", # Mask serialized parameters. - "--db-conn-str", "[DB_CONN_STR]", # Mask database connection string. - "--script-type", script_type, - "--function-name", function_name, + "--module-path", + str(absolute_recipe_path), + "--params-json", + "[PARAMS_JSON]", # Mask serialized parameters. + "--db-conn-str", + "[DB_CONN_STR]", # Mask database connection string. + "--script-type", + script_type, + "--function-name", + function_name, ] if log_secrets_display: - log_command_display.extend(log_secrets_display) # Append masked secrets. + log_command_display.extend(log_secrets_display) # Append masked secrets. logger.info(f"Executing recipe via subprocess: {' '.join(log_command_display)}") @@ -136,11 +164,11 @@ def execute_recipe( try: result = subprocess.run( command, - capture_output=True, # Capture stdout and stderr streams. - text=True, # Decode stdout/stderr as text (UTF-8 by default). - check=False, # Do not raise CalledProcessError on non-zero exit codes (handle manually). - timeout=timeout, # Set the execution timeout. - encoding='utf-8', # Explicitly specify UTF-8 encoding. + capture_output=True, # Capture stdout and stderr streams. + text=True, # Decode stdout/stderr as text (UTF-8 by default). + check=False, # Do not raise CalledProcessError on non-zero exit codes (handle manually). + timeout=timeout, # Set the execution timeout. + encoding="utf-8", # Explicitly specify UTF-8 encoding. # Security Consideration: Review environment variables passed. 
By default, # the subprocess inherits the parent's environment. Limit if necessary. # env=os.environ.copy() # Example: pass current environment (review security). @@ -156,20 +184,33 @@ def execute_recipe( # Check the return code of the runner script. if result.returncode != 0: - logger.error(f"Recipe runner script exited with non-zero code: {result.returncode} for {recipe_path_relative}") + logger.error( + f"Recipe runner script exited with non-zero code: {result.returncode} for {recipe_path_relative}" + ) # Attempt to parse stdout for a structured JSON error message from the runner script. try: error_json = json.loads(stdout) # Check if it matches the expected failure structure. if isinstance(error_json, dict) and error_json.get("success") is False: - logger.error(f"Recipe execution failed (reported by runner): {error_json.get('error', {})}") - return error_json # Return the detailed error from the runner. + logger.error( + f"Recipe execution failed (reported by runner): {error_json.get('error', {})}" + ) + return error_json # Return the detailed error from the runner. except json.JSONDecodeError: # If stdout is not JSON, log the raw output (truncated). - logger.error(f"Recipe runner stdout was not valid JSON error output: {stdout[:500]}...") + logger.error( + f"Recipe runner stdout was not valid JSON error output: {stdout[:500]}..." + ) # Return a generic execution error if JSON parsing failed or structure was wrong. - return {"success": False, "error": {"error": "ExecutionError", "message": f"Script exited with code {result.returncode}. Stderr: {stderr[:500]}...", "script_path": recipe_path_relative}} + return { + "success": False, + "error": { + "error": "ExecutionError", + "message": f"Script exited with code {result.returncode}. Stderr: {stderr[:500]}...", + "script_path": recipe_path_relative, + }, + } # If return code is 0, proceed assuming success. # Attempt to parse the standard output as the JSON result from the recipe. 
@@ -180,28 +221,66 @@ def execute_recipe( if result_json["success"] is True: # Successful execution reported by the runner. logger.info(f"Recipe execution successful: {recipe_path_relative}") - return result_json # Return the structured result. + return result_json # Return the structured result. else: # Handle edge case: runner exited 0 but reported "success": false. - logger.error(f"Recipe runner ({recipe_path_relative}) exited 0 but reported success=False: {result_json.get('error', 'No error details provided')}") - return result_json # Return the structured error from the runner. + logger.error( + f"Recipe runner ({recipe_path_relative}) exited 0 but reported success=False: {result_json.get('error', 'No error details provided')}" + ) + return result_json # Return the structured error from the runner. else: - # Runner exited 0, but output format doesn't match expected structure. - logger.error(f"Recipe runner ({recipe_path_relative}) exited 0 but output unexpected JSON structure: {stdout[:500]}...") - return {"success": False, "error": {"error": "OutputFormatError", "message": "Script exited successfully but output was not in expected format.", "output": stdout[:500]}} + # Runner exited 0, but output format doesn't match expected structure. + logger.error( + f"Recipe runner ({recipe_path_relative}) exited 0 but output unexpected JSON structure: {stdout[:500]}..." + ) + return { + "success": False, + "error": { + "error": "OutputFormatError", + "message": "Script exited successfully but output was not in expected format.", + "output": stdout[:500], + }, + } except json.JSONDecodeError as e: # Failed to decode the expected JSON result from stdout. - logger.error(f"Failed to decode JSON result from recipe runner stdout ({recipe_path_relative}): {e}. 
Output: {stdout[:500]}...") - return {"success": False, "error": {"error": "OutputDecodeError", "message": f"Could not decode script output as JSON: {e}", "output": stdout[:500]}} + logger.error( + f"Failed to decode JSON result from recipe runner stdout ({recipe_path_relative}): {e}. Output: {stdout[:500]}..." + ) + return { + "success": False, + "error": { + "error": "OutputDecodeError", + "message": f"Could not decode script output as JSON: {e}", + "output": stdout[:500], + }, + } # --- Handle Subprocess Exceptions --- except subprocess.TimeoutExpired: # Subprocess execution exceeded the specified timeout. - logger.error(f"Recipe execution timed out after {timeout}s: {recipe_path_relative}") - return {"success": False, "error": {"error": "TimeoutError", "message": f"Execution timed out after {timeout} seconds."}} + logger.error( + f"Recipe execution timed out after {timeout}s: {recipe_path_relative}" + ) + return { + "success": False, + "error": { + "error": "TimeoutError", + "message": f"Execution timed out after {timeout} seconds.", + }, + } except Exception as e: # Catch any other unexpected errors during subprocess management. 
- logger.exception(f"Unexpected error running recipe subprocess for {recipe_path_relative}") - return {"success": False, "error": {"error": "SubprocessError", "message": f"Unexpected error launching or managing subprocess: {e}"}} -# --- END OF FILE recipe_executor.py --- \ No newline at end of file + logger.exception( + f"Unexpected error running recipe subprocess for {recipe_path_relative}" + ) + return { + "success": False, + "error": { + "error": "SubprocessError", + "message": f"Unexpected error launching or managing subprocess: {e}", + }, + } + + +# --- END OF FILE recipe_executor.py --- diff --git a/backend/utils/recipe_utils.py b/backend/utils/recipe_utils.py index aaed2dc..9476bbf 100644 --- a/backend/utils/recipe_utils.py +++ b/backend/utils/recipe_utils.py @@ -17,10 +17,9 @@ - Manage paths and constants related to recipe locations. """ -import os -import ast # Abstract Syntax Trees module for parsing Python code structure. +import ast # Abstract Syntax Trees module for parsing Python code structure. import logging -import re # Regular expressions for filename and docstring parsing. +import re # Regular expressions for filename and docstring parsing. from pathlib import Path from typing import List, Dict, Any, Optional, Tuple @@ -34,13 +33,16 @@ # Define standard locations for contributed recipe scripts, relative to the project root. # These constants provide centralized access points for recipe discovery functions. CONTRIB_DIR = PROJECT_ROOT_UTIL / "contrib" -CONTRIB_QUERIES_DIR = CONTRIB_DIR / "queries" # Directory for analysis query recipes. -CONTRIB_AFFILIATION_ALGOS_DIR = CONTRIB_DIR / "affiliation_algorithms" # Directory for affiliation algorithm recipes. +CONTRIB_QUERIES_DIR = CONTRIB_DIR / "queries" # Directory for analysis query recipes. +CONTRIB_AFFILIATION_ALGOS_DIR = ( + CONTRIB_DIR / "affiliation_algorithms" +) # Directory for affiliation algorithm recipes. # Add other recipe directories here as needed (e.g., CONTRIB_DISCOVERY_ALGOS_DIR). 
# --- Metadata Structures --- + class RecipeParameterMetadata: """ Represents metadata for a single parameter expected by a recipe function. @@ -48,15 +50,17 @@ class RecipeParameterMetadata: Stores the parameter's name, its type hint (as a string), and a human-readable description, typically extracted from the recipe function's docstring. """ + def __init__(self, name: str, type_hint: str, description: str): - self.name = name # Parameter name. - self.type = type_hint # String representation of the type hint (e.g., 'str', 'int', 'Dict[str, Any]'). + self.name = name # Parameter name. + self.type = type_hint # String representation of the type hint (e.g., 'str', 'int', 'Dict[str, Any]'). self.description = description # Description of the parameter's purpose. def to_dict(self) -> Dict[str, str]: """Serializes the parameter metadata into a dictionary format.""" return {"name": self.name, "type": self.type, "description": self.description} + class RecipeMetadata: """ Represents metadata for a discovered recipe script. @@ -65,13 +69,27 @@ class RecipeMetadata: expected parameters, and its file path relative to the project root. This object encapsulates the information needed to display and execute a recipe. """ - def __init__(self, name: str, version: str, description: str, parameters: List[RecipeParameterMetadata], file_path: str): - self.name = name # Base name of the recipe (extracted from filename). - self.version = version # Version string (e.g., 'v1', 'v1.2', extracted from filename). - self.description = description # Description of the recipe's purpose (from docstring). - self.parameters = parameters # List of required parameters (from docstring). + + def __init__( + self, + name: str, + version: str, + description: str, + parameters: List[RecipeParameterMetadata], + file_path: str, + ): + self.name = name # Base name of the recipe (extracted from filename). + self.version = ( + version # Version string (e.g., 'v1', 'v1.2', extracted from filename). 
+ ) + self.description = ( + description # Description of the recipe's purpose (from docstring). + ) + self.parameters = parameters # List of required parameters (from docstring). # Store the file path using forward slashes for cross-platform consistency. - self.file_path = str(Path(file_path)).replace("\\", "/") # Relative path from project root. + self.file_path = str(Path(file_path)).replace( + "\\", "/" + ) # Relative path from project root. def to_dict(self) -> Dict[str, Any]: """Serializes the recipe metadata into a dictionary, suitable for API responses.""" @@ -79,13 +97,19 @@ def to_dict(self) -> Dict[str, Any]: "name": self.name, "version": self.version, "description": self.description, - "parameters": [p.to_dict() for p in self.parameters], # Serialize parameter list. - "file_path": self.file_path, # Include relative path for backend lookup during execution. + "parameters": [ + p.to_dict() for p in self.parameters + ], # Serialize parameter list. + "file_path": self.file_path, # Include relative path for backend lookup during execution. } + # --- Helper Functions --- -def _parse_docstring(docstring: Optional[str]) -> Tuple[str, List[RecipeParameterMetadata]]: + +def _parse_docstring( + docstring: Optional[str], +) -> Tuple[str, List[RecipeParameterMetadata]]: """ Parses a function docstring adhering to a specific format to extract metadata. @@ -107,7 +131,7 @@ def _parse_docstring(docstring: Optional[str]) -> Tuple[str, List[RecipeParamete if not docstring: return "No description provided.", [] - lines = [line.strip() for line in docstring.strip().split('\n')] + lines = [line.strip() for line in docstring.strip().split("\n")] description = lines[0] if lines else "No description provided." 
parameters: List[RecipeParameterMetadata] = [] param_section_found = False @@ -117,20 +141,30 @@ def _parse_docstring(docstring: Optional[str]) -> Tuple[str, List[RecipeParamete line_lower = line.lower().strip() if line_lower == "params:": param_section_found = True - continue # Move to the next line after finding "Params:" + continue # Move to the next line after finding "Params:" # If inside the params section and the line starts with '-', attempt to parse it. if param_section_found and line.startswith("-"): # Regex to capture name, type hint, and description within parentheses. # Allows for complex type hints (e.g., List[str], Optional[Dict[str, int]]). - match = re.match(r"-\s*(\w+)\s*:\s*([\w\s.\[\],]+)\s*\((.+)\)", line, re.IGNORECASE) + match = re.match( + r"-\s*(\w+)\s*:\s*([\w\s.\[\],]+)\s*\((.+)\)", line, re.IGNORECASE + ) if match: name, type_hint, desc = match.groups() - parameters.append(RecipeParameterMetadata(name.strip(), type_hint.strip(), desc.strip())) + parameters.append( + RecipeParameterMetadata( + name.strip(), type_hint.strip(), desc.strip() + ) + ) else: - # Log a warning if a line in the params section doesn't match the expected format. - logger.warning(f"Could not parse recipe parameter line format: '{line}'") - elif param_section_found and (line_lower.startswith("returns:") or line_lower.startswith("yields:")): + # Log a warning if a line in the params section doesn't match the expected format. + logger.warning( + f"Could not parse recipe parameter line format: '{line}'" + ) + elif param_section_found and ( + line_lower.startswith("returns:") or line_lower.startswith("yields:") + ): # Stop parsing parameters if a 'Returns:' or 'Yields:' section is encountered. 
break elif param_section_found and not line: @@ -142,9 +176,9 @@ def _parse_docstring(docstring: Optional[str]) -> Tuple[str, List[RecipeParamete # --- Core Discovery Function --- + def discover_recipes( - recipes_base_dir: Path, - target_function_name: str = "run_analysis" + recipes_base_dir: Path, target_function_name: str = "run_analysis" ) -> List[RecipeMetadata]: """ Scans a specified directory for Python files matching the recipe naming @@ -169,26 +203,32 @@ def discover_recipes( """ recipes: List[RecipeMetadata] = [] if not recipes_base_dir.is_dir(): - logger.warning(f"Recipe discovery skipped: Directory not found or is not a directory: {recipes_base_dir}") + logger.warning( + f"Recipe discovery skipped: Directory not found or is not a directory: {recipes_base_dir}" + ) return recipes - logger.info(f"Scanning for recipes with target function '{target_function_name}' in: {recipes_base_dir}") + logger.info( + f"Scanning for recipes with target function '{target_function_name}' in: {recipes_base_dir}" + ) # Iterate through Python files in the specified directory matching the version pattern. for file_path in recipes_base_dir.glob("*_v*.py"): if not file_path.is_file(): - continue # Skip directories or other non-file items. + continue # Skip directories or other non-file items. # Use regex to extract the base name and version string from the filename. # Expects format: 'name_v1.py', 'name_v1.0.py', etc. match = re.match(r"(.+)_v(\d+(?:\.\d+)*)\.py", file_path.name) if not match: # Skip files that don't match the naming convention (might be helper modules). 
- logger.debug(f"Skipping file (does not match recipe naming convention '_vX.py'): {file_path.name}") + logger.debug( + f"Skipping file (does not match recipe naming convention '_vX.py'): {file_path.name}" + ) continue - recipe_name, numeric_version_part = match.groups() # e.g., 'my_query', '1.0' - full_version_string = f"v{numeric_version_part}" # Prepend 'v' -> 'v1.0' + recipe_name, numeric_version_part = match.groups() # e.g., 'my_query', '1.0' + full_version_string = f"v{numeric_version_part}" # Prepend 'v' -> 'v1.0' logger.debug(f"Processing potential recipe file: {file_path.name}") try: @@ -201,9 +241,12 @@ def discover_recipes( # Traverse the AST to find the definition of the target function. func_node = None for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef) and node.name == target_function_name: + if ( + isinstance(node, ast.FunctionDef) + and node.name == target_function_name + ): func_node = node - break # Found the target function, no need to search further. + break # Found the target function, no need to search further. if func_node: # Extract the docstring from the found function node. @@ -216,31 +259,45 @@ def discover_recipes( relative_path = file_path.relative_to(PROJECT_ROOT_UTIL) except ValueError: # This occurs if the file path is somehow outside the project root. - logger.error(f"Recipe file {file_path} appears outside the project root {PROJECT_ROOT_UTIL}. Storing absolute path as fallback.") - relative_path = file_path # Use absolute path in this edge case. + logger.error( + f"Recipe file {file_path} appears outside the project root {PROJECT_ROOT_UTIL}. Storing absolute path as fallback." + ) + relative_path = file_path # Use absolute path in this edge case. # Create and append the RecipeMetadata object. 
- recipes.append(RecipeMetadata( - name=recipe_name.replace('_', ' ').title(), # Format name nicely - version=full_version_string, - description=description, - parameters=parameters, - file_path=str(relative_path) # Store relative path as string. - )) - logger.debug(f"Successfully parsed metadata for recipe '{recipe_name}' version '{full_version_string}' (Function: {target_function_name})") + recipes.append( + RecipeMetadata( + name=recipe_name.replace( + "_", " " + ).title(), # Format name nicely + version=full_version_string, + description=description, + parameters=parameters, + file_path=str(relative_path), # Store relative path as string. + ) + ) + logger.debug( + f"Successfully parsed metadata for recipe '{recipe_name}' version '{full_version_string}' (Function: {target_function_name})" + ) else: - # Log if a file matches the naming convention but lacks the target function. - logger.debug(f"No function named '{target_function_name}' found in {file_path.name}, skipping metadata extraction.") + # Log if a file matches the naming convention but lacks the target function. + logger.debug( + f"No function named '{target_function_name}' found in {file_path.name}, skipping metadata extraction." + ) except FileNotFoundError: # Should not happen within the loop but handle defensively. logger.error(f"File not found during recipe processing: {file_path}") except SyntaxError as e: logger.error(f"Syntax error parsing recipe file {file_path}: {e}") - except Exception as e: + except Exception: # Catch any other unexpected errors during file processing or AST parsing. 
logger.exception(f"Unexpected error processing recipe file {file_path}") - logger.info(f"Discovered {len(recipes)} recipes with target function '{target_function_name}' in {recipes_base_dir}") + logger.info( + f"Discovered {len(recipes)} recipes with target function '{target_function_name}' in {recipes_base_dir}" + ) return recipes -# --- END OF FILE recipe_utils.py --- \ No newline at end of file + + +# --- END OF FILE recipe_utils.py --- diff --git a/contrib/affiliation_algorithms/contributor_affiliation_match_v1.py b/contrib/affiliation_algorithms/contributor_affiliation_match_v1.py index e9f8ffb..d5c2db5 100644 --- a/contrib/affiliation_algorithms/contributor_affiliation_match_v1.py +++ b/contrib/affiliation_algorithms/contributor_affiliation_match_v1.py @@ -11,7 +11,7 @@ import sys import logging from pathlib import Path -from typing import List, Dict, Any, Set +from typing import List, Dict, Any # --- Path Setup --- # Determine the project root directory based on the script's location @@ -27,29 +27,20 @@ # Import necessary MOSS models for database interaction, covering repositories, # institutions, authors, works, affiliations, and DOI references. -from backend.data.models import ( - Repository, - Institution, - Affiliation, - Authorship, - Person, - Work, - DOIReference -) +from backend.data.models import Affiliation, Authorship, DOIReference # --- Logging Setup --- # Configure basic logging to provide visibility into the script's execution. logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [contributor_affil_match_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) def calculate_affiliations( - institution_id: int, - db_conn_str: str + institution_id: int, db_conn_str: str ) -> List[Dict[str, Any]]: """ Identifies potential repository-institution affiliations. 
@@ -85,7 +76,9 @@ def calculate_affiliations( } Returns an empty list if no affiliations are found or if an error occurs. """ - logger.info(f"Starting contributor_affiliation_match_v1 for Institution ID {institution_id}") + logger.info( + f"Starting contributor_affiliation_match_v1 for Institution ID {institution_id}" + ) engine = None db: Session | None = None @@ -104,57 +97,71 @@ def calculate_affiliations( # Step 1: Find all unique person IDs linked to the target institution # via the Affiliation table. - person_ids_stmt = ( - select(distinct(Affiliation.authorship_person_id)) - .where(Affiliation.institution_id == institution_id) + person_ids_stmt = select(distinct(Affiliation.authorship_person_id)).where( + Affiliation.institution_id == institution_id ) affiliated_person_ids = db.execute(person_ids_stmt).scalars().all() if not affiliated_person_ids: # If no affiliated persons found, no further links can be made. - logger.info(f"No persons found affiliated with Institution ID {institution_id}.") + logger.info( + f"No persons found affiliated with Institution ID {institution_id}." + ) return [] - logger.info(f"Found {len(affiliated_person_ids)} persons affiliated with Inst ID {institution_id}.") + logger.info( + f"Found {len(affiliated_person_ids)} persons affiliated with Inst ID {institution_id}." + ) # Step 2: Find all unique work IDs associated with these affiliated persons # via the Authorship table. - work_ids_stmt = ( - select(distinct(Authorship.work_id)) - .where(Authorship.person_id.in_(affiliated_person_ids)) + work_ids_stmt = select(distinct(Authorship.work_id)).where( + Authorship.person_id.in_(affiliated_person_ids) ) authored_work_ids = db.execute(work_ids_stmt).scalars().all() if not authored_work_ids: # If these authors have no associated works in the DB, stop. 
- logger.info(f"No works found authored by affiliated persons.") + logger.info("No works found authored by affiliated persons.") return [] - logger.info(f"Found {len(authored_work_ids)} works authored by affiliated persons.") + logger.info( + f"Found {len(authored_work_ids)} works authored by affiliated persons." + ) # Step 3: Find repository links (via DOIReference) to these authored works. # Select distinct repository IDs, along with the linking work ID and DOI for evidence. repo_link_stmt = ( - select(distinct(DOIReference.repository_id), DOIReference.work_id, DOIReference.doi) - .where(DOIReference.work_id.in_(authored_work_ids)) # Link to the works found in Step 2 - .where(DOIReference.repository_id.isnot(None)) # Ensure the reference links to a known repository + select( + distinct(DOIReference.repository_id), + DOIReference.work_id, + DOIReference.doi, + ) + .where( + DOIReference.work_id.in_(authored_work_ids) + ) # Link to the works found in Step 2 + .where( + DOIReference.repository_id.isnot(None) + ) # Ensure the reference links to a known repository ) # Fetch results as dictionary-like rows for easy access by column name. repo_links = db.execute(repo_link_stmt).mappings().all() - logger.info(f"Found {len(repo_links)} DOI references linking affiliated works to repositories.") + logger.info( + f"Found {len(repo_links)} DOI references linking affiliated works to repositories." + ) # Step 4: Aggregate the findings by repository ID. for link in repo_links: - repo_id = link['repository_id'] - work_id = link['work_id'] - doi = link['doi'] + repo_id = link["repository_id"] + work_id = link["work_id"] + doi = link["doi"] # Structure the evidence for this specific link (work/DOI). 
evidence_item = { - "type": "affiliated_author_work", # Type of evidence detail + "type": "affiliated_author_work", # Type of evidence detail "work_id": work_id, - "doi": doi + "doi": doi, # Note: Adding person_id here would require another join or lookup, # omitted for simplicity in this version. } @@ -162,8 +169,8 @@ def calculate_affiliations( if repo_id not in results_map: # First time encountering this repository, initialize its entry. results_map[repo_id] = { - "score": CONFIDENCE_SCORE, # Assign the predefined score - "evidence_list": [evidence_item] # Start the list of evidence + "score": CONFIDENCE_SCORE, # Assign the predefined score + "evidence_list": [evidence_item], # Start the list of evidence } else: # Repository already seen, just add the new piece of evidence. @@ -172,15 +179,22 @@ def calculate_affiliations( # Limit the number of evidence examples stored per repository for brevity. max_evidence = 5 if len(results_map[repo_id]["evidence_list"]) > max_evidence: - # Keep the first few examples and add a truncation indicator. - results_map[repo_id]["evidence_list"] = results_map[repo_id]["evidence_list"][:max_evidence] + \ - [{"type": "truncated", "count": len(results_map[repo_id]["evidence_list"])}] - + # Keep the first few examples and add a truncation indicator. + results_map[repo_id]["evidence_list"] = results_map[repo_id][ + "evidence_list" + ][:max_evidence] + [ + { + "type": "truncated", + "count": len(results_map[repo_id]["evidence_list"]), + } + ] except Exception as e: # Catch any unexpected errors during execution. - logger.exception(f"Error during contributor_affiliation_match_v1 execution: {e}") - return [] # Return empty list on error + logger.exception( + f"Error during contributor_affiliation_match_v1 execution: {e}" + ) + return [] # Return empty list on error finally: # Ensure database resources are released. 
if db: @@ -193,18 +207,23 @@ def calculate_affiliations( # Step 5: Format the aggregated results from the map into the final list structure. final_results = [] for repo_id, data in results_map.items(): - final_results.append({ - "repository_id": repo_id, - "confidence_score": data["score"], - "evidence": { # Structure the evidence clearly - "signal_type": "affiliated_author_work_reference", # Overall type of signal - "details": data["evidence_list"] # List of specific work/DOI links - } - }) + final_results.append( + { + "repository_id": repo_id, + "confidence_score": data["score"], + "evidence": { # Structure the evidence clearly + "signal_type": "affiliated_author_work_reference", # Overall type of signal + "details": data["evidence_list"], # List of specific work/DOI links + }, + } + ) - logger.info(f"Contributor_affiliation_match_v1 finished. Found {len(final_results)} potential repository affiliations for Inst {institution_id}.") + logger.info( + f"Contributor_affiliation_match_v1 finished. Found {len(final_results)} potential repository affiliations for Inst {institution_id}." 
+ ) return final_results + # --- Example Test Call Block --- # This block is typically commented out but can be used for direct script # execution during development or testing, provided the necessary environment @@ -228,4 +247,4 @@ def calculate_affiliations( # # Pretty-print the JSON output for readability # import json # print(json.dumps(affiliations, indent=2)) -# --- End Example Test Call Block --- \ No newline at end of file +# --- End Example Test Call Block --- diff --git a/contrib/affiliation_algorithms/keyword_match_v1.py b/contrib/affiliation_algorithms/keyword_match_v1.py index 88d5bc8..f6ed297 100644 --- a/contrib/affiliation_algorithms/keyword_match_v1.py +++ b/contrib/affiliation_algorithms/keyword_match_v1.py @@ -9,9 +9,7 @@ """ import sys -import os import logging -import re # Required for potential future regex use, though not used currently from pathlib import Path from typing import List, Dict, Any, Set @@ -26,7 +24,10 @@ # Import necessary SQLAlchemy components for database interaction. from sqlalchemy import create_engine, or_, select, text -from sqlalchemy.orm import sessionmaker, Session # `joinedload` was removed as it wasn't used. +from sqlalchemy.orm import ( + sessionmaker, + Session, +) # `joinedload` was removed as it wasn't used. # Import required MOSS data models. from backend.data.models import Repository, Owner @@ -37,15 +38,15 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [keyword_match_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) def calculate_affiliations( - institution_id: int, # Included for consistency with the algorithm signature pattern. + institution_id: int, # Included for consistency with the algorithm signature pattern. 
keywords: List[str], - db_conn_str: str + db_conn_str: str, ) -> List[Dict[str, Any]]: """ Calculates repository-institution affiliations by matching keywords in DB metadata. @@ -85,7 +86,9 @@ def calculate_affiliations( Returns an empty list if no keywords are provided, no matches are found, or an error occurs during processing. """ - logger.info(f"Starting keyword_match_v1 for Institution ID {institution_id} with keywords: {keywords}") + logger.info( + f"Starting keyword_match_v1 for Institution ID {institution_id} with keywords: {keywords}" + ) if not keywords: logger.warning("No keywords provided, returning empty list.") return [] @@ -105,11 +108,13 @@ def calculate_affiliations( # Prepare filter conditions for the database query. filter_conditions = [] - lower_keywords = [kw.lower() for kw in keywords] # Use lowercase for case-insensitive matching + lower_keywords = [ + kw.lower() for kw in keywords + ] # Use lowercase for case-insensitive matching # Create ILIKE conditions for text fields (description, owner login). for kw in lower_keywords: - like_pattern = f"%{kw}%" # Pattern for substring matching + like_pattern = f"%{kw}%" # Pattern for substring matching filter_conditions.append(Repository.description.ilike(like_pattern)) filter_conditions.append(Owner.login.ilike(like_pattern)) @@ -118,22 +123,30 @@ def calculate_affiliations( # Note: This requires PostgreSQL and appropriate parameter binding. try: # Use `text()` to pass the array parameter securely. - topics_filter = Repository.topics.op('?|')(text('ARRAY[:keywords]')) - topics_filter = topics_filter.params(keywords=lower_keywords) # Bind the keyword list + topics_filter = Repository.topics.op("?|")(text("ARRAY[:keywords]")) + topics_filter = topics_filter.params( + keywords=lower_keywords + ) # Bind the keyword list filter_conditions.append(topics_filter) except Exception as jsonb_err: - # Log an error if the JSONB filter setup fails (e.g., unsupported DB, syntax error). 
- # The query will proceed without the topics filter in this case. - logger.error(f"Could not apply JSONB topics filter: {jsonb_err}. Proceeding without topic matching.") + # Log an error if the JSONB filter setup fails (e.g., unsupported DB, syntax error). + # The query will proceed without the topics filter in this case. + logger.error( + f"Could not apply JSONB topics filter: {jsonb_err}. Proceeding without topic matching." + ) # Construct the final SQLAlchemy query. # Select necessary fields from Repository and its associated Owner. # Join Repository to Owner to access the owner's login name. # Apply the combined filter conditions using OR logic. stmt = ( - select(Repository.id, Repository.description, Repository.topics, Owner.login) - .join(Repository.owner) # Perform the join to Owner table - .where(or_(*filter_conditions)) # Apply all filter conditions combined with OR + select( + Repository.id, Repository.description, Repository.topics, Owner.login + ) + .join(Repository.owner) # Perform the join to Owner table + .where( + or_(*filter_conditions) + ) # Apply all filter conditions combined with OR ) logger.info("Executing database query for keyword matches...") @@ -143,45 +156,53 @@ def calculate_affiliations( # Process the query results to assign confidence scores and format output. for row in query_results: - repo_id = row['id'] + repo_id = row["id"] # Avoid processing the same repository multiple times if it matched on different fields/keywords. if repo_id in processed_repo_ids: continue - description = row['description'] or "" # Handle potential None values + description = row["description"] or "" # Handle potential None values # Topics can be None if the column is nullable or not populated. 
- topics = row['topics'] if row['topics'] is not None else [] - owner_login = row['login'] or "" # Handle potential None values + topics = row["topics"] if row["topics"] is not None else [] + owner_login = row["login"] or "" # Handle potential None values - best_score = 0.0 # Track the highest confidence score for this repo - match_type = "none" # Track the type of match yielding the best score - matched_keyword = None # The specific keyword that resulted in the best match - matched_value = None # The value where the best match occurred (for evidence) + best_score = 0.0 # Track the highest confidence score for this repo + match_type = "none" # Track the type of match yielding the best score + matched_keyword = ( + None # The specific keyword that resulted in the best match + ) + matched_value = ( + None # The value where the best match occurred (for evidence) + ) # Check for matches in fields, prioritizing owner login (highest confidence). for kw in lower_keywords: if kw in owner_login.lower(): - if best_score < 0.9: # Assign owner login match score + if best_score < 0.9: # Assign owner login match score best_score = 0.9 match_type = "owner_login" matched_keyword = kw - matched_value = owner_login # Store the login name as evidence + matched_value = owner_login # Store the login name as evidence # Break inner loop once a match is found in this field for this repo. # We only need one keyword match per field type for scoring. break # Check description if no owner match was found (or if owner score is lower, though unlikely here). if best_score < 0.9: - for kw in lower_keywords: - if kw in description.lower(): - if best_score < 0.6: # Assign description match score - best_score = 0.6 - match_type = "description" - matched_keyword = kw - # Provide a preview of the description as evidence. - matched_value = description[:100] + "..." 
if len(description)>100 else description - break # Break inner loop + for kw in lower_keywords: + if kw in description.lower(): + if best_score < 0.6: # Assign description match score + best_score = 0.6 + match_type = "description" + matched_keyword = kw + # Provide a preview of the description as evidence. + matched_value = ( + description[:100] + "..." + if len(description) > 100 + else description + ) + break # Break inner loop # Check topics if no better match was found yet. if best_score < 0.6: @@ -191,37 +212,41 @@ def calculate_affiliations( lower_topics = [str(t).lower() for t in topics] for kw in lower_keywords: if kw in lower_topics: - if best_score < 0.4: # Assign topic match score (lowest confidence) + if ( + best_score < 0.4 + ): # Assign topic match score (lowest confidence) best_score = 0.4 match_type = "topic" matched_keyword = kw - matched_value = topics # Store the original list of topics as evidence - break # Break inner loop + matched_value = topics # Store the original list of topics as evidence + break # Break inner loop else: # Log a warning if topics data is not in the expected list format. - logger.warning(f"Topics data for repo {repo_id} is not a list: {topics}") - + logger.warning( + f"Topics data for repo {repo_id} is not a list: {topics}" + ) # If any keyword match was found (score > 0), add it to the results. if best_score > 0.0: evidence = { "match_type": match_type, "matched_keyword": matched_keyword, - "matched_value_preview": matched_value # Context where match occurred + "matched_value_preview": matched_value, # Context where match occurred } - results.append({ - "repository_id": repo_id, - "confidence_score": best_score, - "evidence": evidence - }) + results.append( + { + "repository_id": repo_id, + "confidence_score": best_score, + "evidence": evidence, + } + ) # Mark this repository as processed. processed_repo_ids.add(repo_id) - except Exception as e: # Catch and log any unexpected errors during database query or processing. 
logger.exception(f"Error during keyword_match_v1 execution: {e}") - return [] # Return empty list on error + return [] # Return empty list on error finally: # Ensure database resources are released. if db: @@ -231,9 +256,12 @@ def calculate_affiliations( engine.dispose() logger.info("Database engine disposed.") - logger.info(f"Keyword_match_v1 finished. Found {len(results)} affiliations for Inst {institution_id}.") + logger.info( + f"Keyword_match_v1 finished. Found {len(results)} affiliations for Inst {institution_id}." + ) return results + # --- Example Test Call Block --- # Intended for development/testing. Requires setting DATABASE_URL environment variable # and having relevant data in the database. @@ -251,4 +279,4 @@ def calculate_affiliations( # print("\nResults:") # import json # print(json.dumps(affiliations, indent=2)) # Pretty print the results -# --- End Example Test Call Block --- \ No newline at end of file +# --- End Example Test Call Block --- diff --git a/contrib/affiliation_algorithms/readme_mention_v1.py b/contrib/affiliation_algorithms/readme_mention_v1.py index 54a5492..cca3e9f 100644 --- a/contrib/affiliation_algorithms/readme_mention_v1.py +++ b/contrib/affiliation_algorithms/readme_mention_v1.py @@ -9,11 +9,10 @@ """ import sys -import os import logging -import re # Import regular expression module for keyword matching +import re # Import regular expression module for keyword matching from pathlib import Path -from typing import List, Dict, Any, Set +from typing import List, Dict, Any # --- Path Setup --- # Determine the project root directory relative to this script's location @@ -26,9 +25,11 @@ from sqlalchemy import create_engine, select from sqlalchemy.orm import sessionmaker, Session + # Import necessary MOSS models and the GitHub client. from backend.data.models import Repository from backend.external import GitHubClient, ApiClientError + # Import settings to check for token availability for logging purposes. 
from backend.config import settings @@ -38,7 +39,7 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [readme_mention_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) @@ -46,14 +47,19 @@ # Log a warning if the GitHub API token doesn't seem to be configured in the # application settings, as this will likely lead to rate limiting or failures. if not settings.GITHUB_API_TOKEN: - logger.warning("GITHUB_API_TOKEN environment variable not found by settings module.") - logger.warning("GitHub API calls in readme_mention_v1 may fail or be severely rate-limited due to missing authentication.") + logger.warning( + "GITHUB_API_TOKEN environment variable not found by settings module." + ) + logger.warning( + "GitHub API calls in readme_mention_v1 may fail or be severely rate-limited due to missing authentication." + ) # --- End Token Check --- + def calculate_affiliations( - institution_id: int, # Included for context and consistency with algorithm signature. + institution_id: int, # Included for context and consistency with algorithm signature. keywords: List[str], - db_conn_str: str + db_conn_str: str, ) -> List[Dict[str, Any]]: """ Identifies potential repository-institution affiliations based on keyword mentions in READMEs. @@ -93,7 +99,9 @@ def calculate_affiliations( or a critical error occurs. May return a list containing error details if initialization fails. """ - logger.info(f"Starting readme_mention_v1 for Institution ID {institution_id} with keywords: {keywords}") + logger.info( + f"Starting readme_mention_v1 for Institution ID {institution_id} with keywords: {keywords}" + ) if not keywords: logger.warning("No keywords provided for README search, returning empty list.") return [] @@ -113,20 +121,26 @@ def calculate_affiliations( # Compile a single regex pattern to find any of the keywords as whole words (\b). 
# re.escape handles special characters in keywords. re.IGNORECASE makes it case-insensitive. try: - keyword_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, lower_keywords)) + r')\b', re.IGNORECASE) + keyword_pattern = re.compile( + r"\b(" + "|".join(map(re.escape, lower_keywords)) + r")\b", re.IGNORECASE + ) except re.error as regex_err: - logger.error(f"Failed to compile keyword regex: {regex_err}. Keywords: {keywords}") + logger.error( + f"Failed to compile keyword regex: {regex_err}. Keywords: {keywords}" + ) return [{"error": "RegexCompilationError", "message": str(regex_err)}] try: # Instantiate GitHub Client. This relies on the environment or settings for authentication. try: - github_client = GitHubClient() + github_client = GitHubClient() except ValueError as e: - # Handle failure to initialize client (e.g., missing token in settings). - logger.error(f"Failed to initialize GitHubClient, likely missing token: {e}") - # Return an error structure indicating the failure. - return [{"error": "GitHub Client Initialization Failed", "message": str(e)}] + # Handle failure to initialize client (e.g., missing token in settings). + logger.error( + f"Failed to initialize GitHubClient, likely missing token: {e}" + ) + # Return an error structure indicating the failure. + return [{"error": "GitHub Client Initialization Failed", "message": str(e)}] # Establish database connection. engine = create_engine(db_conn_str) @@ -137,41 +151,58 @@ def calculate_affiliations( # Performance consideration: Fetching all repositories might be slow for large datasets. # Future optimization could involve filtering repositories based on certain criteria. 
repo_stmt = select(Repository.id, Repository.full_name) - all_repos = db.execute(repo_stmt).mappings().all() # Fetch as dictionary-like mappings + all_repos = ( + db.execute(repo_stmt).mappings().all() + ) # Fetch as dictionary-like mappings total_repos = len(all_repos) - logger.info(f"Found {total_repos} repositories in the database to check for README mentions.") + logger.info( + f"Found {total_repos} repositories in the database to check for README mentions." + ) # Counters for tracking progress and issues during processing. processed_count = 0 found_count = 0 api_error_count = 0 # List of common README filenames to check for each repository. - readme_files_to_check = ["README.md", "README", "README.rst", "README.txt"] # Added .txt + readme_files_to_check = [ + "README.md", + "README", + "README.rst", + "README.txt", + ] # Added .txt # Step 2: Iterate through each repository and check its README. for repo_data in all_repos: processed_count += 1 - repo_id = repo_data['id'] - full_name = repo_data['full_name'] + repo_id = repo_data["id"] + full_name = repo_data["full_name"] # Basic validation of the repository's full name format. - if not full_name or '/' not in full_name: - logger.warning(f"Skipping repo ID {repo_id} due to invalid full_name format: '{full_name}'") - continue + if not full_name or "/" not in full_name: + logger.warning( + f"Skipping repo ID {repo_id} due to invalid full_name format: '{full_name}'" + ) + continue # Log progress periodically. if processed_count % 100 == 0: - logger.info(f"Processed {processed_count}/{total_repos} repositories...") + logger.info( + f"Processed {processed_count}/{total_repos} repositories..." + ) # Extract owner and repo name from the full name. 
try: - owner, repo_name_only = full_name.split('/', 1) + owner, repo_name_only = full_name.split("/", 1) except ValueError: - logger.warning(f"Skipping repo ID {repo_id} due to unexpected full_name format: '{full_name}'") - continue + logger.warning( + f"Skipping repo ID {repo_id} due to unexpected full_name format: '{full_name}'" + ) + continue - readme_content: str | None = None # To store fetched README content - fetched_readme_path: str | None = None # To store the path of the found README + readme_content: str | None = None # To store fetched README content + fetched_readme_path: str | None = ( + None # To store the path of the found README + ) # Attempt to fetch content from common README file locations. for readme_path in readme_files_to_check: @@ -179,13 +210,17 @@ def calculate_affiliations( # Use the GitHub client to get file content. logger.debug(f"Attempting to fetch {readme_path} for {full_name}") # get_file_content should return the decoded content or None/raise error. - content_maybe = github_client.get_file_content(owner, repo_name_only, readme_path) + content_maybe = github_client.get_file_content( + owner, repo_name_only, readme_path + ) if content_maybe: readme_content = content_maybe fetched_readme_path = readme_path - logger.debug(f"Successfully fetched content from {readme_path} for {full_name}") - break # Found a README, no need to check other variants for this repo. + logger.debug( + f"Successfully fetched content from {readme_path} for {full_name}" + ) + break # Found a README, no need to check other variants for this repo. except ApiClientError as e: # Handle specific API errors gracefully. @@ -193,18 +228,25 @@ def calculate_affiliations( # Common case: the specific README file variant doesn't exist. logger.debug(f"{readme_path} not found for {full_name} (404).") elif e.status_code == 403: - # Potential rate limit or permission issue. Log a warning. - logger.warning(f"Access denied (403) fetching {readme_path} for {full_name}. 
Rate limit or permissions issue?") - api_error_count += 1 - # Consider breaking the inner loop (variants) or outer loop (repos) on repeated 403s. + # Potential rate limit or permission issue. Log a warning. + logger.warning( + f"Access denied (403) fetching {readme_path} for {full_name}. Rate limit or permissions issue?" + ) + api_error_count += 1 + # Consider breaking the inner loop (variants) or outer loop (repos) on repeated 403s. else: # Log other unexpected API errors. - logger.error(f"API Error {e.status_code} fetching {readme_path} for {full_name}: {e}") + logger.error( + f"API Error {e.status_code} fetching {readme_path} for {full_name}: {e}" + ) api_error_count += 1 except Exception as e: # Catch any other unexpected errors during file fetching. # Log minimally to avoid flooding logs, but indicate the error. - logger.error(f"Unexpected error fetching {readme_path} for {full_name}: {type(e).__name__}", exc_info=False) + logger.error( + f"Unexpected error fetching {readme_path} for {full_name}: {type(e).__name__}", + exc_info=False, + ) api_error_count += 1 # Stop checking variants for this repo if an unexpected error occurs. break @@ -217,24 +259,32 @@ def calculate_affiliations( if found_matches: # Extract unique matched keywords (case-insensitive) for the evidence record. unique_matches = {match.lower() for match in found_matches} - logger.info(f"Found keyword match(es): {list(unique_matches)} in '{fetched_readme_path}' for repo {repo_id} ({full_name})") + logger.info( + f"Found keyword match(es): {list(unique_matches)} in '{fetched_readme_path}' for repo {repo_id} ({full_name})" + ) # Structure the evidence for this affiliation finding. 
evidence = { "signal_type": "readme_mention", - "matched_keywords": sorted(list(unique_matches)), # Store unique matches alphabetically - "readme_file": fetched_readme_path + "matched_keywords": sorted( + list(unique_matches) + ), # Store unique matches alphabetically + "readme_file": fetched_readme_path, } # Append the affiliation result to the list. - results_list.append({ - "repository_id": repo_id, - "confidence_score": CONFIDENCE_SCORE, - "evidence": evidence - }) + results_list.append( + { + "repository_id": repo_id, + "confidence_score": CONFIDENCE_SCORE, + "evidence": evidence, + } + ) found_count += 1 except Exception as parse_err: - logger.error(f"Error processing README content for repo {repo_id} ({full_name}): {parse_err}", exc_info=False) - + logger.error( + f"Error processing README content for repo {repo_id} ({full_name}): {parse_err}", + exc_info=False, + ) except Exception as e: # Catch critical errors during the overall process (e.g., database connection failure). @@ -251,9 +301,12 @@ def calculate_affiliations( logger.info("Database engine disposed.") # Note: GitHubClient session cleanup might be handled within the client itself upon garbage collection. - logger.info(f"Readme_mention_v1 finished for Inst {institution_id}. Found {found_count} affiliations. API errors encountered: {api_error_count}.") + logger.info( + f"Readme_mention_v1 finished for Inst {institution_id}. Found {found_count} affiliations. API errors encountered: {api_error_count}." + ) return results_list + # --- Example Test Call Block --- # For development/testing. Requires DATABASE_URL and GITHUB_API_TOKEN environment # variables and relevant data in the database. 
@@ -273,4 +326,4 @@ def calculate_affiliations( # print("\nResults:") # import json # print(json.dumps(affiliations, indent=2)) -# --- End Example Test Call Block --- \ No newline at end of file +# --- End Example Test Call Block --- diff --git a/contrib/discovery_algorithms/keyword_discovery_v1.py b/contrib/discovery_algorithms/keyword_discovery_v1.py index cc5a3fa..a5a6021 100644 --- a/contrib/discovery_algorithms/keyword_discovery_v1.py +++ b/contrib/discovery_algorithms/keyword_discovery_v1.py @@ -8,10 +8,9 @@ """ import sys -import os import logging from pathlib import Path -from typing import List, Dict, Any, Optional +from typing import List, Optional # --- Path Setup --- # Determine the project root directory relative to this script's location @@ -31,7 +30,7 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [discovery_kw_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) @@ -39,9 +38,9 @@ def find_candidate_repos( keywords: List[str], max_results: int = 100, - github_api_token: Optional[str] = None, # Allow passing a specific token + github_api_token: Optional[str] = None, # Allow passing a specific token # db_conn_str is part of the standard runner signature but not used here. - db_conn_str: Optional[str] = None + db_conn_str: Optional[str] = None, ) -> List[str]: """ Searches GitHub for repositories matching a given set of keywords. @@ -69,14 +68,18 @@ def find_candidate_repos( # Construct the search query by joining keywords. query = " ".join(keywords) - logger.info(f"Starting GitHub discovery search with query: '{query}', max_results: {max_results}") + logger.info( + f"Starting GitHub discovery search with query: '{query}', max_results: {max_results}" + ) # Instantiate the GitHub API client. # This might raise ValueError if base configuration (e.g., settings) is invalid. 
try: github_client = GitHubClient() except ValueError as e: - logger.error(f"Failed to initialize GitHubClient: {e}. Check base configuration or token availability.") + logger.error( + f"Failed to initialize GitHubClient: {e}. Check base configuration or token availability." + ) # Cannot proceed without a client instance. return [] @@ -90,12 +93,13 @@ def find_candidate_repos( else: # If no specific token is provided, log a warning about rate limits. # Ensure anonymous request by removing any default Authorization header. - logger.warning("No GitHub API token provided to discovery algorithm. Search will be anonymous and heavily rate-limited.") + logger.warning( + "No GitHub API token provided to discovery algorithm. Search will be anonymous and heavily rate-limited." + ) if "Authorization" in request_headers: del request_headers["Authorization"] # --- End Header Prep --- - repo_urls: List[str] = [] try: # Perform the repository search via the GitHub client. @@ -110,7 +114,9 @@ def find_candidate_repos( # Temporarily update session headers with the prepared ones for this call. github_client.session.headers.update(request_headers) # Execute the search using the modified session headers. - search_result_tuple = github_client.search_repositories(query=query, max_results=max_results) + search_result_tuple = github_client.search_repositories( + query=query, max_results=max_results + ) # Restore the original headers to avoid affecting subsequent uses of the client instance. github_client.session.headers = original_headers # --- END TEMPORARY WORKAROUND --- @@ -123,17 +129,21 @@ def find_candidate_repos( url = item.get("html_url") if url: repo_urls.append(url) - logger.info(f"Discovery search completed. Found {len(repo_urls)} candidate repository URLs.") + logger.info( + f"Discovery search completed. Found {len(repo_urls)} candidate repository URLs." + ) else: # Handle cases where the API call succeeded but returned no items. 
- logger.warning("Repository search returned no results or failed to retrieve items.") + logger.warning( + "Repository search returned no results or failed to retrieve items." + ) except ApiClientError as e: # Handle specific errors raised by the GitHub client (e.g., rate limits, auth errors). logger.error(f"API client error during GitHub discovery search: {e}") # Return empty list on client errors to indicate failure. return [] - except Exception as e: + except Exception: # Catch any other unexpected exceptions during the process. logger.exception("Unexpected error during GitHub discovery search execution.") # Return empty list on unexpected errors. @@ -141,6 +151,7 @@ def find_candidate_repos( return repo_urls + # --- Example Test Call Block --- # This section is intended for development or testing purposes. # It demonstrates how to call the function directly, typically requiring @@ -167,4 +178,4 @@ def find_candidate_repos( # print(f"- {url}") # else: # print("None found or an error occurred during the search.") -# --- End Example Test Call Block --- \ No newline at end of file +# --- End Example Test Call Block --- diff --git a/contrib/queries/citation_community_detection_v1.py b/contrib/queries/citation_community_detection_v1.py index 926941c..6933aef 100644 --- a/contrib/queries/citation_community_detection_v1.py +++ b/contrib/queries/citation_community_detection_v1.py @@ -1,7 +1,6 @@ # --- NEW FILE: contrib/queries/citation_community_detection_v1.py --- import sys -import os import logging from pathlib import Path from typing import List, Dict, Any, Set, Tuple, Optional @@ -16,14 +15,16 @@ # --- Dependencies --- try: import networkx as nx - from community import community_louvain # Use python-louvain library + from community import community_louvain # Use python-louvain library except ImportError as e: - print(f"Error importing dependencies: {e}. Please install networkx and python-louvain.") + print( + f"Error importing dependencies: {e}. 
Please install networkx and python-louvain." + ) print("pip install networkx python-louvain") sys.exit(1) # --- End Dependencies --- -from sqlalchemy import create_engine, select, union_all +from sqlalchemy import create_engine, select from sqlalchemy.orm import sessionmaker, Session # Import required MOSS models @@ -33,12 +34,14 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [citation_community_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) -def fetch_citation_network(db: Session, seed_work_id: int, depth: int) -> Tuple[Set[int], Set[Tuple[int, int]]]: +def fetch_citation_network( + db: Session, seed_work_id: int, depth: int +) -> Tuple[Set[int], Set[Tuple[int, int]]]: """ Fetches work IDs (nodes) and citation links (edges) within a specified depth from a seed work using breadth-first search. @@ -49,31 +52,31 @@ def fetch_citation_network(db: Session, seed_work_id: int, depth: int) -> Tuple[ nodes: Set[int] = {seed_work_id} edges: Set[Tuple[int, int]] = set() current_frontier: Set[int] = {seed_work_id} - visited_nodes: Set[int] = {seed_work_id} # Include seed node initially + visited_nodes: Set[int] = {seed_work_id} # Include seed node initially for current_depth in range(depth): if not current_frontier: - break # No more nodes to expand + break # No more nodes to expand next_frontier: Set[int] = set() # Find works directly citing or cited by the current frontier nodes # Fetch both directions in one go for undirected graph - citing_stmt = ( - select(WorkCitation.citing_work_id, WorkCitation.cited_work_id) - .where(WorkCitation.cited_work_id.in_(current_frontier)) - ) - cited_stmt = ( - select(WorkCitation.citing_work_id, WorkCitation.cited_work_id) - .where(WorkCitation.citing_work_id.in_(current_frontier)) - ) + citing_stmt = select( + WorkCitation.citing_work_id, WorkCitation.cited_work_id + 
).where(WorkCitation.cited_work_id.in_(current_frontier)) + cited_stmt = select( + WorkCitation.citing_work_id, WorkCitation.cited_work_id + ).where(WorkCitation.citing_work_id.in_(current_frontier)) # Combine results - use session.execute for simpler iteration combined_results = db.execute(citing_stmt).all() + db.execute(cited_stmt).all() for citer, cited in combined_results: # Add edge (always store as tuple for undirected graph) - edge = tuple(sorted((citer, cited))) # Ensure consistent edge representation + edge = tuple( + sorted((citer, cited)) + ) # Ensure consistent edge representation edges.add(edge) # Add newly discovered nodes to nodes set and next frontier if not visited @@ -82,19 +85,15 @@ def fetch_citation_network(db: Session, seed_work_id: int, depth: int) -> Tuple[ if node not in visited_nodes: nodes.add(node) next_frontier.add(node) - visited_nodes.add(node) # Mark as visited here + visited_nodes.add(node) # Mark as visited here - current_frontier = next_frontier # Move to the next level + current_frontier = next_frontier # Move to the next level logger.info(f"Fetched network: {len(nodes)} nodes, {len(edges)} edges.") return nodes, edges -def run_analysis( - db_conn_str: str, - seed_work_id: int, - depth: int = 1 -) -> Dict[str, Any]: +def run_analysis(db_conn_str: str, seed_work_id: int, depth: int = 1) -> Dict[str, Any]: """ Performs community detection on the citation graph starting from a seed work. @@ -111,10 +110,15 @@ def run_analysis( and 'modularity' score. If error, data contains error details. 
""" - logger.info(f"Starting citation_community_detection_v1 analysis for seed_work_id={seed_work_id}, depth={depth}") + logger.info( + f"Starting citation_community_detection_v1 analysis for seed_work_id={seed_work_id}, depth={depth}" + ) if depth < 0: - return {"result_type": "error", "data": {"error": "ValueError", "message": "Depth cannot be negative."}} + return { + "result_type": "error", + "data": {"error": "ValueError", "message": "Depth cannot be negative."}, + } engine = None db: Session | None = None @@ -129,7 +133,13 @@ def run_analysis( # Check if seed work exists seed_work = db.get(Work, seed_work_id) if not seed_work: - return {"result_type": "error", "data": {"error": "NotFound", "message": f"Seed work with ID {seed_work_id} not found."}} + return { + "result_type": "error", + "data": { + "error": "NotFound", + "message": f"Seed work with ID {seed_work_id} not found.", + }, + } # Fetch the network data nodes, edges = fetch_citation_network(db, seed_work_id, depth) @@ -137,23 +147,32 @@ def run_analysis( if not nodes or not edges: logger.info("No citation network found within the specified depth.") # Return empty communities if no network is found - return {"result_type": "value", "data": {"communities": [], "modularity": None}} + return { + "result_type": "value", + "data": {"communities": [], "modularity": None}, + } # Build the NetworkX graph G = nx.Graph() G.add_nodes_from(nodes) G.add_edges_from(edges) - logger.info(f"Built graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.") + logger.info( + f"Built graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges." + ) # Check if graph is connected (Louvain works better on connected components) if not nx.is_connected(G): - logger.warning("Graph is not connected. Louvain will run on the largest connected component.") + logger.warning( + "Graph is not connected. Louvain will run on the largest connected component." 
+ ) # Optionally run on each component, but for simplicity, run on largest largest_cc = max(nx.connected_components(G), key=len) - G_comp = G.subgraph(largest_cc).copy() # Create a subgraph copy - logger.info(f"Running Louvain on largest component ({len(G_comp.nodes())} nodes).") + G_comp = G.subgraph(largest_cc).copy() # Create a subgraph copy + logger.info( + f"Running Louvain on largest component ({len(G_comp.nodes())} nodes)." + ) else: - G_comp = G # Use the whole graph if connected + G_comp = G # Use the whole graph if connected # Perform community detection using Louvain logger.info("Running Louvain algorithm...") @@ -176,11 +195,20 @@ def run_analysis( logger.info(f"Detected {len(communities_result)} communities.") except ImportError: - # Already checked at top, but good practice - return {"result_type": "error", "data": {"error": "ImportError", "message": "NetworkX or python-louvain not installed."}} + # Already checked at top, but good practice + return { + "result_type": "error", + "data": { + "error": "ImportError", + "message": "NetworkX or python-louvain not installed.", + }, + } except Exception as e: logger.exception(f"Error during citation_community_detection_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() @@ -191,6 +219,8 @@ def run_analysis( "result_type": "value", "data": { "communities": communities_result, - "modularity": round(modularity_score, 5) if modularity_score is not None else None - } - } \ No newline at end of file + "modularity": round(modularity_score, 5) + if modularity_score is not None + else None, + }, + } diff --git a/contrib/queries/citing_work_subjects_v1.py b/contrib/queries/citing_work_subjects_v1.py index 901ad9b..b98fedc 100644 --- a/contrib/queries/citing_work_subjects_v1.py +++ b/contrib/queries/citing_work_subjects_v1.py @@ -1,7 +1,6 
@@ # --- NEW FILE: contrib/queries/citing_work_subjects_v1.py --- import sys -import os import logging from pathlib import Path from typing import List, Dict, Any, Optional, Set @@ -12,29 +11,36 @@ sys.path.insert(0, str(project_root)) # --- End Path Setup --- -from sqlalchemy import create_engine, select, func, and_, distinct, desc -from sqlalchemy.orm import sessionmaker, Session, aliased, Query +from sqlalchemy import create_engine, select, func, distinct, desc +from sqlalchemy.orm import sessionmaker, Session # Import required MOSS models from backend.data.models import ( - Repository, Work, DOIReference, WorkCitation, - WorkTopic, Topic, Subfield, Field, Domain + Work, + DOIReference, + WorkCitation, + WorkTopic, + Topic, + Subfield, + Field, + Domain, ) # --- Logging Setup --- logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [citing_work_subjects_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) + def run_analysis( db_conn_str: str, subject_level: str, repository_id: Optional[int] = None, doi: Optional[str] = None, - top_n: int = 10 + top_n: int = 10, ) -> Dict[str, Any]: """ Identifies the top N most frequent subjects (Domains, Fields, Subfields, or Topics) @@ -52,16 +58,36 @@ def run_analysis( If successful, data is a list of subject summary dictionaries. If error, data contains error details. 
""" - logger.info(f"Starting citing_work_subjects_v1 analysis for level='{subject_level}', repo={repository_id}, doi={doi}, top_n={top_n}") + logger.info( + f"Starting citing_work_subjects_v1 analysis for level='{subject_level}', repo={repository_id}, doi={doi}, top_n={top_n}" + ) if not repository_id and not doi: - return {"result_type": "error", "data": {"error": "ValueError", "message": "Either repository_id or doi must be provided."}} + return { + "result_type": "error", + "data": { + "error": "ValueError", + "message": "Either repository_id or doi must be provided.", + }, + } if repository_id and doi: - return {"result_type": "error", "data": {"error": "ValueError", "message": "Provide either repository_id or doi, not both."}} - - valid_levels = ['domain', 'field', 'subfield', 'topic'] + return { + "result_type": "error", + "data": { + "error": "ValueError", + "message": "Provide either repository_id or doi, not both.", + }, + } + + valid_levels = ["domain", "field", "subfield", "topic"] if subject_level not in valid_levels: - return {"result_type": "error", "data": {"error": "ValueError", "message": f"Invalid subject_level. Choose from: {valid_levels}"}} + return { + "result_type": "error", + "data": { + "error": "ValueError", + "message": f"Invalid subject_level. Choose from: {valid_levels}", + }, + } engine = None db: Session | None = None @@ -78,12 +104,14 @@ def run_analysis( logger.info(f"Finding works linked to repository_id: {repository_id}") stmt = select(distinct(DOIReference.work_id)).where( DOIReference.repository_id == repository_id, - DOIReference.work_id.is_not(None) + DOIReference.work_id.is_not(None), ) target_work_ids_result = db.execute(stmt).scalars().all() target_work_ids = set(target_work_ids_result) if not target_work_ids: - logger.info(f"No resolved works found linked to repository {repository_id}.") + logger.info( + f"No resolved works found linked to repository {repository_id}." 
+ ) return {"result_type": "table", "data": []} elif doi: logger.info(f"Finding work with DOI: {doi}") @@ -97,9 +125,8 @@ def run_analysis( logger.info(f"Found {len(target_work_ids)} target work ID(s).") # Step 2: Find works citing the target work(s) - citing_work_ids_stmt = ( - select(distinct(WorkCitation.citing_work_id)) - .where(WorkCitation.cited_work_id.in_(target_work_ids)) + citing_work_ids_stmt = select(distinct(WorkCitation.citing_work_id)).where( + WorkCitation.cited_work_id.in_(target_work_ids) ) citing_work_ids_result = db.execute(citing_work_ids_stmt).scalars().all() if not citing_work_ids_result: @@ -110,54 +137,74 @@ def run_analysis( # Step 3: Join citing works to the hierarchy and aggregate # Base query joining citing works through the hierarchy - base_query = db.query( - Topic.id.label("topic_id"), - Subfield.id.label("subfield_id"), Subfield.display_name.label("subfield_name"), - Field.id.label("field_id"), Field.display_name.label("field_name"), - Domain.id.label("domain_id"), Domain.display_name.label("domain_name"), - Topic.display_name.label("topic_name"), - func.count(distinct(WorkTopic.work_id)).label("citing_work_count") # Count distinct citing works - ).select_from(WorkTopic)\ - .join(Topic, WorkTopic.topic_id == Topic.id)\ - .join(Subfield, Topic.subfield_id == Subfield.id)\ - .join(Field, Subfield.field_id == Field.id)\ - .join(Domain, Field.domain_id == Domain.id)\ - .filter(WorkTopic.work_id.in_(citing_work_ids)) # Filter for citing works + base_query = ( + db.query( + Topic.id.label("topic_id"), + Subfield.id.label("subfield_id"), + Subfield.display_name.label("subfield_name"), + Field.id.label("field_id"), + Field.display_name.label("field_name"), + Domain.id.label("domain_id"), + Domain.display_name.label("domain_name"), + Topic.display_name.label("topic_name"), + func.count(distinct(WorkTopic.work_id)).label( + "citing_work_count" + ), # Count distinct citing works + ) + .select_from(WorkTopic) + .join(Topic, WorkTopic.topic_id 
== Topic.id) + .join(Subfield, Topic.subfield_id == Subfield.id) + .join(Field, Subfield.field_id == Field.id) + .join(Domain, Field.domain_id == Domain.id) + .filter(WorkTopic.work_id.in_(citing_work_ids)) + ) # Filter for citing works # --- Aggregation based on subject_level --- - if subject_level == 'topic': + if subject_level == "topic": agg_query = base_query.group_by( - Topic.id, Topic.display_name, - Subfield.id, Subfield.display_name, # Include parent details - Field.id, Field.display_name, - Domain.id, Domain.display_name + Topic.id, + Topic.display_name, + Subfield.id, + Subfield.display_name, # Include parent details + Field.id, + Field.display_name, + Domain.id, + Domain.display_name, ) entity_name_col = Topic.display_name - parent_info = lambda row: f"{row.subfield_name} (Subfield) / {row.field_name} (Field) / {row.domain_name} (Domain)" + parent_info = ( + lambda row: f"{row.subfield_name} (Subfield) / {row.field_name} (Field) / {row.domain_name} (Domain)" + ) - elif subject_level == 'subfield': + elif subject_level == "subfield": agg_query = base_query.group_by( - Subfield.id, Subfield.display_name, - Field.id, Field.display_name, # Include parent details - Domain.id, Domain.display_name + Subfield.id, + Subfield.display_name, + Field.id, + Field.display_name, # Include parent details + Domain.id, + Domain.display_name, ) entity_name_col = Subfield.display_name - parent_info = lambda row: f"{row.field_name} (Field) / {row.domain_name} (Domain)" + parent_info = ( + lambda row: f"{row.field_name} (Field) / {row.domain_name} (Domain)" + ) - elif subject_level == 'field': + elif subject_level == "field": agg_query = base_query.group_by( - Field.id, Field.display_name, - Domain.id, Domain.display_name # Include parent details + Field.id, + Field.display_name, + Domain.id, + Domain.display_name, # Include parent details ) entity_name_col = Field.display_name parent_info = lambda row: f"{row.domain_name} (Domain)" - else: # subject_level == 'domain' + else: 
# subject_level == 'domain' agg_query = base_query.group_by(Domain.id, Domain.display_name) entity_name_col = Domain.display_name parent_info = lambda row: None - # Add ordering and limit final_query = agg_query.order_by(desc("citing_work_count")).limit(top_n) @@ -167,21 +214,26 @@ def run_analysis( # Format results for row in query_results: - results.append({ - "subject_level": subject_level, - "subject_name": getattr(row, f"{subject_level}_name"), - "subject_id": getattr(row, f"{subject_level}_id"), - "parent_context": parent_info(row), - "citing_work_count": row.citing_work_count - }) + results.append( + { + "subject_level": subject_level, + "subject_name": getattr(row, f"{subject_level}_name"), + "subject_id": getattr(row, f"{subject_level}_id"), + "parent_context": parent_info(row), + "citing_work_count": row.citing_work_count, + } + ) except Exception as e: logger.exception(f"Error during citing_work_subjects_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - return {"result_type": "table", "data": results} \ No newline at end of file + return {"result_type": "table", "data": results} diff --git a/contrib/queries/engaged_non_pr_contributors_v1.py b/contrib/queries/engaged_non_pr_contributors_v1.py index 65d707b..bae401f 100644 --- a/contrib/queries/engaged_non_pr_contributors_v1.py +++ b/contrib/queries/engaged_non_pr_contributors_v1.py @@ -1,10 +1,9 @@ # --- NEW FILE: contrib/queries/engaged_non_pr_contributors_v1.py --- import sys -import os import logging from pathlib import Path -from typing import List, Dict, Any, Optional, Set +from typing import List, Dict, Any, Set # --- Path Setup --- # Ensures the script can find backend modules when run by the executor @@ -23,15 +22,12 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s 
[%(levelname)-5.5s] [engaged_non_pr_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) -def run_analysis( - db_conn_str: str, - repository_id: int -) -> Dict[str, Any]: +def run_analysis(db_conn_str: str, repository_id: int) -> Dict[str, Any]: """ Identifies contributors who have created issues but not pull requests for a given repository, and counts their created issues. @@ -47,7 +43,9 @@ def run_analysis( ordered by issue count descending. If error, data contains error details. """ - logger.info(f"Starting engaged_non_pr_contributors_v1 analysis for repository_id={repository_id}") + logger.info( + f"Starting engaged_non_pr_contributors_v1 analysis for repository_id={repository_id}" + ) engine = None db: Session | None = None @@ -59,26 +57,30 @@ def run_analysis( db = SessionLocal() # Step 1: Find contributors who authored PRs for the repo - pr_authors_stmt = ( - select(distinct(PullRequest.user_id)) - .where(PullRequest.repository_id == repository_id) + pr_authors_stmt = select(distinct(PullRequest.user_id)).where( + PullRequest.repository_id == repository_id ) pr_author_ids_result = db.execute(pr_authors_stmt).scalars().all() pr_author_ids: Set[int] = set(pr_author_ids_result) - logger.debug(f"Found {len(pr_author_ids)} distinct PR authors for repo {repository_id}.") + logger.debug( + f"Found {len(pr_author_ids)} distinct PR authors for repo {repository_id}." 
+ ) # Step 2: Find contributors who authored Issues for the repo - issue_authors_stmt = ( - select(distinct(Issue.user_id)) - .where(Issue.repository_id == repository_id) + issue_authors_stmt = select(distinct(Issue.user_id)).where( + Issue.repository_id == repository_id ) issue_author_ids_result = db.execute(issue_authors_stmt).scalars().all() issue_author_ids: Set[int] = set(issue_author_ids_result) - logger.debug(f"Found {len(issue_author_ids)} distinct Issue authors for repo {repository_id}.") + logger.debug( + f"Found {len(issue_author_ids)} distinct Issue authors for repo {repository_id}." + ) # Step 3: Find contributors in the second set but not the first non_pr_issue_author_ids = issue_author_ids - pr_author_ids - logger.info(f"Found {len(non_pr_issue_author_ids)} contributors who authored issues but not PRs.") + logger.info( + f"Found {len(non_pr_issue_author_ids)} contributors who authored issues but not PRs." + ) if not non_pr_issue_author_ids: logger.info("No contributors found who only authored issues.") @@ -88,12 +90,14 @@ def run_analysis( aggregation_stmt = ( select( Contributor.login.label("contributor_login"), - func.count(Issue.id).label("issue_count") + func.count(Issue.id).label("issue_count"), ) .select_from(Contributor) .join(Issue, Contributor.id == Issue.user_id) .where(Contributor.id.in_(non_pr_issue_author_ids)) - .where(Issue.repository_id == repository_id) # Ensure count is only for this repo + .where( + Issue.repository_id == repository_id + ) # Ensure count is only for this repo .group_by(Contributor.login) .order_by(desc("issue_count")) ) @@ -108,11 +112,14 @@ def run_analysis( except Exception as e: logger.exception(f"Error during engaged_non_pr_contributors_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - return 
{"result_type": "table", "data": results} \ No newline at end of file + return {"result_type": "table", "data": results} diff --git a/contrib/queries/institutional_authorship_v1.py b/contrib/queries/institutional_authorship_v1.py index 592a71f..d4f1f02 100644 --- a/contrib/queries/institutional_authorship_v1.py +++ b/contrib/queries/institutional_authorship_v1.py @@ -1,7 +1,6 @@ # --- NEW FILE: contrib/queries/institutional_authorship_v1.py --- import sys -import os import logging from pathlib import Path from typing import List, Dict, Any, Set @@ -18,22 +17,25 @@ # Import required MOSS models from backend.data.models import ( - Repository, Work, DOIReference, Person, Institution, Authorship, Affiliation + Repository, + Work, + DOIReference, + Person, + Institution, + Authorship, + Affiliation, ) # --- Logging Setup --- logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [inst_authorship_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) -def run_analysis( - db_conn_str: str, - repository_id: int -) -> Dict[str, Any]: +def run_analysis(db_conn_str: str, repository_id: int) -> Dict[str, Any]: """ Identifies institutions associated with authors of works linked to a specific repository. @@ -51,7 +53,9 @@ def run_analysis( ordered by count descending. If error, data contains error details. 
""" - logger.info(f"Starting institutional_authorship_v1 analysis for repository_id={repository_id}") + logger.info( + f"Starting institutional_authorship_v1 analysis for repository_id={repository_id}" + ) engine = None db: Session | None = None @@ -66,7 +70,13 @@ def run_analysis( repo = db.get(Repository, repository_id) if not repo: logger.error(f"Repository ID {repository_id} not found.") - return {"result_type": "error", "data": {"error": "NotFound", "message": f"Repository ID {repository_id} not found."}} + return { + "result_type": "error", + "data": { + "error": "NotFound", + "message": f"Repository ID {repository_id} not found.", + }, + } logger.info(f"Found repository: {repo.full_name}") # 2. Find all unique Work IDs linked to the repository via DOIReference @@ -78,47 +88,61 @@ def run_analysis( linked_work_ids_result = db.execute(linked_work_ids_stmt).scalars().all() if not linked_work_ids_result: - logger.info(f"No resolved works found linked to repository {repository_id}.") + logger.info( + f"No resolved works found linked to repository {repository_id}." + ) return {"result_type": "table", "data": []} linked_work_ids: Set[int] = set(linked_work_ids_result) - logger.info(f"Found {len(linked_work_ids)} unique works linked to repository {repository_id}.") + logger.info( + f"Found {len(linked_work_ids)} unique works linked to repository {repository_id}." + ) # 3. Query Authorship, Affiliation, Institution for these Work IDs # 4. 
Group by Institution and count distinct Persons aggregation_stmt = ( select( Institution.display_name.label("institution_name"), - func.count(distinct(Person.id)).label("distinct_author_count") + func.count(distinct(Person.id)).label("distinct_author_count"), ) .select_from(Work) .join(Authorship, Work.id == Authorship.work_id) .join(Person, Authorship.person_id == Person.id) # Ensure composite join condition for Authorship -> Affiliation - .join(Affiliation, and_( - Authorship.work_id == Affiliation.authorship_work_id, - Authorship.person_id == Affiliation.authorship_person_id - )) + .join( + Affiliation, + and_( + Authorship.work_id == Affiliation.authorship_work_id, + Authorship.person_id == Affiliation.authorship_person_id, + ), + ) .join(Institution, Affiliation.institution_id == Institution.id) .where(Work.id.in_(linked_work_ids)) .group_by(Institution.display_name) .order_by(desc("distinct_author_count")) ) - aggregation_results = db.execute(aggregation_stmt).mappings().all() # Fetch as dict-like + aggregation_results = ( + db.execute(aggregation_stmt).mappings().all() + ) # Fetch as dict-like # Format results results = [dict(row) for row in aggregation_results] - logger.info(f"Found {len(results)} institutions associated with authors of linked works.") + logger.info( + f"Found {len(results)} institutions associated with authors of linked works." 
+ ) except Exception as e: logger.exception(f"Error during institutional_authorship_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - return {"result_type": "table", "data": results} \ No newline at end of file + return {"result_type": "table", "data": results} diff --git a/contrib/queries/institutional_contribution_aggregation_v1.py b/contrib/queries/institutional_contribution_aggregation_v1.py index 8a0109f..0aded7d 100644 --- a/contrib/queries/institutional_contribution_aggregation_v1.py +++ b/contrib/queries/institutional_contribution_aggregation_v1.py @@ -1,10 +1,9 @@ # --- NEW FILE: contrib/queries/institutional_contribution_aggregation_v1.py --- import sys -import os import logging from pathlib import Path -from typing import List, Dict, Any, Optional, Set +from typing import List, Dict, Any, Set # --- Path Setup --- # Assuming this script is in contrib/queries/ @@ -19,25 +18,21 @@ # Import required MOSS models from backend.data.models import ( Repository, - Institution, # Although not directly queried, good practice RepositoryContributorAssociation, - RepositoryInstitutionAffiliation, - Contributor # Needed for unique count potentially + RepositoryInstitutionAffiliation, # Needed for unique count potentially ) # --- Logging Setup --- logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [inst_contrib_agg_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) def run_analysis( - db_conn_str: str, - institution_id: int, - min_confidence: float = 0.5 + db_conn_str: str, institution_id: int, min_confidence: float = 0.5 ) -> Dict[str, Any]: """ Aggregates contribution counts for repositories affiliated with a specific 
institution. @@ -58,7 +53,9 @@ def run_analysis( If successful, data is a list of repository contribution summary dictionaries. If error, data contains error details. """ - logger.info(f"Starting institutional_contribution_aggregation_v1 analysis for institution_id={institution_id}, min_confidence={min_confidence}") + logger.info( + f"Starting institutional_contribution_aggregation_v1 analysis for institution_id={institution_id}, min_confidence={min_confidence}" + ) engine = None db: Session | None = None @@ -74,54 +71,71 @@ def run_analysis( select(RepositoryInstitutionAffiliation.repository_id) .where( RepositoryInstitutionAffiliation.institution_id == institution_id, - RepositoryInstitutionAffiliation.confidence_score >= min_confidence + RepositoryInstitutionAffiliation.confidence_score >= min_confidence, ) .distinct() ) - affiliated_repo_ids_result = db.execute(affiliated_repo_ids_stmt).scalars().all() + affiliated_repo_ids_result = ( + db.execute(affiliated_repo_ids_stmt).scalars().all() + ) if not affiliated_repo_ids_result: - logger.info("No repositories found affiliated with the institution above the confidence threshold.") + logger.info( + "No repositories found affiliated with the institution above the confidence threshold." 
+ ) return {"result_type": "table", "data": []} affiliated_repo_ids: Set[int] = set(affiliated_repo_ids_result) logger.info(f"Found {len(affiliated_repo_ids)} affiliated repositories.") # Step 2 & 3: Aggregate contributions for these repositories - RepoContribAssoc = RepositoryContributorAssociation # Alias for brevity + RepoContribAssoc = RepositoryContributorAssociation # Alias for brevity aggregation_stmt = ( select( Repository.id.label("repository_id"), Repository.full_name.label("repository_full_name"), - func.sum(RepoContribAssoc.contributions_count).label("total_contributions"), - func.count(RepoContribAssoc.contributor_id).label("unique_contributors_count") # Count distinct contributors associated + func.sum(RepoContribAssoc.contributions_count).label( + "total_contributions" + ), + func.count(RepoContribAssoc.contributor_id).label( + "unique_contributors_count" + ), # Count distinct contributors associated ) .select_from(Repository) .join(RepoContribAssoc, Repository.id == RepoContribAssoc.repository_id) .where(Repository.id.in_(affiliated_repo_ids)) .group_by(Repository.id, Repository.full_name) - .order_by(desc("total_contributions")) # Order by contribution count + .order_by(desc("total_contributions")) # Order by contribution count ) - aggregation_results = db.execute(aggregation_stmt).mappings().all() # Fetch results as dict-like objects + aggregation_results = ( + db.execute(aggregation_stmt).mappings().all() + ) # Fetch results as dict-like objects # Format results into a list of dictionaries - results = [dict(row) for row in aggregation_results] # Convert RowMapping to dict + results = [ + dict(row) for row in aggregation_results + ] # Convert RowMapping to dict # Optional: Post-process to handle potential NULL sums if no contributions are recorded for row in results: - if row['total_contributions'] is None: - row['total_contributions'] = 0 # Replace None sum with 0 + if row["total_contributions"] is None: + row["total_contributions"] = 0 # Replace 
None sum with 0 logger.info(f"Aggregated contributions for {len(results)} repositories.") except Exception as e: - logger.exception(f"Error during institutional_contribution_aggregation_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + logger.exception( + f"Error during institutional_contribution_aggregation_v1 execution: {e}" + ) + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - return {"result_type": "table", "data": results} \ No newline at end of file + return {"result_type": "table", "data": results} diff --git a/contrib/queries/repo_health_v1.py b/contrib/queries/repo_health_v1.py index 1aabb7b..9b1ce4e 100644 --- a/contrib/queries/repo_health_v1.py +++ b/contrib/queries/repo_health_v1.py @@ -1,10 +1,9 @@ # --- UPDATED FILE: contrib/queries/repo_health_v1.py --- import sys -import os import logging from pathlib import Path -from typing import List, Dict, Any, Optional # Added Optional +from typing import List, Dict, Any, Optional # Added Optional from datetime import datetime, timezone, timedelta # --- Path Setup --- @@ -13,57 +12,67 @@ sys.path.insert(0, str(project_root)) # --- End Path Setup --- -from sqlalchemy import create_engine, select, text, Integer, cast +from sqlalchemy import create_engine, select from sqlalchemy.orm import sessionmaker, Session -from backend.data.models import Repository # Import model directly +from backend.data.models import Repository # Import model directly # --- Logging Setup --- logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [repo_health_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) + def calculate_repo_health(repo: Repository) -> Dict[str, Any]: """Calculates health score and metrics for a single Repository object.""" 
metrics = {} score_components = {} # Metric 1: Has Description? - metrics['has_description'] = bool(repo.description and len(repo.description) > 10) - score_components['description'] = 0.1 if metrics['has_description'] else 0.0 + metrics["has_description"] = bool(repo.description and len(repo.description) > 10) + score_components["description"] = 0.1 if metrics["has_description"] else 0.0 # Metric 2: Has License? - metrics['has_license'] = bool(repo.license and repo.license.get('key') != 'other') - score_components['license'] = 0.15 if metrics['has_license'] else 0.0 + metrics["has_license"] = bool(repo.license and repo.license.get("key") != "other") + score_components["license"] = 0.15 if metrics["has_license"] else 0.0 # Metric 3: Recently Pushed? (e.g., within last 6 months) cutoff_date = datetime.now(timezone.utc) - timedelta(days=180) - metrics['recently_pushed'] = bool(repo.gh_pushed_at and repo.gh_pushed_at > cutoff_date) - score_components['activity'] = 0.25 if metrics['recently_pushed'] else 0.05 # Some score even if old + metrics["recently_pushed"] = bool( + repo.gh_pushed_at and repo.gh_pushed_at > cutoff_date + ) + score_components["activity"] = ( + 0.25 if metrics["recently_pushed"] else 0.05 + ) # Some score even if old # Metric 4: Star Score (simple scaling) stars = repo.stargazers_count or 0 - metrics['stars'] = stars + metrics["stars"] = stars # Simple log scale, capping score contribution - score_components['stars'] = min(0.25 * ( (stars / 100) if stars < 100 else (1 + (stars-100)**0.2 / 5)), 0.25) + score_components["stars"] = min( + 0.25 * ((stars / 100) if stars < 100 else (1 + (stars - 100) ** 0.2 / 5)), 0.25 + ) # Metric 5: Fork Score (simple scaling) forks = repo.forks_count or 0 - metrics['forks'] = forks - score_components['forks'] = min(0.10 * ( (forks / 20) if forks < 20 else (1 + (forks-20)**0.2 / 10) ), 0.10) + metrics["forks"] = forks + score_components["forks"] = min( + 0.10 * ((forks / 20) if forks < 20 else (1 + (forks - 20) ** 0.2 
/ 10)), 0.10 + ) # Metric 6: Open Issues vs Watchers (basic proxy for engagement vs. potential issues) # Avoid division by zero open_issues = repo.open_issues_count or 0 - watchers = repo.watchers_count or 0 # Note: GitHub API v3 'watchers' is actually 'subscribers' - metrics['open_issues'] = open_issues - metrics['subscribers'] = watchers - issue_ratio = open_issues / (watchers + 1) # Add 1 to avoid zero division + watchers = ( + repo.watchers_count or 0 + ) # Note: GitHub API v3 'watchers' is actually 'subscribers' + metrics["open_issues"] = open_issues + metrics["subscribers"] = watchers + issue_ratio = open_issues / (watchers + 1) # Add 1 to avoid zero division # Lower ratio is better, capped score - score_components['issues'] = max(0.15 * (1 - min(issue_ratio, 1.0)), 0) - + score_components["issues"] = max(0.15 * (1 - min(issue_ratio, 1.0)), 0) # Calculate final score (sum of components, max 1.0) total_score = sum(score_components.values()) @@ -73,7 +82,9 @@ def calculate_repo_health(repo: Repository) -> Dict[str, Any]: "full_name": repo.full_name, "score": round(total_score, 3), "metrics": metrics, - "score_components": {k: round(v, 3) for k, v in score_components.items()} # Rounded components + "score_components": { + k: round(v, 3) for k, v in score_components.items() + }, # Rounded components } @@ -81,7 +92,7 @@ def calculate_repo_health(repo: Repository) -> Dict[str, Any]: def run_analysis( db_conn_str: str, repository_id: Optional[int] = None, - repository_ids: Optional[List[int]] = None + repository_ids: Optional[List[int]] = None, ) -> Dict[str, Any]: """ Calculates a basic health score for one or more GitHub repositories. 
@@ -102,8 +113,16 @@ def run_analysis( logger.info("Starting repo_health_v1 analysis...") if not repository_ids and repository_id is None: - logger.error("Missing required parameter: provide either repository_id or repository_ids.") - return {"result_type": "error", "data": {"error": "ValueError", "message": "Missing required parameter: provide either repository_id or repository_ids."}} + logger.error( + "Missing required parameter: provide either repository_id or repository_ids." + ) + return { + "result_type": "error", + "data": { + "error": "ValueError", + "message": "Missing required parameter: provide either repository_id or repository_ids.", + }, + } target_ids: List[int] = [] if repository_ids: @@ -113,17 +132,29 @@ def run_analysis( target_ids = [int(rid) for rid in repository_ids] except (ValueError, TypeError) as e: logger.error(f"Invalid format for repository_ids: {e}") - return {"result_type": "error", "data": {"error": "TypeError", "message": f"Invalid repository_ids format: {e}"}} + return { + "result_type": "error", + "data": { + "error": "TypeError", + "message": f"Invalid repository_ids format: {e}", + }, + } elif repository_id is not None: logger.info(f"Processing single repository ID: {repository_id}") try: target_ids = [int(repository_id)] except (ValueError, TypeError) as e: - logger.error(f"Invalid format for repository_id: {e}") - return {"result_type": "error", "data": {"error": "TypeError", "message": f"Invalid repository_id format: {e}"}} + logger.error(f"Invalid format for repository_id: {e}") + return { + "result_type": "error", + "data": { + "error": "TypeError", + "message": f"Invalid repository_id format: {e}", + }, + } if not target_ids: - return {"result_type": "table", "data": []} # Return empty if no valid IDs + return {"result_type": "table", "data": []} # Return empty if no valid IDs engine = None db: Session | None = None @@ -140,14 +171,16 @@ def run_analysis( repos_found = db.execute(stmt).scalars().all() found_ids = {repo.id 
for repo in repos_found} - logger.info(f"Found {len(repos_found)} repositories in the database out of {len(target_ids)} requested.") + logger.info( + f"Found {len(repos_found)} repositories in the database out of {len(target_ids)} requested." + ) # Check for missing repos missing_ids = set(target_ids) - found_ids if missing_ids: msg = f"Repositories not found for IDs: {', '.join(map(str, missing_ids))}" logger.warning(msg) - errors.append(msg) # Add to overall errors/notes + errors.append(msg) # Add to overall errors/notes # Calculate health for found repos for repo in repos_found: @@ -155,31 +188,44 @@ def run_analysis( health_data = calculate_repo_health(repo) results_list.append(health_data) except Exception as calc_err: - logger.error(f"Error calculating health for repo {repo.id}: {calc_err}", exc_info=True) - errors.append(f"Error calculating health for repo {repo.id}: {calc_err}") + logger.error( + f"Error calculating health for repo {repo.id}: {calc_err}", + exc_info=True, + ) + errors.append( + f"Error calculating health for repo {repo.id}: {calc_err}" + ) # Optionally add a partial error entry to results_list - results_list.append({ - "repository_id": repo.id, - "full_name": repo.full_name, - "score": None, - "error": str(calc_err) - }) - + results_list.append( + { + "repository_id": repo.id, + "full_name": repo.full_name, + "score": None, + "error": str(calc_err), + } + ) except Exception as e: logger.exception(f"Error during repo_health_v1 execution: {e}") # Return a general error if DB connection or main query fails - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - logger.info(f"Repo_health_v1 analysis finished. Calculated health for {len(results_list)} repositories.") + logger.info( + f"Repo_health_v1 analysis finished. 
Calculated health for {len(results_list)} repositories." + ) # Return as a table, include errors/notes if any occurred return { "result_type": "table", "data": results_list, - "notes": errors if errors else None # Add notes field for missing IDs or calculation errors - } \ No newline at end of file + "notes": errors + if errors + else None, # Add notes field for missing IDs or calculation errors + } diff --git a/contrib/queries/top_pr_contributors_v1.py b/contrib/queries/top_pr_contributors_v1.py index 9bd980b..1194fe4 100644 --- a/contrib/queries/top_pr_contributors_v1.py +++ b/contrib/queries/top_pr_contributors_v1.py @@ -1,10 +1,9 @@ # --- CORRECTED FILE: contrib/queries/top_pr_contributors_v1.py --- import sys -import os import logging from pathlib import Path -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any # --- Path Setup --- # Ensures the script can find backend modules when run by the executor @@ -23,15 +22,13 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [top_pr_contrib_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) def run_analysis( - db_conn_str: str, - repository_id: int, - limit: int = 10 + db_conn_str: str, repository_id: int, limit: int = 10 ) -> Dict[str, Any]: """ Identifies the top contributors to a repository based on merged Pull Requests. @@ -48,15 +45,17 @@ def run_analysis( ordered by count descending. If error, data contains error details. """ - logger.info(f"Starting top_pr_contributors_v1 analysis for repository_id={repository_id}, limit={limit}") + logger.info( + f"Starting top_pr_contributors_v1 analysis for repository_id={repository_id}, limit={limit}" + ) engine = None db: Session | None = None results: List[Dict[str, Any]] = [] if limit <= 0: - logger.warning("Limit must be a positive integer. 
Setting limit to 10.") - limit = 10 + logger.warning("Limit must be a positive integer. Setting limit to 10.") + limit = 10 try: engine = create_engine(db_conn_str) @@ -67,13 +66,13 @@ def run_analysis( aggregation_stmt = ( select( Contributor.login.label("contributor_login"), - func.count(PullRequest.id).label("merged_pr_count") + func.count(PullRequest.id).label("merged_pr_count"), ) .select_from(Contributor) .join(PullRequest, Contributor.id == PullRequest.user_id) .where(PullRequest.repository_id == repository_id) # --- FIX: Use correct column name 'gh_merged_at' --- - .where(PullRequest.gh_merged_at.isnot(None)) # Filter for merged PRs + .where(PullRequest.gh_merged_at.isnot(None)) # Filter for merged PRs # --- END FIX --- .group_by(Contributor.login) .order_by(desc("merged_pr_count")) @@ -81,7 +80,9 @@ def run_analysis( ) logger.info("Executing contributor PR count query...") - aggregation_results = db.execute(aggregation_stmt).mappings().all() # Fetch as dict-like + aggregation_results = ( + db.execute(aggregation_stmt).mappings().all() + ) # Fetch as dict-like # Format results results = [dict(row) for row in aggregation_results] @@ -90,11 +91,14 @@ def run_analysis( except Exception as e: logger.exception(f"Error during top_pr_contributors_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - return {"result_type": "table", "data": results} \ No newline at end of file + return {"result_type": "table", "data": results} diff --git a/contrib/queries/top_subjects_v1.py b/contrib/queries/top_subjects_v1.py index 6d72d09..14a5395 100644 --- a/contrib/queries/top_subjects_v1.py +++ b/contrib/queries/top_subjects_v1.py @@ -1,7 +1,6 @@ # --- UPDATED FILE: contrib/queries/top_subjects_v1.py --- import sys -import os import logging from pathlib import Path 
from typing import List, Dict, Any, Optional, Set, Tuple @@ -13,28 +12,35 @@ # --- End Path Setup --- from sqlalchemy import create_engine, select, func, and_, distinct, desc, Column -from sqlalchemy.orm import sessionmaker, Session, aliased, Query +from sqlalchemy.orm import sessionmaker, Session # Import required MOSS models from backend.data.models import ( - Repository, Work, DOIReference, Institution, Affiliation, Authorship, - WorkTopic, Topic, Subfield, Field, Domain + Repository, + Work, + DOIReference, + Institution, + Affiliation, + Authorship, + WorkTopic, + Topic, + Subfield, + Field, + Domain, ) # --- Logging Setup --- logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [top_subjects_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) + # --- Helper function for single level analysis --- def _get_top_subjects_for_level( - db: Session, - level: str, - top_n: int, - target_work_ids: Optional[Set[int]] = None + db: Session, level: str, top_n: int, target_work_ids: Optional[Set[int]] = None ) -> List[Dict[str, Any]]: """ Performs the aggregation for a single subject level. 
@@ -46,11 +52,15 @@ def _get_top_subjects_for_level( base_query_stmt = ( select( # Select necessary IDs and names for grouping and context - Domain.id.label("domain_id"), Domain.display_name.label("domain_name"), - Field.id.label("field_id"), Field.display_name.label("field_name"), - Subfield.id.label("subfield_id"), Subfield.display_name.label("subfield_name"), - Topic.id.label("topic_id"), Topic.display_name.label("topic_name"), - func.count(distinct(Work.id)).label("work_count") # Count distinct works + Domain.id.label("domain_id"), + Domain.display_name.label("domain_name"), + Field.id.label("field_id"), + Field.display_name.label("field_name"), + Subfield.id.label("subfield_id"), + Subfield.display_name.label("subfield_name"), + Topic.id.label("topic_id"), + Topic.display_name.label("topic_name"), + func.count(distinct(Work.id)).label("work_count"), # Count distinct works ) .select_from(Work) .join(WorkTopic, Work.id == WorkTopic.work_id) @@ -62,46 +72,67 @@ def _get_top_subjects_for_level( # Apply work ID filter if target_work_ids is not None if target_work_ids is not None: - if not target_work_ids: # Handle empty set case explicitly - logger.debug(f"Target work ID set is empty for level {level}, returning no results.") + if not target_work_ids: # Handle empty set case explicitly + logger.debug( + f"Target work ID set is empty for level {level}, returning no results." 
+ ) return [] base_query_stmt = base_query_stmt.where(Work.id.in_(target_work_ids)) # --- Dynamic Aggregation based on subject_level --- - group_by_cols: List[Tuple[Column, str]] = [] # Store tuples of (Column, label_name) - select_cols: List[Column] = [] # Store columns to select directly + group_by_cols: List[Tuple[Column, str]] = [] # Store tuples of (Column, label_name) + select_cols: List[Column] = [] # Store columns to select directly - if level == 'topic': + if level == "topic": group_by_cols = [ - (Topic.id, "topic_id"), (Topic.display_name, "topic_name"), - (Subfield.id, "subfield_id"), (Subfield.display_name, "subfield_name"), - (Field.id, "field_id"), (Field.display_name, "field_name"), - (Domain.id, "domain_id"), (Domain.display_name, "domain_name") + (Topic.id, "topic_id"), + (Topic.display_name, "topic_name"), + (Subfield.id, "subfield_id"), + (Subfield.display_name, "subfield_name"), + (Field.id, "field_id"), + (Field.display_name, "field_name"), + (Domain.id, "domain_id"), + (Domain.display_name, "domain_name"), ] select_cols = [col for col, _ in group_by_cols] - parent_info = lambda row: f"{row.get('subfield_name')} (Subfield) / {row.get('field_name')} (Field) / {row.get('domain_name')} (Domain)" if row.get('subfield_name') else None + parent_info = ( + lambda row: f"{row.get('subfield_name')} (Subfield) / {row.get('field_name')} (Field) / {row.get('domain_name')} (Domain)" + if row.get("subfield_name") + else None + ) - elif level == 'subfield': + elif level == "subfield": group_by_cols = [ - (Subfield.id, "subfield_id"), (Subfield.display_name, "subfield_name"), - (Field.id, "field_id"), (Field.display_name, "field_name"), - (Domain.id, "domain_id"), (Domain.display_name, "domain_name") + (Subfield.id, "subfield_id"), + (Subfield.display_name, "subfield_name"), + (Field.id, "field_id"), + (Field.display_name, "field_name"), + (Domain.id, "domain_id"), + (Domain.display_name, "domain_name"), ] select_cols = [col for col, _ in group_by_cols] - 
parent_info = lambda row: f"{row.get('field_name')} (Field) / {row.get('domain_name')} (Domain)" if row.get('field_name') else None + parent_info = ( + lambda row: f"{row.get('field_name')} (Field) / {row.get('domain_name')} (Domain)" + if row.get("field_name") + else None + ) - elif level == 'field': + elif level == "field": group_by_cols = [ - (Field.id, "field_id"), (Field.display_name, "field_name"), - (Domain.id, "domain_id"), (Domain.display_name, "domain_name") + (Field.id, "field_id"), + (Field.display_name, "field_name"), + (Domain.id, "domain_id"), + (Domain.display_name, "domain_name"), ] select_cols = [col for col, _ in group_by_cols] - parent_info = lambda row: f"{row.get('domain_name')} (Domain)" if row.get('domain_name') else None + parent_info = ( + lambda row: f"{row.get('domain_name')} (Domain)" + if row.get("domain_name") + else None + ) - elif level == 'domain': - group_by_cols = [ - (Domain.id, "domain_id"), (Domain.display_name, "domain_name") - ] + elif level == "domain": + group_by_cols = [(Domain.id, "domain_id"), (Domain.display_name, "domain_name")] select_cols = [col for col, _ in group_by_cols] parent_info = lambda row: None else: @@ -110,41 +141,51 @@ def _get_top_subjects_for_level( # Final aggregation query for this level final_query_stmt = ( - base_query_stmt - .group_by(*[col for col, _ in group_by_cols]) # Group by the actual columns + base_query_stmt.group_by( + *[col for col, _ in group_by_cols] + ) # Group by the actual columns .order_by(desc("work_count")) .limit(top_n) # Re-select only the necessary columns for this level + count .with_only_columns( - *[col.label(label) for col, label in group_by_cols], # Select grouped columns with labels - func.count(distinct(Work.id)).label("work_count") # Select the count again + *[ + col.label(label) for col, label in group_by_cols + ], # Select grouped columns with labels + func.count(distinct(Work.id)).label("work_count"), # Select the count again ) ) logger.debug(f"Executing 
aggregation query for level '{level}'...") - query_results = db.execute(final_query_stmt).mappings().all() # Use mappings() - logger.info(f"Aggregation query for level '{level}' returned {len(query_results)} results.") + query_results = db.execute(final_query_stmt).mappings().all() # Use mappings() + logger.info( + f"Aggregation query for level '{level}' returned {len(query_results)} results." + ) # Format results for row_mapping in query_results: - row_dict = dict(row_mapping) # Convert RowMapping to dict - results.append({ - "subject_level": level, - "subject_name": row_dict.get(f"{level}_name"), - "subject_id": row_dict.get(f"{level}_id"), - "parent_context": parent_info(row_dict), - "associated_work_count": row_dict.get("work_count") - }) + row_dict = dict(row_mapping) # Convert RowMapping to dict + results.append( + { + "subject_level": level, + "subject_name": row_dict.get(f"{level}_name"), + "subject_id": row_dict.get(f"{level}_id"), + "parent_context": parent_info(row_dict), + "associated_work_count": row_dict.get("work_count"), + } + ) return results + + # --- End Helper function --- + def run_analysis( db_conn_str: str, subject_level: str, top_n: int = 10, repository_id: Optional[int] = None, - institution_id: Optional[int] = None + institution_id: Optional[int] = None, ) -> Dict[str, Any]: """ Identifies the top N most frequent subjects for one or all levels ('topic', @@ -164,14 +205,28 @@ def run_analysis( If successful, data is a list of subject summary dictionaries. If error, data contains error details. 
""" - logger.info(f"Starting top_subjects_v1 analysis for level='{subject_level}', repo={repository_id}, inst={institution_id}, top_n={top_n}") + logger.info( + f"Starting top_subjects_v1 analysis for level='{subject_level}', repo={repository_id}, inst={institution_id}, top_n={top_n}" + ) if repository_id and institution_id: - return {"result_type": "error", "data": {"error": "ValueError", "message": "Provide either repository_id or institution_id, not both."}} + return { + "result_type": "error", + "data": { + "error": "ValueError", + "message": "Provide either repository_id or institution_id, not both.", + }, + } - valid_levels = ['domain', 'field', 'subfield', 'topic', 'all'] + valid_levels = ["domain", "field", "subfield", "topic", "all"] if subject_level not in valid_levels: - return {"result_type": "error", "data": {"error": "ValueError", "message": f"Invalid subject_level. Choose from: {valid_levels}"}} + return { + "result_type": "error", + "data": { + "error": "ValueError", + "message": f"Invalid subject_level. 
Choose from: {valid_levels}", + }, + } engine = None db: Session | None = None @@ -188,50 +243,97 @@ def run_analysis( if repository_id: repo = db.get(Repository, repository_id) if not repo: - return {"result_type": "error", "data": {"error": "NotFound", "message": f"Repository ID {repository_id} not found."}} - filter_context = {"type": "repository", "id": repository_id, "name": repo.full_name} + return { + "result_type": "error", + "data": { + "error": "NotFound", + "message": f"Repository ID {repository_id} not found.", + }, + } + filter_context = { + "type": "repository", + "id": repository_id, + "name": repo.full_name, + } logger.info(f"Filtering works linked to repository: {repo.full_name}") stmt = select(distinct(DOIReference.work_id)).where( DOIReference.repository_id == repository_id, - DOIReference.work_id.is_not(None) + DOIReference.work_id.is_not(None), ) work_ids_result = db.execute(stmt).scalars().all() target_work_ids = set(work_ids_result) if not target_work_ids: - logger.info(f"No resolved works found linked to repository {repository_id}.") - return {"result_type": "table", "data": [], "filter_context": filter_context} - logger.info(f"Found {len(target_work_ids)} target works for repository {repository_id}.") + logger.info( + f"No resolved works found linked to repository {repository_id}." + ) + return { + "result_type": "table", + "data": [], + "filter_context": filter_context, + } + logger.info( + f"Found {len(target_work_ids)} target works for repository {repository_id}." 
+ ) elif institution_id: inst = db.get(Institution, institution_id) if not inst: - return {"result_type": "error", "data": {"error": "NotFound", "message": f"Institution ID {institution_id} not found."}} - filter_context = {"type": "institution", "id": institution_id, "name": inst.display_name} + return { + "result_type": "error", + "data": { + "error": "NotFound", + "message": f"Institution ID {institution_id} not found.", + }, + } + filter_context = { + "type": "institution", + "id": institution_id, + "name": inst.display_name, + } logger.info(f"Filtering works linked to institution: {inst.display_name}") stmt = ( - select(distinct(Authorship.work_id)) - .join(Affiliation, and_(Authorship.work_id == Affiliation.authorship_work_id, Authorship.person_id == Affiliation.authorship_person_id)) + select(distinct(Authorship.work_id)) + .join( + Affiliation, + and_( + Authorship.work_id == Affiliation.authorship_work_id, + Authorship.person_id == Affiliation.authorship_person_id, + ), + ) .where(Affiliation.institution_id == institution_id) ) work_ids_result = db.execute(stmt).scalars().all() target_work_ids = set(work_ids_result) if not target_work_ids: logger.info(f"No works found linked to institution {institution_id}.") - return {"result_type": "table", "data": [], "filter_context": filter_context} - logger.info(f"Found {len(target_work_ids)} target works for institution {institution_id}.") + return { + "result_type": "table", + "data": [], + "filter_context": filter_context, + } + logger.info( + f"Found {len(target_work_ids)} target works for institution {institution_id}." 
+ ) # Step 2: Run analysis for the specified level(s) - if subject_level == 'all': - levels_to_run = ['topic', 'subfield', 'field', 'domain'] + if subject_level == "all": + levels_to_run = ["topic", "subfield", "field", "domain"] for level in levels_to_run: - level_results = _get_top_subjects_for_level(db, level, top_n, target_work_ids) + level_results = _get_top_subjects_for_level( + db, level, top_n, target_work_ids + ) all_results.extend(level_results) else: - all_results = _get_top_subjects_for_level(db, subject_level, top_n, target_work_ids) + all_results = _get_top_subjects_for_level( + db, subject_level, top_n, target_work_ids + ) except Exception as e: logger.exception(f"Error during top_subjects_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() @@ -240,6 +342,6 @@ def run_analysis( return { "result_type": "table", - "filter_context": filter_context, # Add the context here - "data": all_results - } \ No newline at end of file + "filter_context": filter_context, # Add the context here + "data": all_results, + } diff --git a/contrib/queries/works_by_citing_institution_v1.py b/contrib/queries/works_by_citing_institution_v1.py index e054c26..583a052 100644 --- a/contrib/queries/works_by_citing_institution_v1.py +++ b/contrib/queries/works_by_citing_institution_v1.py @@ -1,10 +1,9 @@ # --- NEW FILE: contrib/queries/works_by_citing_institution_v1.py --- import sys -import os import logging from pathlib import Path -from typing import List, Dict, Any, Optional, Set +from typing import List, Dict, Any, Set # --- Path Setup --- # Assuming this script is in contrib/queries/ @@ -13,20 +12,19 @@ sys.path.insert(0, str(project_root)) # --- End Path Setup --- -from sqlalchemy import create_engine, select, func, and_, distinct, join, alias -from sqlalchemy.orm import sessionmaker, Session, 
aliased +from sqlalchemy import create_engine, select, and_, distinct +from sqlalchemy.orm import sessionmaker, Session # Import required MOSS models from backend.data.models import ( - Repository, - Institution, Work, DOIReference, WorkCitation, Authorship, Affiliation, - RepositoryInstitutionAffiliation + RepositoryInstitutionAffiliation, ) + # Import required MOSS schema for structuring output from backend.schemas.responses import WorkSummary @@ -34,15 +32,13 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] [works_by_citing_inst_v1] - %(message)s", - handlers=[logging.StreamHandler(sys.stderr)] + handlers=[logging.StreamHandler(sys.stderr)], ) logger = logging.getLogger(__name__) def run_analysis( - db_conn_str: str, - institution_id: int, - min_confidence: float = 0.5 + db_conn_str: str, institution_id: int, min_confidence: float = 0.5 ) -> Dict[str, Any]: """ Finds scholarly works that cite repositories affiliated with a specific institution, @@ -66,7 +62,9 @@ def run_analysis( If successful, data is a list of work summary dictionaries. If error, data contains error details. 
""" - logger.info(f"Starting works_by_citing_institution_v1 analysis for institution_id={institution_id}, min_confidence={min_confidence}") + logger.info( + f"Starting works_by_citing_institution_v1 analysis for institution_id={institution_id}, min_confidence={min_confidence}" + ) engine = None db: Session | None = None @@ -82,13 +80,17 @@ def run_analysis( select(RepositoryInstitutionAffiliation.repository_id) .where( RepositoryInstitutionAffiliation.institution_id == institution_id, - RepositoryInstitutionAffiliation.confidence_score >= min_confidence + RepositoryInstitutionAffiliation.confidence_score >= min_confidence, ) .distinct() ) - affiliated_repo_ids_result = db.execute(affiliated_repo_ids_stmt).scalars().all() + affiliated_repo_ids_result = ( + db.execute(affiliated_repo_ids_stmt).scalars().all() + ) if not affiliated_repo_ids_result: - logger.info("No repositories found affiliated with the institution above the confidence threshold.") + logger.info( + "No repositories found affiliated with the institution above the confidence threshold." + ) return {"result_type": "table", "data": []} affiliated_repo_ids: Set[int] = set(affiliated_repo_ids_result) logger.info(f"Found {len(affiliated_repo_ids)} affiliated repositories.") @@ -98,7 +100,7 @@ def run_analysis( select(DOIReference.work_id) .where( DOIReference.repository_id.in_(affiliated_repo_ids), - DOIReference.work_id.is_not(None) # Ensure the DOI was resolved + DOIReference.work_id.is_not(None), # Ensure the DOI was resolved ) .distinct() ) @@ -107,7 +109,9 @@ def run_analysis( logger.info("No cited works found linked to the affiliated repositories.") return {"result_type": "table", "data": []} cited_work_ids: Set[int] = set(cited_work_ids_result) - logger.info(f"Found {len(cited_work_ids)} unique works cited by affiliated repositories.") + logger.info( + f"Found {len(cited_work_ids)} unique works cited by affiliated repositories." 
+ ) # Step 3 & 4: Find citing works (W_citing) whose authors are affiliated with the target institution # This is the most complex query. We need W_citing where: @@ -124,22 +128,31 @@ def run_analysis( select(distinct(WC.citing_work_id)) .select_from(WC) .join(Aship, WC.citing_work_id == Aship.work_id) - .join(Aff, and_( - Aship.work_id == Aff.authorship_work_id, - Aship.person_id == Aff.authorship_person_id - )) + .join( + Aff, + and_( + Aship.work_id == Aff.authorship_work_id, + Aship.person_id == Aff.authorship_person_id, + ), + ) .where( WC.cited_work_id.in_(cited_work_ids), - Aff.institution_id == institution_id + Aff.institution_id == institution_id, ) ) - valid_citing_work_ids_result = db.execute(valid_citing_work_ids_stmt).scalars().all() + valid_citing_work_ids_result = ( + db.execute(valid_citing_work_ids_stmt).scalars().all() + ) if not valid_citing_work_ids_result: - logger.info("No citing works found with authors affiliated with the target institution.") + logger.info( + "No citing works found with authors affiliated with the target institution." + ) return {"result_type": "table", "data": []} valid_citing_work_ids: List[int] = valid_citing_work_ids_result - logger.info(f"Found {len(valid_citing_work_ids)} candidate citing works with relevant author affiliations.") + logger.info( + f"Found {len(valid_citing_work_ids)} candidate citing works with relevant author affiliations." 
+ ) # Step 5: Fetch Work details for the valid citing work IDs final_works_stmt = ( @@ -151,29 +164,36 @@ def run_analysis( # Format results using WorkSummary Pydantic model (or manually construct dict) for work in final_works: - # Use the Pydantic model to serialize, handling potential None values - try: - summary = WorkSummary.model_validate(work) - results.append(summary.model_dump()) - except Exception as pydantic_err: - logger.warning(f"Could not validate Work ID {work.id} for WorkSummary: {pydantic_err}") - # Fallback to manual dict creation if validation fails - results.append({ - "id": work.id, - "title": work.title, - "doi": work.doi, - "publication_year": work.publication_year - }) + # Use the Pydantic model to serialize, handling potential None values + try: + summary = WorkSummary.model_validate(work) + results.append(summary.model_dump()) + except Exception as pydantic_err: + logger.warning( + f"Could not validate Work ID {work.id} for WorkSummary: {pydantic_err}" + ) + # Fallback to manual dict creation if validation fails + results.append( + { + "id": work.id, + "title": work.title, + "doi": work.doi, + "publication_year": work.publication_year, + } + ) logger.info(f"Returning {len(results)} works.") except Exception as e: logger.exception(f"Error during works_by_citing_institution_v1 execution: {e}") - return {"result_type": "error", "data": {"error": type(e).__name__, "message": str(e)}} + return { + "result_type": "error", + "data": {"error": type(e).__name__, "message": str(e)}, + } finally: if db: db.close() if engine: engine.dispose() - return {"result_type": "table", "data": results} \ No newline at end of file + return {"result_type": "table", "data": results} diff --git a/pyproject.toml b/pyproject.toml index 4c995d9..7c3a7b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,3 +16,8 @@ dependencies = [ "sqlalchemy>=2.0.40", "uvicorn[standard]>=0.34.2", ] + +[dependency-groups] +dev = [ + "pre-commit>=4.2.0", +] diff --git 
a/scripts/setup_db.py b/scripts/setup_db.py index c073d7f..12a4890 100644 --- a/scripts/setup_db.py +++ b/scripts/setup_db.py @@ -3,13 +3,14 @@ import logging # Ensure the backend package is discoverable -PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if PROJECT_ROOT not in sys.path: sys.path.insert(0, PROJECT_ROOT) from alembic.config import Config from alembic import command -from backend.config.logging_config import setup_logging # Use our logging setup +from backend.config.logging_config import setup_logging # Use our logging setup + # Import settings to ensure .env is loaded if alembic.ini relies on it indirectly # (Although our current env.py loads it directly) try: @@ -18,21 +19,22 @@ print(f"ERROR: Could not load settings. Is .env configured correctly? Details: {e}") sys.exit(1) except ImportError as e: - print(f"ERROR: Could not import settings. Path issue? Details: {e}") - sys.exit(1) + print(f"ERROR: Could not import settings. Path issue? 
Details: {e}") + sys.exit(1) # Set up logging for the script setup_logging() logger = logging.getLogger(__name__) + def main(): """Applies Alembic migrations to the database.""" logger.info("Starting database setup/migration...") # Construct the absolute path to alembic.ini relative to this script # Assumes this script is in moss/scripts/ and alembic.ini is in moss/ - alembic_ini_path = os.path.join(PROJECT_ROOT, 'alembic.ini') + alembic_ini_path = os.path.join(PROJECT_ROOT, "alembic.ini") logger.info(f"Using Alembic config: {alembic_ini_path}") if not os.path.exists(alembic_ini_path): @@ -58,10 +60,11 @@ def main(): logger.error(f"Error applying database migrations: {e}", exc_info=True) return False + if __name__ == "__main__": if main(): print("Database setup script completed successfully.") sys.exit(0) else: print("Database setup script failed.") - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/uv.lock b/uv.lock index 83de40e..c2c6afe 100644 --- a/uv.lock +++ b/uv.lock @@ -98,6 +98,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3", size = 159618, upload-time = "2025-04-26T02:12:27.662Z" }, ] +[[package]] +name = "cfgv" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = 
"2023-08-12T20:38:16.269Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.2" @@ -229,6 +238,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d4/f6/a6a9f45769e955ed52fb2c1e06599c37f481028530a405793a7de5ba2625/concurrent_log_handler-0.9.26-py3-none-any.whl", hash = "sha256:0b03a8f1dcb1a03ad292647ee4930b3f9ba2bdb45e55bf2699d2c053f8e6531f", size = 28348, upload-time = "2025-05-09T19:52:00.147Z" }, ] +[[package]] +name = "distlib" +version = "0.3.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0d/dd/1bec4c5ddb504ca60fc29472f3d27e8d4da1257a854e1d96742f15c1d02d/distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403", size = 613923, upload-time = "2024-10-09T18:35:47.551Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87", size = 468973, upload-time = "2024-10-09T18:35:44.272Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -255,6 +273,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/b3/b51f09c2ba432a576fe63758bddc81f78f0c6309d9e5c10d194313bf021e/fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d", size = 95164, upload-time = "2025-03-23T22:55:42.101Z" }, ] +[[package]] +name = "filelock" +version = "3.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075, upload-time = "2025-03-14T07:11:40.47Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215, upload-time = "2025-03-14T07:11:39.145Z" }, +] + [[package]] name = "greenlet" version = "3.2.2" @@ -352,6 +379,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/dc/7decab5c404d1d2cdc1bb330b1bf70e83d6af0396fd4fc76fc60c0d522bf/httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8", size = 87682, upload-time = "2024-10-16T19:44:46.46Z" }, ] +[[package]] +name = "identify" +version = "2.6.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0c/83/b6ea0334e2e7327084a46aaaf71f2146fc061a192d6518c0d020120cd0aa/identify-2.6.10.tar.gz", hash = "sha256:45e92fd704f3da71cc3880036633f48b4b7265fd4de2b57627cb157216eb7eb8", size = 99201, upload-time = "2025-04-19T15:10:38.32Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/d3/85feeba1d097b81a44bcffa6a0beab7b4dfffe78e82fc54978d3ac380736/identify-2.6.10-py2.py3-none-any.whl", hash = "sha256:5f34248f54136beed1a7ba6a6b5c4b6cf21ff495aac7c359e1ef831ae3b8ab25", size = 99101, upload-time = "2025-04-19T15:10:36.701Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -464,6 +500,11 @@ dependencies = [ { name = "uvicorn", extra = ["standard"] }, ] +[package.dev-dependencies] +dev = [ + { name = "pre-commit" }, +] + [package.metadata] requires-dist = [ { name = "alembic", specifier = ">=1.15.2" }, @@ -480,6 +521,9 @@ requires-dist = [ { name = "uvicorn", extras = ["standard"], specifier = ">=0.34.2" }, ] +[package.metadata.requires-dev] +dev = [{ name = "pre-commit", specifier = ">=4.2.0" }] + [[package]] name = "networkx" version = "3.4.2" @@ -489,6 +533,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263, upload-time = "2024-10-21T12:39:36.247Z" }, ] +[[package]] +name = "nodeenv" +version = "1.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, +] + [[package]] name = "numpy" version = "2.2.5" @@ -551,6 +604,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/67/1175790323026d3337cc285cc9c50eca637d70472b5e622529df74bb8f37/numpy-2.2.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d2e3bdadaba0e040d1e7ab39db73e0afe2c74ae277f5614dad53eadbecbbb169", size = 12859001, upload-time = "2025-04-19T22:48:57.665Z" }, ] +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = 
"sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" }, +] + [[package]] name = "portalocker" version = "3.1.1" @@ -563,6 +625,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/60/1974cfdd5bb770568ddc6f89f3e0df4cfdd1acffd5a609dff5e95f48c6e2/portalocker-3.1.1-py3-none-any.whl", hash = "sha256:80e984e24de292ff258a5bea0e4f3f778fff84c0ae1275dbaebc4658de4aacb3", size = 19661, upload-time = "2024-12-31T14:22:47.019Z" }, ] +[[package]] +name = "pre-commit" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cfgv" }, + { name = "identify" }, + { name = "nodeenv" }, + { name = "pyyaml" }, + { name = "virtualenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/08/39/679ca9b26c7bb2999ff122d50faa301e49af82ca9c066ec061cfbc0c6784/pre_commit-4.2.0.tar.gz", hash = "sha256:601283b9757afd87d40c4c4a9b2b5de9637a8ea02eaff7adc2d0fb4e04841146", size = 193424, upload-time = "2025-03-18T21:35:20.987Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/74/a88bf1b1efeae488a0c0b7bdf71429c313722d1fc0f377537fbe554e6180/pre_commit-4.2.0-py2.py3-none-any.whl", hash = "sha256:a009ca7205f1eb497d10b845e52c838a98b6cdd2102a6c8e4540e94ee75c58bd", size = 220707, upload-time = "2025-03-18T21:35:19.343Z" }, +] + [[package]] name = "prompt-toolkit" version = "3.0.51" @@ -1029,6 +1107,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/03/ff/7c0c86c43b3cbb927e0ccc0255cb4057ceba4799cd44ae95174ce8e8b5b2/vine-5.1.0-py3-none-any.whl", hash = "sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc", size = 9636, upload-time = "2023-11-05T08:46:51.205Z" }, ] +[[package]] +name = "virtualenv" +version = "20.31.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock" }, + { name = "platformdirs" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/56/2c/444f465fb2c65f40c3a104fd0c495184c4f2336d65baf398e3c75d72ea94/virtualenv-20.31.2.tar.gz", hash = "sha256:e10c0a9d02835e592521be48b332b6caee6887f332c111aa79a09b9e79efc2af", size = 6076316, upload-time = "2025-05-08T17:58:23.811Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f3/40/b1c265d4b2b62b58576588510fc4d1fe60a86319c8de99fd8e9fec617d2c/virtualenv-20.31.2-py3-none-any.whl", hash = "sha256:36efd0d9650ee985f0cad72065001e66d49a6f24eb44d98980f630686243cf11", size = 6057982, upload-time = "2025-05-08T17:58:21.15Z" }, +] + [[package]] name = "watchfiles" version = "1.0.5" From aa1f585045f9ca33258b6757d431df431241c0c9 Mon Sep 17 00:00:00 2001 From: Ali Tavallaie Date: Tue, 13 May 2025 14:16:18 +0330 Subject: [PATCH 6/6] disabling ruff for contrib/,Older Experiments/,backend/data/,scripts/ --- .pre-commit-config.yaml | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f1c490b..3aa4e83 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,19 @@ repos: -- repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: v0.11.9 - hooks: - # Run the linter. - - id: ruff - types_or: [ python, pyi ] - args: [ --fix ] - # Run the formatter. - - id: ruff-format - types_or: [ python, pyi ] \ No newline at end of file + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.11.9 + hooks: + # Run the linter. + - id: ruff + types_or: [python, pyi] + + args: + [ + --fix, + --exclude, + "contrib/*,Older Experiments/,backend/data/,scripts/", + ] + # Run the formatter. + - id: ruff-format + types_or: [python, pyi] + args: [--exclude, "contrib/*,Older Experiments/,backend/data/,scripts/"]