From 8124907d68cec3670bed2ac10f434737a666f684 Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Thu, 6 Mar 2025 08:24:21 -0500
Subject: [PATCH 1/9] Changes for Python upgrade

---
 python/ArgosTranslation/.dockerignore         |  31 ++++
 python/ArgosTranslation/Dockerfile            | 165 ++++++++++--------
 python/ClipDetection/Dockerfile               |   2 +-
 python/FastTextLanguageDetection/Dockerfile   |   2 +-
 .../FastTextLanguageDetection/pyproject.toml  |   3 +-
 5 files changed, 131 insertions(+), 72 deletions(-)
 create mode 100644 python/ArgosTranslation/.dockerignore

diff --git a/python/ArgosTranslation/.dockerignore b/python/ArgosTranslation/.dockerignore
new file mode 100644
index 000000000..d3e5a40ce
--- /dev/null
+++ b/python/ArgosTranslation/.dockerignore
@@ -0,0 +1,31 @@
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2025 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+.dockerignore
+Dockerfile
+LICENSE
+README.md
+.vscode
diff --git a/python/ArgosTranslation/Dockerfile b/python/ArgosTranslation/Dockerfile
index d484b734a..246cbbf5b 100644
--- a/python/ArgosTranslation/Dockerfile
+++ b/python/ArgosTranslation/Dockerfile
@@ -1,69 +1,96 @@
-# syntax=docker/dockerfile:1.2
-
-#############################################################################
-# NOTICE                                                                    #
-#                                                                           #
-# This software (or technical data) was produced for the U.S. Government    #
-# under contract, and is subject to the Rights in Data-General Clause       #
-# 52.227-14, Alt. IV (DEC 2007).                                            #
-#                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
-#############################################################################
-
-#############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
-#                                                                           #
-# Licensed under the Apache License, Version 2.0 (the "License");           #
-# you may not use this file except in compliance with the License.          #
-# You may obtain a copy of the License at                                   #
-#                                                                           #
-#    http://www.apache.org/licenses/LICENSE-2.0                             #
-#                                                                           #
-# Unless required by applicable law or agreed to in writing, software       #
-# distributed under the License is distributed on an "AS IS" BASIS,         #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
-# See the License for the specific language governing permissions and       #
-# limitations under the License.                                            #
-#############################################################################
-
-ARG BUILD_REGISTRY
-ARG BUILD_TAG=latest
-
-FROM openmpf/openmpf_argos_translation_models:1.9.1-to-eng AS argos_models
-
-FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG}
-
-COPY ./setup_models/argos_model_installer.py /model_setup/argos_model_installer.py
-
-RUN pip3 install --no-cache-dir 'argostranslate==1.9.1'
-
-# To include a language model add *<ISO_LANG>_en* to the cp command.
-# Refer to README for list of supported languages
-RUN --mount=from=argos_models,source=/models,target=/all-models \
-    cd /all-models; \
-    mkdir /models; \
-    cp -- *de_en* *fr_en* *ru_en* *zh_en* *zt_en* *es_en* *ar_en* *ko_en* *fa_en* /models; \
-    python3 /model_setup/argos_model_installer.py /models; \
-    rm -r /model_setup /models;
-
-
-RUN argospm update
-
-# Another option for downloading language models is to add lines below for languages of interest:
-# RUN argospm install translate-<INPUT_ISO_LANG>_en
-# RUN argospm install translate-es_en
-
-# Please note that argospm may download models that require a newer version of argostranslate.
-
-ARG RUN_TESTS=false
-
-RUN --mount=target=.,readwrite \
-    install-component.sh; \
-    if [ "${RUN_TESTS,,}" == true ]; then python tests/test_argos_translation.py; fi
-
-LABEL org.label-schema.license="Apache 2.0" \
-      org.label-schema.name="OpenMPF Argos Translation" \
-      org.label-schema.schema-version="1.0" \
-      org.label-schema.url="https://openmpf.github.io" \
-      org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \
-      org.label-schema.vendor="MITRE"
+# syntax=docker/dockerfile:1.4
+
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2024 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+ARG BUILD_REGISTRY
+ARG BUILD_TAG=latest
+
+FROM openmpf/openmpf_argos_translation_models:1.9.1-to-eng AS argos_models
+
+
+
+FROM ${BUILD_REGISTRY}openmpf_python_component_build:${BUILD_TAG} AS build
+
+RUN --mount=type=tmpfs,target=/var/cache/apt \
+    --mount=type=tmpfs,target=/var/lib/apt/lists  \
+    --mount=type=tmpfs,target=/tmp \
+    apt-get update && apt-get install -y --no-install-recommends cmake pkg-config make
+
+
+RUN pip3 install --no-cache-dir 'argostranslate==1.9.1'
+
+RUN --mount=target=.,readwrite install-component.sh
+
+
+
+FROM ${BUILD_REGISTRY}openmpf_python_executor:${BUILD_TAG} AS setup_models
+
+COPY --from=build $COMPONENT_VIRTUALENV $COMPONENT_VIRTUALENV
+
+# To include a language model add *<ISO_LANG>_en* to the cp command.
+# Refer to README for list of supported languages
+RUN --mount=from=argos_models,source=/models,target=/all-models \
+    --mount=source=setup_models/argos_model_installer.py,target=/opt/argos_model_installer.py \
+    cd /all-models; \
+    mkdir /models; \
+    cp -- *de_en* *fr_en* *ru_en* *zh_en* *zt_en* *es_en* *ar_en* *ko_en* *fa_en* /models; \
+    python3 /opt/argos_model_installer.py /models; \
+    rm -r /models;
+
+RUN argospm update
+
+# Another option for downloading language models is to add lines below for languages of interest:
+# RUN argospm install translate-<INPUT_ISO_LANG>_en
+# RUN argospm install translate-es_en
+
+# Please note that argospm may download models that require a newer version of argostranslate.
+
+
+FROM build AS run_tests
+
+ARG RUN_TESTS=false
+RUN --mount=target=.,readwrite \
+    --mount=from=setup_models,source=/root/.local/share/argos-translate,target=/root/.local/share/argos-translate \
+<<eot
+    if [ "${RUN_TESTS,,}" == true ]; then
+        echo test
+        python tests/test_argos_translation.py
+    fi
+eot
+
+
+FROM setup_models
+
+COPY --from=run_tests $PLUGINS_DIR/ArgosTranslation $PLUGINS_DIR/ArgosTranslation
+
+
+LABEL org.label-schema.license="Apache 2.0" \
+      org.label-schema.name="OpenMPF Argos Translation" \
+      org.label-schema.schema-version="1.0" \
+      org.label-schema.url="https://openmpf.github.io" \
+      org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \
+      org.label-schema.vendor="MITRE"
diff --git a/python/ClipDetection/Dockerfile b/python/ClipDetection/Dockerfile
index 681b05d3c..dc835fc18 100644
--- a/python/ClipDetection/Dockerfile
+++ b/python/ClipDetection/Dockerfile
@@ -50,7 +50,7 @@ RUN --mount=type=tmpfs,target=/tmp \
         wget -O- 'https://github.com/openai/CLIP/tarball/master' \
             | tar --extract --gzip --directory /tmp/CLIP; \
         cd /tmp/CLIP/*; \
-        pip3 install . 'torchvision==0.14.1'
+        pip3 install .
 
 ARG RUN_TESTS=false
 
diff --git a/python/FastTextLanguageDetection/Dockerfile b/python/FastTextLanguageDetection/Dockerfile
index 9eb49f4ee..61e610206 100644
--- a/python/FastTextLanguageDetection/Dockerfile
+++ b/python/FastTextLanguageDetection/Dockerfile
@@ -30,7 +30,7 @@ ARG BUILD_REGISTRY
 ARG BUILD_TAG=latest
 FROM ${BUILD_REGISTRY}openmpf_python_component_build:${BUILD_TAG} AS build
 
-RUN pip install --no-cache-dir fasttext huggingface_hub
+RUN pip install --no-cache-dir fasttext huggingface_hub 'numpy<2'
 
 RUN python <<eot
 import huggingface_hub
diff --git a/python/FastTextLanguageDetection/pyproject.toml b/python/FastTextLanguageDetection/pyproject.toml
index a91fe72aa..0efe9c58e 100644
--- a/python/FastTextLanguageDetection/pyproject.toml
+++ b/python/FastTextLanguageDetection/pyproject.toml
@@ -34,7 +34,8 @@ version = "9.0"
 dependencies = [
     "mpf_component_api>=9.0",
     "fasttext",
-    "huggingface_hub"
+    "huggingface_hub",
+    "numpy<2"
 ]
 
 [project.entry-points."mpf.exported_component"]

From 18252a00cad96f21673e2a354d4bfede5152ffcb Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Thu, 6 Mar 2025 11:24:55 -0500
Subject: [PATCH 2/9] Fix nlp text correction

---
 python/NlpTextCorrection/Dockerfile           | 20 +++++++++++++++++--
 .../nlp_correction_component.py               |  5 ++---
 python/NlpTextCorrection/setup.cfg            |  2 +-
 .../tests/test_nlp_correction.py              |  4 ++--
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/python/NlpTextCorrection/Dockerfile b/python/NlpTextCorrection/Dockerfile
index 9c14737c2..f76055ccf 100644
--- a/python/NlpTextCorrection/Dockerfile
+++ b/python/NlpTextCorrection/Dockerfile
@@ -28,9 +28,16 @@
 
 ARG BUILD_REGISTRY
 ARG BUILD_TAG=latest
-FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG}
 
-RUN pip install --no-cache-dir --use-pep517 'cyhunspell>=2.0.2'
+FROM ${BUILD_REGISTRY}openmpf_python_component_build:${BUILD_TAG} AS build
+
+RUN --mount=type=tmpfs,target=/var/cache/apt \
+    --mount=type=tmpfs,target=/var/lib/apt/lists  \
+    --mount=type=tmpfs,target=/tmp \
+    apt-get update; \
+    apt-get install -y --no-install-recommends libhunspell-dev hunspell-en-us
+
+RUN pip install --no-cache-dir --use-pep517 'hunspell'
 
 ARG RUN_TESTS=false
 
@@ -38,6 +45,15 @@ RUN --mount=target=.,readwrite \
     install-component.sh; \
     if [ "${RUN_TESTS,,}" == true ]; then python tests/test_nlp_correction.py; fi
 
+
+
+FROM ${BUILD_REGISTRY}openmpf_python_executor:${BUILD_TAG}
+
+COPY --from=build $COMPONENT_VIRTUALENV $COMPONENT_VIRTUALENV
+
+COPY --from=build $PLUGINS_DIR/NlpTextCorrection $PLUGINS_DIR/NlpTextCorrection
+
+
 LABEL org.label-schema.license="Apache 2.0" \
       org.label-schema.name="OpenMPF NLP Text Correction" \
       org.label-schema.schema-version="1.0" \
diff --git a/python/NlpTextCorrection/nlp_correction_component/nlp_correction_component.py b/python/NlpTextCorrection/nlp_correction_component/nlp_correction_component.py
index d13fd9ed2..95b6a541c 100644
--- a/python/NlpTextCorrection/nlp_correction_component/nlp_correction_component.py
+++ b/python/NlpTextCorrection/nlp_correction_component/nlp_correction_component.py
@@ -26,7 +26,7 @@
 
 from typing import Iterable
 from typing import Mapping, Sequence
-from hunspell import Hunspell
+from hunspell import HunSpell
 
 import re
 import os
@@ -121,8 +121,7 @@ class HunspellWrapper(object):
     def __init__(self, job_properties: Mapping[str, str]):
         self._job_properties = job_properties
 
-        self._hunspell = Hunspell('en_US')
-        self._hunspell.clear_cache()
+        self._hunspell = HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
 
         self._unicode_error = False
 
diff --git a/python/NlpTextCorrection/setup.cfg b/python/NlpTextCorrection/setup.cfg
index ecf8e17d8..6734052de 100644
--- a/python/NlpTextCorrection/setup.cfg
+++ b/python/NlpTextCorrection/setup.cfg
@@ -33,7 +33,7 @@ packages = nlp_correction_component
 install_requires =
     mpf_component_api>=9.0
     mpf_component_util>=9.0
-    cyhunspell>=2.0.2
+    hunspell
 
 [options.entry_points]
 mpf.exported_component =
diff --git a/python/NlpTextCorrection/tests/test_nlp_correction.py b/python/NlpTextCorrection/tests/test_nlp_correction.py
index 293ced52c..9f00ed10b 100644
--- a/python/NlpTextCorrection/tests/test_nlp_correction.py
+++ b/python/NlpTextCorrection/tests/test_nlp_correction.py
@@ -207,7 +207,7 @@ def test_custom_acronym(self):
             feed_forward_track=None
         )
 
-        expected_text = "Hun spell doesn\'t recognize SQ or D.Q. as words. Other variations: D.Q, SQ., .D.Q.!"
+        expected_text = "Hunspell doesn\'t recognize SQ or D.Q. as words. Other variations: D.Q, SQ., .D.Q.!"
 
         results = list(NlpCorrectionComponent().get_detections_from_generic(job))
 
@@ -225,7 +225,7 @@ def test_custom_acronym(self):
             feed_forward_track=None
         )
 
-        expected_text = "Hun spell doesn\'t recognize DQ or D.Q. as words. Other variations: DQ, DQ., .D.Q.!"
+        expected_text = "Hunspell doesn\'t recognize DQ or D.Q. as words. Other variations: DQ, DQ., .D.Q.!"
 
         results = list(NlpCorrectionComponent().get_detections_from_generic(job_2))
 

From 86f2ebe079d826aab7fcfc84c2d8aa28ddfa389d Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Thu, 6 Mar 2025 11:43:03 -0500
Subject: [PATCH 3/9] Upgrade whisper

---
 python/WhisperSpeechDetection/Dockerfile | 2 +-
 python/WhisperSpeechDetection/setup.cfg  | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/WhisperSpeechDetection/Dockerfile b/python/WhisperSpeechDetection/Dockerfile
index 0fcc95b0d..5f49d436c 100644
--- a/python/WhisperSpeechDetection/Dockerfile
+++ b/python/WhisperSpeechDetection/Dockerfile
@@ -32,7 +32,7 @@ FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG}
 
 ARG RUN_TESTS=false
 
-RUN pip install --no-cache-dir openai-whisper==20230314 'numpy<1.24,>=1.18'
+RUN pip install --no-cache-dir openai-whisper==20240930
 
 RUN python -c 'import whisper; whisper.load_model("base")'
 RUN python -c 'import whisper; whisper.load_model("base.en")'
diff --git a/python/WhisperSpeechDetection/setup.cfg b/python/WhisperSpeechDetection/setup.cfg
index bc556c50d..d6745891d 100644
--- a/python/WhisperSpeechDetection/setup.cfg
+++ b/python/WhisperSpeechDetection/setup.cfg
@@ -33,8 +33,7 @@ packages = whisper_speech_detection_component
 install_requires =
     mpf_component_api>=9.0
     mpf_component_util>=9.0
-    openai-whisper==20230314
-    numpy<1.24,>=1.18
+    openai-whisper==20240930
 
 [options.entry_points]
 mpf.exported_component =

From b82ed12110d182684e39a54c16704482c15fd0c1 Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Thu, 6 Mar 2025 12:44:25 -0500
Subject: [PATCH 4/9] Fix nlp text correction

---
 python/NlpTextCorrection/Dockerfile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/NlpTextCorrection/Dockerfile b/python/NlpTextCorrection/Dockerfile
index f76055ccf..671742015 100644
--- a/python/NlpTextCorrection/Dockerfile
+++ b/python/NlpTextCorrection/Dockerfile
@@ -51,6 +51,10 @@ FROM ${BUILD_REGISTRY}openmpf_python_executor:${BUILD_TAG}
 
 COPY --from=build $COMPONENT_VIRTUALENV $COMPONENT_VIRTUALENV
 
+COPY --from=build /usr/lib/x86_64-linux-gnu/libhunspell-1.7.so.0 /usr/lib/x86_64-linux-gnu/
+
+COPY --from=build /usr/share/hunspell /usr/share/hunspell
+
 COPY --from=build $PLUGINS_DIR/NlpTextCorrection $PLUGINS_DIR/NlpTextCorrection
 
 

From 67ffb5affe1cc541661cc3112738c7933891a1d5 Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Tue, 11 Mar 2025 14:16:32 -0400
Subject: [PATCH 5/9] Fix east

---
 .../east_component/east_processor.py               | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/EastTextDetection/east_component/east_processor.py b/python/EastTextDetection/east_component/east_processor.py
index a0b9cb6c9..b809c6c1b 100644
--- a/python/EastTextDetection/east_component/east_processor.py
+++ b/python/EastTextDetection/east_component/east_processor.py
@@ -24,14 +24,11 @@
 # limitations under the License.                                            #
 #############################################################################
 
-import os
+import importlib.resources
 
 import mpf_component_api as mpf
 from .bbox_utils import *
 
-# The path to the serialized EAST model file.
-from pkg_resources import resource_filename
-_model_filename = os.path.realpath(resource_filename(__name__, 'east_resnet50.pb'))
 
 # The output layer names for the EAST model. Respectively: the layer
 # corresponding to the bounding box geometry, and the layer corresponding to
@@ -105,10 +102,10 @@ def load_model(self, frame_width, frame_height, max_side_len, rotate_on,
         use_cached = (old_params == new_params)
 
         if not use_cached:
-            self._model = cv2.dnn.readNetFromTensorflow(_model_filename)
+            self._model = read_net()
 
         if not use_cached or (rotate_on and not self._rotate_on):
-            self._model_90 = cv2.dnn.readNetFromTensorflow(_model_filename)
+            self._model_90 = read_net()
 
         self._rotate_on = rotate_on
 
@@ -396,3 +393,8 @@ def process_frames(self, frames, temp_padding_x, temp_padding_y,
                 image_locs.append([])
 
         return image_locs
+
+def read_net():  # Load a fresh EAST network from the model file bundled with this package.
+    ref = importlib.resources.files(__spec__.parent) / 'east_resnet50.pb'  # anchor on the package: files() rejects non-package modules before Python 3.12
+    with importlib.resources.as_file(ref) as path:
+        return cv2.dnn.readNetFromTensorflow(str(path))

From d3d2d1bc66de0fd5b71abfef3384eccd5ae6ffd3 Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Fri, 15 Aug 2025 07:45:29 -0400
Subject: [PATCH 6/9] Fix spacy version in AzureTranslation

---
 python/AzureTranslation/setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/AzureTranslation/setup.cfg b/python/AzureTranslation/setup.cfg
index 841d2a888..995898772 100644
--- a/python/AzureTranslation/setup.cfg
+++ b/python/AzureTranslation/setup.cfg
@@ -34,7 +34,7 @@ install_requires =
     mpf_component_api>=9.0
     mpf_component_util>=9.0
     langcodes
-    spacy>=3.7.4
+    spacy>=3.7.4,<3.7.6
     wtpsplit>=1.3.0
 
 [options.entry_points]

From 91cfee6c9c7933bc2939bf9e24315b983e5782ba Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Fri, 15 Aug 2025 08:14:07 -0400
Subject: [PATCH 7/9] Change assertEquals to assertEqual

---
 .../tests/test_llama_video_summarization.py   | 100 +++++++++---------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py b/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py
index 78b84079e..0b2de0ca5 100644
--- a/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py
+++ b/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py
@@ -196,23 +196,23 @@ def setUp(self):
         child_process_send_job_patcher = mock.patch('llama_video_summarization_component.ChildProcess.send_job_get_response')
         self.mock_child_process_send_job = child_process_send_job_patcher.start()
         self.addCleanup(child_process_send_job_patcher.stop)
-        
+
 
     def run_patched_job(self, component, job, response):
         if USE_MOCKS:
             self.mock_child_process_send_job.return_value = response
 
         return component.get_detections_from_video(job)
-    
 
-    def assert_detection_region(self, detection, frame_width, frame_height):    
-        self.assertEquals(0, detection.x_left_upper)
-        self.assertEquals(0, detection.y_left_upper)
-        self.assertEquals(frame_width, detection.width)
-        self.assertEquals(frame_height, detection.height)
+
+    def assert_detection_region(self, detection, frame_width, frame_height):
+        self.assertEqual(0, detection.x_left_upper)
+        self.assertEqual(0, detection.y_left_upper)
+        self.assertEqual(frame_width, detection.width)
+        self.assertEqual(frame_height, detection.height)
 
 
-    def assert_first_middle_last_detections(self, track, frame_width, frame_height):    
+    def assert_first_middle_last_detections(self, track, frame_width, frame_height):
         self.assertIn(track.start_frame, track.frame_locations)
         self.assert_detection_region(track.frame_locations[track.start_frame], frame_width, frame_height)
 
@@ -232,22 +232,22 @@ def test_multiple_videos(self):
         frame_height = int(job.media_properties['FRAME_HEIGHT'])
 
         results = self.run_patched_job(component, job, json.dumps(CAT_TIMELINE))
-        self.assertEquals(3, len(results))
+        self.assertEqual(3, len(results))
 
-        self.assertEquals('TRUE', results[0].detection_properties['SEGMENT SUMMARY'])
+        self.assertEqual('TRUE', results[0].detection_properties['SEGMENT SUMMARY'])
         self.assertIn("looking around as people walk by.", results[0].detection_properties["TEXT"])
-        self.assertEquals(0, results[0].start_frame)
-        self.assertEquals(171, results[0].stop_frame)
+        self.assertEqual(0, results[0].start_frame)
+        self.assertEqual(171, results[0].stop_frame)
         self.assert_first_middle_last_detections(results[0], frame_width, frame_height)
 
         self.assertIn("looking around.", results[1].detection_properties["TEXT"])
-        self.assertEquals(0, results[1].start_frame) # 0 * 25
-        self.assertEquals(121, results[1].stop_frame) # (4.9 * 25) - 1
+        self.assertEqual(0, results[1].start_frame) # 0 * 25
+        self.assertEqual(121, results[1].stop_frame) # (4.9 * 25) - 1
         self.assert_first_middle_last_detections(results[1], frame_width, frame_height)
 
         self.assertIn("looks back at the camera", results[2].detection_properties["TEXT"])
-        self.assertEquals(125, results[2].start_frame) # 5.0 * 25
-        self.assertEquals(169, results[2].stop_frame) # (6.8 * 25) - 1
+        self.assertEqual(125, results[2].start_frame) # 5.0 * 25
+        self.assertEqual(169, results[2].stop_frame) # (6.8 * 25) - 1
         self.assert_first_middle_last_detections(results[2], frame_width, frame_height)
 
 
@@ -256,17 +256,17 @@ def test_multiple_videos(self):
         frame_height = int(job.media_properties['FRAME_HEIGHT'])
 
         results = self.run_patched_job(component, job, json.dumps(DOG_TIMELINE))
-        self.assertEquals(2, len(results))
+        self.assertEqual(2, len(results))
 
-        self.assertEquals('TRUE', results[0].detection_properties['SEGMENT SUMMARY'])
+        self.assertEqual('TRUE', results[0].detection_properties['SEGMENT SUMMARY'])
         self.assertIn("sitting by a window and looking around", results[0].detection_properties["TEXT"])
-        self.assertEquals(0, results[0].start_frame)
-        self.assertEquals(153, results[0].stop_frame)
+        self.assertEqual(0, results[0].start_frame)
+        self.assertEqual(153, results[0].stop_frame)
         self.assert_first_middle_last_detections(results[0], frame_width, frame_height)
 
         self.assertIn("sitting by the window.", results[1].detection_properties["TEXT"])
-        self.assertEquals(0, results[1].start_frame) # 0 * 25
-        self.assertEquals(152, results[1].stop_frame) # (6.12 * 25) - 1
+        self.assertEqual(0, results[1].start_frame) # 0 * 25
+        self.assertEqual(152, results[1].stop_frame) # (6.12 * 25) - 1
         self.assert_first_middle_last_detections(results[1], frame_width, frame_height)
 
 
@@ -275,17 +275,17 @@ def test_multiple_videos(self):
         frame_height = int(job.media_properties['FRAME_HEIGHT'])
 
         results = self.run_patched_job(component, job, json.dumps(SHORT_TIMELINE))
-        self.assertEquals(2, len(results))
+        self.assertEqual(2, len(results))
 
-        self.assertEquals('TRUE', results[0].detection_properties['SEGMENT SUMMARY'])
+        self.assertEqual('TRUE', results[0].detection_properties['SEGMENT SUMMARY'])
         self.assertIn("A person is running around.", results[0].detection_properties["TEXT"])
-        self.assertEquals(0, results[0].start_frame)
-        self.assertEquals(0, results[0].stop_frame)
+        self.assertEqual(0, results[0].start_frame)
+        self.assertEqual(0, results[0].stop_frame)
         self.assert_first_middle_last_detections(results[0], frame_width, frame_height)
 
         self.assertIn("A person running.", results[1].detection_properties["TEXT"])
-        self.assertEquals(0, results[1].start_frame) # 0 * 1
-        self.assertEquals(0, results[1].stop_frame) # (1 * 1) - 1
+        self.assertEqual(0, results[1].start_frame) # 0 * 1
+        self.assertEqual(0, results[1].stop_frame) # (1 * 1) - 1
         self.assert_first_middle_last_detections(results[1], frame_width, frame_height)
 
 
@@ -293,11 +293,11 @@ def test_invalid_timeline(self):
         component = LlamaVideoSummarizationComponent()
 
         job = mpf.VideoJob('cat job', str(TEST_DATA / 'cat.mp4'), 0, 15000,
-            { 
+            {
                 "GENERATION_MAX_ATTEMPTS" : "1"
-            }, 
+            },
             CAT_VIDEO_PROPERTIES, {})
-        
+
         with self.assertRaises(mpf.DetectionException) as cm:
             self.run_patched_job(component, job, json.dumps(
             {
@@ -309,13 +309,13 @@ def test_invalid_timeline(self):
         self.assertIn("Max timeline event end time not close enough to segment stop time.", str(cm.exception))
 
         # test disabling time check
-        job = mpf.VideoJob('cat job', str(TEST_DATA / 'cat.mp4'), 0, 15000, 
+        job = mpf.VideoJob('cat job', str(TEST_DATA / 'cat.mp4'), 0, 15000,
             {
                 "GENERATION_MAX_ATTEMPTS" : "1",
                 "TIMELINE_CHECK_TARGET_THRESHOLD" : "-1"
             },
             CAT_VIDEO_PROPERTIES, {})
-        
+
         results = self.run_patched_job(component, job, json.dumps(
         {
             "video_summary": "This is a video of a cat.",
@@ -336,7 +336,7 @@ def test_invalid_json_response(self):
 
         with self.assertRaises(mpf.DetectionException) as cm:
             self.run_patched_job(component, job, "garbage xyz") # don't care about results
-        
+
         self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code)
         self.assertIn("not valid JSON", str(cm.exception))
 
@@ -389,26 +389,26 @@ def test_timeline_integrity(self):
         # event that starts within range but ends outside of valid frames
         DRONE_TIMELINE_SEGMENT_1["video_event_timeline"][2]["timestamp_end"] = 185.0
         job1_results = self.run_patched_job(component, job1, json.dumps(DRONE_TIMELINE_SEGMENT_1))
-        self.assertEquals(6, len(job1_results))
+        self.assertEqual(6, len(job1_results))
 
         self.assertIn('SEGMENT SUMMARY', job1_results[0].detection_properties)
         for track in job1_results:
-            self.assertEquals('0-5393', track.detection_properties['SEGMENT ID'])
+            self.assertEqual('0-5393', track.detection_properties['SEGMENT ID'])
             self.assertGreaterEqual(track.start_frame, 0)
             self.assertLessEqual(track.stop_frame, 5393)
 
-        self.assertEquals(1962, job1_results[3].start_frame)
-        self.assertEquals(5393, job1_results[3].stop_frame)
+        self.assertEqual(1962, job1_results[3].start_frame)
+        self.assertEqual(5393, job1_results[3].stop_frame)
         self.assertIsNotNone(job1_results[3].frame_locations[1962])
         self.assertIsNotNone(job1_results[3].frame_locations[3752])
         self.assertIsNotNone(job1_results[3].frame_locations[5393])
 
-        self.assertEquals(5393, job1_results[4].start_frame)
-        self.assertEquals(5393, job1_results[4].stop_frame)
+        self.assertEqual(5393, job1_results[4].start_frame)
+        self.assertEqual(5393, job1_results[4].stop_frame)
         self.assertIsNotNone(job1_results[4].frame_locations[5393])
 
-        self.assertEquals(5393, job1_results[5].start_frame)
-        self.assertEquals(5392, job1_results[5].stop_frame) # 179.96 < 179.9798
+        self.assertEqual(5393, job1_results[5].start_frame)
+        self.assertEqual(5392, job1_results[5].stop_frame) # 179.96 < 179.9798
         self.assertIsNotNone(job1_results[5].frame_locations[5393])
 
         job2 = mpf.VideoJob(
@@ -425,7 +425,7 @@ def test_timeline_integrity(self):
             ),
             media_properties=DRONE_VIDEO_PROPERTIES,
             feed_forward_track=None)
-        
+
         with self.assertRaises(mpf.DetectionException) as cm:
             self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2))
 
@@ -484,26 +484,26 @@ def test_timeline_integrity(self):
 
         DRONE_TIMELINE_SEGMENT_2['video_event_timeline'][1]["timestamp_end"] = 298.46
         job2_results = self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2))
-        self.assertEquals(3, len(job2_results))
+        self.assertEqual(3, len(job2_results))
 
         self.assertIn('SEGMENT SUMMARY', job2_results[0].detection_properties)
         for track in job2_results:
-            self.assertEquals('5394-8989', track.detection_properties['SEGMENT ID'])
+            self.assertEqual('5394-8989', track.detection_properties['SEGMENT ID'])
             self.assertGreaterEqual(track.start_frame, 5394)
             self.assertLessEqual(track.stop_frame, 8989)
 
-        self.assertEquals(5394, job2_results[1].start_frame)
-        self.assertEquals(6490, job2_results[1].stop_frame)
+        self.assertEqual(5394, job2_results[1].start_frame)
+        self.assertEqual(6490, job2_results[1].stop_frame)
         self.assertIsNotNone(job2_results[1].frame_locations[5394])
         self.assertIsNotNone(job2_results[1].frame_locations[5942])
         self.assertIsNotNone(job2_results[1].frame_locations[6490])
 
-        self.assertEquals(6524, job2_results[2].start_frame)
-        self.assertEquals(8943, job2_results[2].stop_frame)
+        self.assertEqual(6524, job2_results[2].start_frame)
+        self.assertEqual(8943, job2_results[2].stop_frame)
         self.assertIsNotNone(job2_results[2].frame_locations[6524])
         self.assertIsNotNone(job2_results[2].frame_locations[7733])
         self.assertIsNotNone(job2_results[2].frame_locations[8943])
 
 
 if __name__ == "__main__":
-    unittest.main(verbosity=2)
\ No newline at end of file
+    unittest.main(verbosity=2)

From be94abc57bba9338d3f728c855f8c47ff5ad4367 Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Mon, 18 Aug 2025 10:10:35 -0400
Subject: [PATCH 8/9] Remove pkg_resources

---
 .../clip_component/clip_component.py          | 103 +++++++++++-------
 .../transformer_tagging_component.py          |  80 ++++++++------
 2 files changed, 111 insertions(+), 72 deletions(-)

diff --git a/python/ClipDetection/clip_component/clip_component.py b/python/ClipDetection/clip_component/clip_component.py
index 78d781ca8..dfba78b5c 100644
--- a/python/ClipDetection/clip_component/clip_component.py
+++ b/python/ClipDetection/clip_component/clip_component.py
@@ -27,9 +27,10 @@
 import logging
 import os
 import csv
-from pkg_resources import resource_filename
+import functools
+import importlib.resources
 from itertools import islice
-from typing import Iterable, Mapping
+from typing import IO, Iterable, Mapping, NamedTuple
 
 from PIL import Image
 import cv2
@@ -106,7 +107,7 @@ def get_detections_from_image_reader(self, image_job, image_reader):
                 yield detection
                 num_detections += 1
             logger.info(f"Job complete. Found {num_detections} detections.")
-        
+
         except Exception as e:
             logger.exception(f'Job failed due to: {e}')
             raise
@@ -119,7 +120,7 @@ def _batches_from_video_capture(video_capture, batch_size):
             if len(frames) >= batch_size:
                 yield len(frames), np.stack(frames)
                 frames = []
-        
+
         if len(frames):
             padded = np.pad(
                 array=np.stack(frames),
@@ -134,10 +135,10 @@ def get_detections_from_video_capture(self,
                                           video_capture: mpf_util.VideoCapture) -> Iterable[mpf.VideoTrack]:
         logger.info("Received video job: %s", video_job)
         kwargs = self._parse_properties(video_job.job_properties)
-        
+
         # If processing a video where each frame is cropped into 144 images, the batch size is set to one so that the crops aren't split between batches
         batch_size = 1 if kwargs['enable_cropping'] else kwargs['batch_size']
-        
+
         batch_gen = self._batches_from_video_capture(video_capture, batch_size)
         detections = []
         wrapper = self._get_model_wrapper(kwargs['model_name'])
@@ -148,7 +149,7 @@ def get_detections_from_video_capture(self,
             except Exception as e:
                 logger.exception(f"Job failed due to: {e}")
                 raise
-        
+
         tracks = create_tracks(detections)
         logger.info(f"Job complete. Found {len(tracks)} tracks.")
         return tracks
@@ -159,8 +160,18 @@ def _get_model_wrapper(self, model_name):
 
         return self._model_wrappers[model_name]
 
+
+def get_model_cache_size():
+    """Return MODEL_CACHE_SIZE from the environment as an int, or 4 when it is unset, empty, or not an int."""
+    configured = os.getenv('MODEL_CACHE_SIZE')
+    try:
+        return int(configured) if configured else 4
+    except ValueError:
+        return 4
+
+@functools.lru_cache(get_model_cache_size())
 class ClipWrapper(object):
-    def __init__(self, model_name='ViT-L/14'):
+    def __init__(self, model_name):
         logger.info("Loading model...")
         model, _ = clip.load(model_name, device=device, download_root='/models')
         logger.info("Model loaded.")
@@ -169,9 +180,8 @@ def __init__(self, model_name='ViT-L/14'):
         self._preprocessor = None
         self._input_resolution = self._model.visual.input_resolution
 
-        self._classification_path = ''
+        self._class_list_source: ClassListSource | None = None
         self._template_path = ''
-        self._classification_list = ''
 
         self._templates = None
         self._template_type = None
@@ -210,7 +220,7 @@ def get_detections(self, images, **kwargs) -> Iterable[mpf.ImageLocation]:
 
         if kwargs['enable_cropping']:
             similarity = torch.mean(similarity, 0).unsqueeze(0)
-        
+
         values, indices = similarity.topk(len(self._class_mapping))
 
         for detection_values, detection_indices, image_size in zip(values, indices, image_sizes):
@@ -225,16 +235,16 @@ def get_detections(self, images, **kwargs) -> Iterable[mpf.ImageLocation]:
                     classification_list.append(class_name)
                     classification_confidence_list.append(str(value.item()))
                     count += 1
-            
+
             classification_list = '; '.join(classification_list)
             classification_confidence_list = '; '.join(classification_confidence_list)
-            
+
             detection_properties = {
                 "CLASSIFICATION": classification_list.split('; ')[0],
                 "CLASSIFICATION CONFIDENCE LIST": classification_confidence_list,
                 "CLASSIFICATION LIST": classification_list
             }
-            
+
             if kwargs['include_features']:
                 detection_properties['FEATURE'] = base64.b64encode(image_features.cpu().numpy()).decode()
 
@@ -256,7 +266,7 @@ def _check_template_list(self, template_path: str, template_type: str) -> bool:
                 )
             elif self._template_path != template_path:
                 self._template_path = template_path
-            
+
                 try:
                     logger.info("Updating templates...")
                     self._templates = self._get_templates_from_file(template_path)
@@ -274,10 +284,11 @@ def _check_template_list(self, template_path: str, template_type: str) -> bool:
                 template_filename = 'seven_templates.txt'
             elif template_type == 'openai_1':
                 template_filename = 'one_template.txt'
-            
-            template_path = os.path.realpath(resource_filename(__name__, 'data/' + template_filename))
+
             logger.info("Updating templates...")
-            self._templates = self._get_templates_from_file(template_path)
+            template_resource = importlib.resources.files(__name__) / 'data' / template_filename
+            with importlib.resources.as_file(template_resource) as template_resource_path:
+                self._templates = self._get_templates_from_file(str(template_resource_path))
             self._template_type = template_type
             logger.info("Templates updated.")
             return True
@@ -290,17 +301,20 @@ def _check_class_list(self, classification_path: str, classification_list: str,
                     f"The path {classification_path} is not valid",
                     mpf.DetectionError.COULD_NOT_OPEN_DATAFILE
                 )
+            list_source = ClassListSource(classification_list, classification_path, False)
         else:
-            if self._classification_list != classification_list.lower():
-                self._classification_list = classification_list.lower()
-            classification_path = os.path.realpath(resource_filename(__name__, f'data/{self._classification_list}_classification_list.csv'))
-        
-        if self._classification_path != classification_path or templates_changed:
-            self._classification_path = classification_path
+            classification_list = classification_list.lower()
+            list_source = ClassListSource(
+                    classification_list,
+                    f'data/{classification_list}_classification_list.csv',
+                    True)
+
+        if list_source != self._class_list_source or templates_changed:
+            self._class_list_source = list_source
 
             try:
                 logger.info("Updating classifications...")
-                self._class_mapping = self._get_mapping_from_classifications(classification_path)
+                self._class_mapping = self._get_mapping_from_classifications(list_source)
                 logger.info("Classifications updated.")
             except Exception:
                 raise mpf.DetectionException(
@@ -321,15 +335,15 @@ def _check_class_list(self, classification_path: str, classification_list: str,
                     text_features.append(text_embedding)
                 self._text_features = torch.stack(text_features, dim=1).float().to(device)
                 logger.info("Text embeddings created.")
-    
+
     @staticmethod
-    def _get_mapping_from_classifications(classification_path: str) -> Mapping[str, str]:
-        with open(classification_path) as csvfile:
+    def _get_mapping_from_classifications(class_list_source: 'ClassListSource') -> Mapping[str, str]:
+        with class_list_source.open() as csvfile:
             mapping = {}
             csvreader = csv.reader(csvfile)
             for row in csvreader:
                 mapping[row[0].strip()] = row[1].strip()
-                
+
         return mapping
 
     @staticmethod
@@ -337,6 +351,19 @@ def _get_templates_from_file(template_path: str) -> Iterable[str]:
         with open(template_path) as f:
             return [line.strip() for line in f.readlines()]
 
+
+class ClassListSource(NamedTuple):
+    """Comparable description of where a classification list is read from."""
+    list_type: str  # classification list name this source was built for
+    file_name: str  # file system path, or a resource path under this module when is_resource is set
+    is_resource: bool  # True when file_name is opened through importlib.resources
+
+    def open(self) -> IO[str]:
+        if not self.is_resource:
+            return open(self.file_name)
+        return importlib.resources.files(__name__).joinpath(self.file_name).open()
+
+
 class CLIPInferencingServer(object):
     '''
     Class that handles Triton inferencing if enabled.
@@ -357,7 +384,7 @@ def __init__(self, triton_server: str, model_name: str = 'ViT-L/14'):
         except InferenceServerException as e:
             logger.exception("Client creation failed.")
             raise
-        
+
         # Check if triton server is alive and ready
         self._check_triton_server()
 
@@ -380,7 +407,7 @@ def get_model_name(self) -> str:
     def _parse_model(self, model_metadata) -> None:
         input_metadata = model_metadata.inputs[0]
         output_metadata = model_metadata.outputs[0]
-        
+
         self._input_name = input_metadata.name
         self._output_name = output_metadata.name
         self._dtype = input_metadata.datatype
@@ -396,7 +423,7 @@ def _get_inputs_outputs(self, images):
     def get_responses(self, images):
         images = np.array(images.cpu())
         images = images.astype(triton_to_np_dtype(self._dtype))
-    
+
         responses = []
         try:
             for inputs, outputs in self._get_inputs_outputs(images):
@@ -406,13 +433,13 @@ def get_responses(self, images):
                 f"Inference failed: {e}",
                 mpf.DetectionError.NETWORK_ERROR
             )
-        
+
         results = []
         for response in responses:
-            result = response.as_numpy(self._output_name)       
+            result = response.as_numpy(self._output_name)
             results.append(result)
-        return results   
-    
+        return results
+
     def _check_triton_server(self) -> None:
         try:
             if not self._triton_client.is_server_live():
@@ -449,7 +476,7 @@ def __init__(self, enable_cropping: bool, image_size: int):
             self.preprocess = self.crop
         else:
             self.preprocess = self.resize_pad
-    
+
     def crop(self, image):
         return T.Compose([
             self._resize_images,
@@ -471,7 +498,7 @@ def resize_pad(self, image):
             top = (self.image_size - new_h) // 2
             bottom = (self.image_size + 1 - new_h) // 2
             padding = (0, top, 0, bottom)
-        
+
         new_img = T.Compose([
             T.Resize(size=(new_h, new_w)),
             T.Pad(padding=padding, padding_mode='edge'),
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index f6fbc8ce7..5ff1084a3 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -25,6 +25,9 @@
 #############################################################################
 
 import logging
+import os
+import functools
+import importlib.resources
 
 import mpf_component_api as mpf
 import mpf_component_util as mpf_util
@@ -35,7 +38,6 @@
 import os
 import time
 
-from pkg_resources import resource_filename
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 import pandas as pd
 
@@ -45,7 +47,12 @@ class TransformerTaggingComponent:
 
     def __init__(self):
         self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
-        self._cached_corpuses: Dict[str, Corpus] = {}
+
+        @functools.lru_cache(get_corpus_cache_size())
+        def get_corpus(file_name):
+            return Corpus(file_name, self._cached_model)
+
+        self._get_corpus = get_corpus
 
 
     def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]:
@@ -89,7 +96,7 @@ def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.Gener
             }
 
             config = JobConfig(new_job_props)
-            corpus = self._get_corpus(config.corpus_path)
+            corpus = self._get_corpus(config.corpus_file_name)
             self._add_tags(config, corpus, new_ff_props)
 
             return [ff_track]
@@ -103,7 +110,7 @@ def _get_feed_forward_detections(self, job, job_feed_forward, video_job=False):
                     ' jobs, but no feed forward track provided. ')
 
             config = JobConfig(job.job_properties)
-            corpus = self._get_corpus(config.corpus_path)
+            corpus = self._get_corpus(config.corpus_file_name)
 
             self._add_tags(config, corpus, job_feed_forward.detection_properties)
 
@@ -119,13 +126,6 @@ def _get_feed_forward_detections(self, job, job_feed_forward, video_job=False):
             raise
 
 
-    def _get_corpus(self, corpus_path):
-        if not corpus_path in self._cached_corpuses:
-            self._cached_corpuses[corpus_path] = Corpus(corpus_path, self._cached_model)
-
-        return self._cached_corpuses[corpus_path]
-
-
     def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
         input_texts = {}
         for prop_to_tag in config.props_to_process:
@@ -137,7 +137,7 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
             logger.warning("Feed forward element missing one of the following properties: "
                            + ", ".join(config.props_to_process))
             return
-            
+
         for prop_to_tag, input_text in input_texts.items():
             self._add_tags_for_prop(config, corpus, ff_props, prop_to_tag, input_text)
 
@@ -161,10 +161,10 @@ def _add_tags_for_prop(self, config, corpus, ff_props: Dict[str, str], prop_to_t
             for probe in probe_list:
                 # strip probe of leading and trailing whitespace
                 stripped_probe = probe.strip()
-                
+
                 # determine probe character offsets
                 num_leading_chars = len(probe) - len(probe.lstrip())
-                offset_start = offset_counter + num_leading_chars 
+                offset_start = offset_counter + num_leading_chars
                 offset_end = offset_start + len(stripped_probe) - 1
 
                 # set character offset counter for next iteration
@@ -210,7 +210,7 @@ def _add_tags_for_prop(self, config, corpus, ff_props: Dict[str, str], prop_to_t
 
         # create detection properties for each tag found in the text
         # detection properties formatted as <input property> <tag> TRIGGER SENTENCES...
-        for tag in new_tags: 
+        for tag in new_tags:
             tag_df = all_tag_results[all_tag_results["tag"] == tag]
 
             sents = []
@@ -243,15 +243,41 @@ def _add_tags_for_prop(self, config, corpus, ff_props: Dict[str, str], prop_to_t
                 ff_props[prop_name_matches] = "; ".join(matches)
 
 
-class Corpus:
-    def __init__(self, corpus_path, model):
-        self.json = pd.read_json(corpus_path)
+def get_corpus_cache_size():
+    """Return CORPUS_CACHE_SIZE from the environment as an int, or 4 when it is unset, empty, or not an int."""
+    configured = os.getenv('CORPUS_CACHE_SIZE')
+    try:
+        return int(configured) if configured else 4
+    except ValueError:
+        return 4
 
+class Corpus:
+    def __init__(self, corpus_file_name: str, model: SentenceTransformer):
+        self.json = self._load_json(corpus_file_name)
         start = time.time()
-        self.embed= model.encode(self.json["text"], convert_to_tensor=True, show_progress_bar=False)
+        self.embed = model.encode(self.json["text"], convert_to_tensor=True, show_progress_bar=False)
         elapsed = time.time() - start
         logger.info(f"Successfully encoded corpus in {elapsed} seconds.")
 
+    @staticmethod
+    def _load_json(corpus_file_name: str) -> pd.DataFrame:
+        """Read the corpus JSON as UTF-8: a name without '$' or '/' is a bundled resource, else a path."""
+        try:
+            if '$' not in corpus_file_name and '/' not in corpus_file_name:
+                resource = importlib.resources.files(__name__) / corpus_file_name
+                with resource.open(encoding='utf-8') as f:
+                    return pd.read_json(f)
+            else:
+                path = os.path.expandvars(corpus_file_name)
+                with open(path, encoding='utf-8') as f:  # locale default could mis-decode the corpus
+                    return pd.read_json(f)
+        except FileNotFoundError as e:  # reported to the caller as COULD_NOT_READ_DATAFILE
+            logger.exception('Failed to complete job due incorrect file path for the '
+                             f'transformer tagging corpus: "{corpus_file_name}"')
+            raise mpf.DetectionException(
+                f'Invalid path provided for transformer tagging corpus: "{corpus_file_name}"',
+                mpf.DetectionError.COULD_NOT_READ_DATAFILE) from e
+
 
 class JobConfig:
     def __init__(self, props: Mapping[str, str]):
@@ -274,19 +300,5 @@ def __init__(self, props: Mapping[str, str]):
         # if split on newline is true will split input on newline and carriage returns
         self.split_on_newline = mpf_util.get_property(props, 'ENABLE_NEWLINE_SPLIT', False)
 
-        self.corpus_file = \
+        self.corpus_file_name = \
             mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json")
-
-        self.corpus_path = ""
-        if "$" not in self.corpus_file and "/" not in self.corpus_file:
-            self.corpus_path = os.path.realpath(resource_filename(__name__, self.corpus_file))
-        else:
-            self.corpus_path = os.path.expandvars(self.corpus_file)
-
-        if not os.path.exists(self.corpus_path):
-            logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: '
-                             f'"{self.corpus_file}"')
-            raise mpf.DetectionException(
-                'Invalid path provided for transformer tagging corpus: '
-                f'"{self.corpus_file}"',
-                mpf.DetectionError.COULD_NOT_READ_DATAFILE)

From d495cf73266f889cf71d37a639e83942a5e1d578 Mon Sep 17 00:00:00 2001
From: Brian Rosenberg <brosenberg@mitre.org>
Date: Mon, 18 Aug 2025 12:30:56 -0400
Subject: [PATCH 9/9] Remove debug line

---
 python/ArgosTranslation/Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/ArgosTranslation/Dockerfile b/python/ArgosTranslation/Dockerfile
index 246cbbf5b..63f1ecfd7 100644
--- a/python/ArgosTranslation/Dockerfile
+++ b/python/ArgosTranslation/Dockerfile
@@ -77,7 +77,6 @@ RUN --mount=target=.,readwrite \
     --mount=from=setup_models,source=/root/.local/share/argos-translate,target=/root/.local/share/argos-translate \
 <<eot
     if [ "${RUN_TESTS,,}" == true ]; then
-        echo test
         python tests/test_argos_translation.py
     fi
 eot