From dd4bdcdcaea352de80d4f4d0b23e122f315f8859 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Tue, 5 Aug 2025 02:43:22 -0700
Subject: [PATCH 01/10] CharacterHistogramLangValidator processor
 implementation

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 docs/src/sdp/api.rst                          |   3 +
 sdp/processors/__init__.py                    |   1 +
 .../modify_manifest/data_to_data.py           | 165 ++++++++++++++++++
 3 files changed, 169 insertions(+)

diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 9bbbc42e..c6c7ff01 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -255,6 +255,9 @@ Data modifications
 .. autodata:: sdp.processors.EstimateBandwidth
    :annotation:
 
+.. autodata:: sdp.processors.CharacterHistogramLangValidator
+   :annotation:
+
 Data filtering
 ''''''''''''''
 
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index e8ce45c3..d5deaaed 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -119,6 +119,7 @@
     ListToEntries,
     LambdaExpression,
     EstimateBandwidth,
+    CharacterHistogramLangValidator,
 )
 from sdp.processors.modify_manifest.data_to_dropbool import (
     DropASRError,
diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py
index 09a55011..88405d2c 100644
--- a/sdp/processors/modify_manifest/data_to_data.py
+++ b/sdp/processors/modify_manifest/data_to_data.py
@@ -16,6 +16,12 @@
 import os
 import re
 from typing import Dict, List, Optional
+import tempfile
+import shutil
+import requests
+import wget
+import tarfile
+from glob import glob
 
 import soundfile
 import torchaudio
@@ -1316,4 +1322,163 @@ def process_dataset_entry(self, data_entry):
         audio, sr = librosa.load(path=audio_file, sr=self.sample_rate, duration=self.max_seconds)
         bandwidth = self._estimate_bandwidth(audio=audio, sample_rate=sr)
         data_entry[self.output_bandwidth_key] = int(bandwidth)
+        return [DataEntry(data=data_entry)]
+
+
+class CharacterHistogramLangValidator(BaseParallelProcessor):
+    """
+    A processor that filters text based on character histogram similarity to trusted data in the target language.
+
+    This processor computes the ratio of characters in a given text that are found in a reference character histogram
+    for a specific language. If this ratio is below a certain threshold, the text is likely mislabeled or noisy.
+
+    Histograms are sourced from the NLLB paper (https://arxiv.org/pdf/2207.04672), see page 30 for methodology. This
+    technique is a lightweight language ID filter, designed to catch mismatches between text content and claimed language.
+
+    Reference implementation: https://github.com/facebookresearch/fairseq/blob/main/examples/m2m_100/process_data/clean_histogram.py
+
+    Args:
+        text_field (str): Key in the data entry containing the text to evaluate.
+        lang_field (str, optional): Key in the data entry that identifies the language. Required if `lang` is not provided.
+        lang (str, optional): Language code to use for all entries (overrides `lang_field`). Required if `lang_field` is not provided.
+        threshold (float): Threshold ratio to determine if text matches the histogram. Used only externally (not enforced in this processor).
+        cache_dir (str, optional): Directory where histograms are downloaded and cached.
+        threshold_char (str): Character used to truncate the histogram file (default is ']').
+        output_score_field (str): Key name under which the computed character match ratio will be stored.
+        **kwargs: Additional keyword arguments passed to `BaseParallelProcessor`.
+
+    Raises:
+        ValueError: If both `lang` and `lang_field` are provided, or if neither is provided.
+                    Also raised if histogram for specified language is missing.
+
+    Returns:
+        A manifest where each entry includes the additional field `output_score_field` with the character match ratio.
+            Example::
+
+                {
+                    "text": "hello world",
+                    "lang": "en",
+                    "hist_token_ratio": 0.95
+                }
+    """
+
+    HISTOGRAMS_URL = 'https://dl.fbaipublicfiles.com/m2m_100/histograms.tar.gz'
+
+    def __init__(self,
+                 text_field: str,
+                 lang_field: str = None,
+                 lang: str = None,
+                 threshold: float = 0.8,
+                 cache_dir: str = None,
+                 threshold_char: str = "]",
+                 output_score_field: str = "hist_token_ratio",
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.text_field = text_field
+
+        # Ensure exactly one of `lang` or `lang_field` is provided
+        if lang_field is None and lang is None: 
+            raise ValueError("One of the arguments `lang` or `lang_field` must be provided.")
+        if lang_field is not None and lang is not None: 
+            raise ValueError(
+                f"Both `lang` ({lang}) and `lang_field` ({lang_field}) are provided, which makes the source of language ambiguous. Please provide only one of them."
+            )
+
+        self.lang_field = lang_field
+        self.lang = lang
+        self.threshold = threshold
+        self.cache_dir = cache_dir
+        self.threshold_char = threshold_char
+        self.output_score_field = output_score_field
+        self.histograms = dict()
+
+    def _read_hist(self, lang: str):
+        """
+        Read and parse the histogram file for a given language, stopping at the threshold character.
+        """
+        hist_file = os.path.join(self.cache_dir, lang)
+        chars = []
+        with open(hist_file) as hist:
+            for line in hist:
+                char = line[0] 
+                chars.append(char)
+                if char == self.threshold_char:
+                    break
+        self.histograms[lang] = set(chars)
+
+    def _download_histograms(self):
+        """
+        Download and extract histogram files into the cache directory.
+        """
+        logger.info('Downloading histograms collection..')
+        response = requests.get(self.HISTOGRAMS_URL)
+        if response.status_code != 200:
+            raise requests.exceptions.RequestException(
+                f"Failed to download model file. Status code: {response.status_code}"
+            )
+
+        if self.cache_dir is None:
+            self.cache_dir = tempfile.mkdtemp()
+        
+        os.makedirs(self.cache_dir, exist_ok=True)
+
+        histograms_tarfile = wget.download(self.HISTOGRAMS_URL, out=self.cache_dir)
+        with tarfile.open(histograms_tarfile, "r:gz") as tar:
+            tar.extractall(path=self.cache_dir)
+
+        # Flatten subdirectories into the main cache_dir
+        histograms_filepaths = glob(f'{self.cache_dir}/checkpoint/edunov/cc60_multilingual/clean_hists/*')
+        for histogram_filepath in histograms_filepaths:
+            shutil.move(histogram_filepath, os.path.join(self.cache_dir, os.path.basename(histogram_filepath)))
+
+        os.remove(histograms_tarfile)
+        shutil.rmtree(f'{self.cache_dir}/checkpoint/edunov/cc60_multilingual/clean_hists/')
+        logger.info(f'Histograms have been downloaded to {self.cache_dir}.')
+
+    def prepare(self):
+        """
+        Ensure histograms are available and read them into memory.
+        """
+        if (self.cache_dir is None or 
+            not os.path.exists(self.cache_dir) or 
+            not os.path.isdir(self.cache_dir) or 
+            len(os.listdir(self.cache_dir)) == 0):
+            
+            self._download_histograms()
+
+        logger.info('Reading histograms...')
+        available_langs = os.listdir(self.cache_dir)
+        if self.lang is not None:
+            if self.lang in available_langs:
+                self._read_hist(self.lang)
+            else:
+                raise ValueError(f"Invalid value for `lang`: {self.lang}. Please provide one of the following: {available_langs}")
+            logger.info(f'Histogram for `{self.lang}` has been read.')
+        else:
+            for lang in tqdm(available_langs):
+                self._read_hist(lang)
+            logger.info(f'Histograms have been read.')
+
+    def process_dataset_entry(self, data_entry):
+        """
+        Compute and attach the character histogram match ratio for a given text entry.
+
+        Args:
+            data_entry (dict): A dictionary containing at least `text_field` and either `lang_field` or a preset `lang`.
+
+        Returns:
+            List[DataEntry]: A list with one updated `DataEntry` including the character match ratio field.
+        """
+        # Determine language for this entry
+        lang = self.lang if self.lang is not None else data_entry[self.lang_field]
+        if lang not in self.histograms:
+            raise ValueError(f'lang `{lang}` is not supported.')
+
+        # Compute how many characters match the histogram
+        text = data_entry[self.text_field].strip()
+        cnt = len([c for c in text if c in self.histograms[lang]])
+        token_ratio = cnt / len(text) if len(text) > 0 else 0.0
+
+        # Store the ratio in the data entry
+        data_entry[self.output_score_field] = token_ratio
         return [DataEntry(data=data_entry)]
\ No newline at end of file

From c45773e5a37d61f3acf8b8cbcdb419c04248b94a Mon Sep 17 00:00:00 2001
From: root <root@cpu-00100.cm.cluster>
Date: Wed, 6 Aug 2025 02:41:09 -0700
Subject: [PATCH 02/10] Tests added

Signed-off-by: root <root@cpu-00100.cm.cluster>
---
 tests/test_data_to_data.py | 45 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index 9dd3278a..d4d79d0a 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import pytest
+import os
+import boto3
 
 from sdp.processors.modify_manifest.data_to_data import (
     InsIfASRInsertion,
@@ -21,6 +23,7 @@
     SubRegex,
     ListToEntries,
     LambdaExpression,
+    CharacterHistogramLangValidator,
 )
 
 from sdp.processors.inference.llm.utils.qwen_cleaning import CleanQwenGeneration
@@ -282,6 +285,48 @@ def test_detect_whisper_hallucinations(tmp_path, text, expected_flags):
     for key, value in expected_flags.items():
         assert result_entry[key] == value, f"Failed for text='{text}' on key='{key}'"
 
+@pytest.fixture
+def download_en_hist(tmp_path):
+    s3 = boto3.client(
+        's3',
+        aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
+        aws_secret_access_key=os.getenv("AWS_SECRET_KEY")
+    )
+
+    s3.download_file("sdp-test-data",
+                     "test_processor/CharacterHistogramLangValidator/histograms/en", 
+                     os.path.join(tmp_path, "en"))
+ 
+    assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
+    return str(tmp_path)
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [   
+        # Plain English sentence; all characters expected in 'en' histogram -> ratio 1.0
+        ("Hello, how are you today?", 1.0),
+        # # Chinese characters; none expected in 'en' histogram -> ratio 0.0
+        ("今天天气很好，我们去公园吧。", 0.0),
+        # Symbols + digits; only digits 1..5 expected in 'en' histogram -> 5 matches out of 17 chars
+        ("@#$%^&*()_+=12345", 5 / 17), # 0.29411764705882354
+        # French sentence with one accented char 'é' not in 'en' histogram -> 23 matches out of 24 chars
+        ("C'est une belle journée.", 23 / 24), # 0.9583333333333334
+    ],
+)
+def test_character_hist_validator_from_s3(text, expected, download_en_hist):
+    processor = CharacterHistogramLangValidator(
+        text_field="text",
+        lang="en",
+        cache_dir=download_en_hist,
+        output_manifest_file=None,
+    )
+    processor.prepare()
+
+    entry = {"text": text}
+    result_entry = processor.process_dataset_entry(entry)[0].data
+
+    assert result_entry[processor.output_score_field] == pytest.approx(expected, rel=1e-12)
+
 @pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str)
 def test_data_to_data(test_class, class_kwargs, test_input, expected_output):
     processor = test_class(**class_kwargs, output_manifest_file=None)

From 87d9292e2848df3184b3175a8a4c00b1d9391ea8 Mon Sep 17 00:00:00 2001
From: root <root@cpu-00100.cm.cluster>
Date: Wed, 6 Aug 2025 04:23:56 -0700
Subject: [PATCH 03/10] tmp_change

Signed-off-by: root <root@cpu-00100.cm.cluster>
---
 tests/test_data_to_data.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index d4d79d0a..c4630ef4 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -286,19 +286,26 @@ def test_detect_whisper_hallucinations(tmp_path, text, expected_flags):
         assert result_entry[key] == value, f"Failed for text='{text}' on key='{key}'"
 
 @pytest.fixture
-def download_en_hist(tmp_path):
+def download_en_hist(tmp_dir):
     s3 = boto3.client(
         's3',
         aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
-        aws_secret_access_key=os.getenv("AWS_SECRET_KEY")
+        aws_secret_access_key=os.getenv("AWS_SECRET_KEY"),
     )
 
-    s3.download_file("sdp-test-data",
-                     "test_processor/CharacterHistogramLangValidator/histograms/en", 
-                     os.path.join(tmp_path, "en"))
- 
-    assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
-    return str(tmp_path)
+    #s3.download_file("sdp-test-data",
+    #                 "test_processor/CharacterHistogramLangValidator/histograms/en", 
+    #                 os.path.join(tmp_dir, "en"))
+    
+    s3.download_file(
+       "sdp-test-data",
+       "test_data/tts/ytc/test_data_reference.json",
+       tmp_dir/"test_data_reference.json",
+    )
+
+    print('ok! ' * 100)
+    #assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
+    return str(tmp_dir)
 
 @pytest.mark.parametrize(
     "text,expected",

From 1dbd026f27a24d654f68847b018ff181831eee1e Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Wed, 6 Aug 2025 06:40:17 -0700
Subject: [PATCH 04/10] Tmp check

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 .github/workflows/tests.yml | 2 +-
 tests/test_data_to_data.py  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 351b4167..5eace051 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -93,7 +93,7 @@ jobs:
         sudo cp incommon-rsa-ca2.pem     /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL]
         sudo update-ca-certificates # [cert for CORAL]
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
-        python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
+        python -m pytest tests/test_data_to_data.py --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
 
 
 # TODO: add some way to see if e2e tests were skipped
diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index c4630ef4..4f08adcd 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -286,7 +286,7 @@ def test_detect_whisper_hallucinations(tmp_path, text, expected_flags):
         assert result_entry[key] == value, f"Failed for text='{text}' on key='{key}'"
 
 @pytest.fixture
-def download_en_hist(tmp_dir):
+def download_en_hist(tmp_path):
     s3 = boto3.client(
         's3',
         aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
@@ -295,17 +295,17 @@ def download_en_hist(tmp_dir):
 
     #s3.download_file("sdp-test-data",
     #                 "test_processor/CharacterHistogramLangValidator/histograms/en", 
-    #                 os.path.join(tmp_dir, "en"))
+    #                 os.path.join(tmp_path, "en"))
     
     s3.download_file(
        "sdp-test-data",
        "test_data/tts/ytc/test_data_reference.json",
-       tmp_dir/"test_data_reference.json",
+       tmp_path/"test_data_reference.json",
     )
 
     print('ok! ' * 100)
     #assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
-    return str(tmp_dir)
+    return str(tmp_path)
 
 @pytest.mark.parametrize(
     "text,expected",

From bfc03b34c9e6066b0e21914ae28389a8384d8a1d Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Wed, 6 Aug 2025 07:02:05 -0700
Subject: [PATCH 05/10] new test s3 key

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 tests/test_data_to_data.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index 4f08adcd..2d9db7f2 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -293,18 +293,11 @@ def download_en_hist(tmp_path):
         aws_secret_access_key=os.getenv("AWS_SECRET_KEY"),
     )
 
-    #s3.download_file("sdp-test-data",
-    #                 "test_processor/CharacterHistogramLangValidator/histograms/en", 
-    #                 os.path.join(tmp_path, "en"))
+    s3.download_file("sdp-test-data",
+                     "test_data/test_processors/CharacterHistogramLangValidator/histograms/en", 
+                     os.path.join(tmp_path, "en"))
     
-    s3.download_file(
-       "sdp-test-data",
-       "test_data/tts/ytc/test_data_reference.json",
-       tmp_path/"test_data_reference.json",
-    )
-
-    print('ok! ' * 100)
-    #assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
+    assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
     return str(tmp_path)
 
 @pytest.mark.parametrize(

From a372ab24ae9a2162933d66f1385610ba25f5cb53 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Wed, 6 Aug 2025 07:33:42 -0700
Subject: [PATCH 06/10] Turn on all tests

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5eace051..351b4167 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -93,7 +93,7 @@ jobs:
         sudo cp incommon-rsa-ca2.pem     /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL]
         sudo update-ca-certificates # [cert for CORAL]
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
-        python -m pytest tests/test_data_to_data.py --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
+        python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
 
 
 # TODO: add some way to see if e2e tests were skipped

From d8e49b436a66db495f0d3bf9a3af7982744340cc Mon Sep 17 00:00:00 2001
From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com>
Date: Wed, 6 Aug 2025 19:32:24 +0400
Subject: [PATCH 07/10] From s3_client to s3_resource

---
 tests/test_data_to_data.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index 2d9db7f2..e3ddb6d4 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -287,15 +287,17 @@ def test_detect_whisper_hallucinations(tmp_path, text, expected_flags):
 
 @pytest.fixture
 def download_en_hist(tmp_path):
-    s3 = boto3.client(
-        's3',
+    s3_resource = boto3.resource(
+        "s3",
         aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
         aws_secret_access_key=os.getenv("AWS_SECRET_KEY"),
     )
 
-    s3.download_file("sdp-test-data",
-                     "test_data/test_processors/CharacterHistogramLangValidator/histograms/en", 
-                     os.path.join(tmp_path, "en"))
+    bucket = s3_resource.Bucket("sdp-test-data")
+    
+    bucket.download_file(
+        "test_data/test_processors/CharacterHistogramLangValidator/histograms/en",
+        os.path.join(tmp_path, "en"))
     
     assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
     return str(tmp_path)
@@ -332,4 +334,4 @@ def test_data_to_data(test_class, class_kwargs, test_input, expected_output):
     processor = test_class(**class_kwargs, output_manifest_file=None)
     result = [entry.data for entry in processor.process_dataset_entry(test_input)]
 
-    assert result == expected_output
\ No newline at end of file
+    assert result == expected_output

From 8c753edb9d19e91d6a584de15b78bcd776d4bfd1 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Thu, 7 Aug 2025 00:52:30 -0700
Subject: [PATCH 08/10] Fix test

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 tests/test_data_to_data.py | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index e3ddb6d4..a0e33ff3 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -285,22 +285,29 @@ def test_detect_whisper_hallucinations(tmp_path, text, expected_flags):
     for key, value in expected_flags.items():
         assert result_entry[key] == value, f"Failed for text='{text}' on key='{key}'"
 
-@pytest.fixture
-def download_en_hist(tmp_path):
-    s3_resource = boto3.resource(
-        "s3",
-        aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
-        aws_secret_access_key=os.getenv("AWS_SECRET_KEY"),
-    )
+@pytest.fixture(scope="session")
+def en_hist_dir(tmp_path_factory):
+    """
+    Download the English histogram from S3 just once
+    and return the directory path that contains it.
+
+    Uses tmp_path_factory → one persistent temp-dir for the whole session.
+    """
+    s3 = boto3.client('s3',
+                    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
+                    aws_secret_access_key=os.getenv("AWS_SECRET_KEY")
+                    )
+    
+    bucket = "sdp-test-data"
+    key = "test_data/test_processors/CharacterHistogramLangValidator/histograms/en"
 
-    bucket = s3_resource.Bucket("sdp-test-data")
+    tmp_dir = tmp_path_factory.mktemp("char_hists")
+    local_path = tmp_dir / "en"
     
-    bucket.download_file(
-        "test_data/test_processors/CharacterHistogramLangValidator/histograms/en",
-        os.path.join(tmp_path, "en"))
+    s3.download_file(bucket, key, str(local_path))
     
-    assert os.path.exists(os.path.join(tmp_path, "en")), "No histogram files downloaded from S3"
-    return str(tmp_path)
+    assert local_path.exists(), "Histogram file was not downloaded"
+    return str(tmp_dir)
 
 @pytest.mark.parametrize(
     "text,expected",
@@ -315,11 +322,11 @@ def download_en_hist(tmp_path):
         ("C'est une belle journée.", 23 / 24), # 0.9583333333333334
     ],
 )
-def test_character_hist_validator_from_s3(text, expected, download_en_hist):
+def test_character_hist_validator(text, expected, en_hist_dir):
     processor = CharacterHistogramLangValidator(
         text_field="text",
         lang="en",
-        cache_dir=download_en_hist,
+        cache_dir=en_hist_dir,
         output_manifest_file=None,
     )
     processor.prepare()

From 471b8cd8d14c3bee98e503cc30ac5364f0642ce0 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Thu, 7 Aug 2025 01:30:51 -0700
Subject: [PATCH 09/10] Added try/except to s3 file download

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 tests/test_data_to_data.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index a0e33ff3..fedca80f 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -15,6 +15,7 @@
 import pytest
 import os
 import boto3
+from botocore.exceptions import ClientError
 
 from sdp.processors.modify_manifest.data_to_data import (
     InsIfASRInsertion,
@@ -303,6 +304,13 @@ def en_hist_dir(tmp_path_factory):
 
     tmp_dir = tmp_path_factory.mktemp("char_hists")
     local_path = tmp_dir / "en"
+
+    if not local_path.exists():
+        try:
+            s3.download_file(bucket, key, str(local_path))
+        except ClientError as e:
+            code = e.response.get("Error", {}).get("Code", "")
+            pytest.skip(f"Cannot download s3://{bucket}/{key} ({code}).")
     
     s3.download_file(bucket, key, str(local_path))
     

From 79ada79a8cb1a8c46c3bf45bd7cd43ad1563d030 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Thu, 7 Aug 2025 01:31:54 -0700
Subject: [PATCH 10/10] Removed duplicated row

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 tests/test_data_to_data.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index fedca80f..b9f2007f 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -311,9 +311,7 @@ def en_hist_dir(tmp_path_factory):
         except ClientError as e:
             code = e.response.get("Error", {}).get("Code", "")
             pytest.skip(f"Cannot download s3://{bucket}/{key} ({code}).")
-    
-    s3.download_file(bucket, key, str(local_path))
-    
+        
     assert local_path.exists(), "Histogram file was not downloaded"
     return str(tmp_dir)