From 540c11b18b16e7b010d34b08393b17b5c9502284 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Fri, 25 Jul 2025 07:30:25 -0700
Subject: [PATCH 1/5] FastTextLangIdClassifier processor implementation

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 docs/src/sdp/api.rst                          |   3 +
 requirements/main.txt                         |   3 +-
 sdp/processors/__init__.py                    |   1 +
 .../inference/nlp/fasttext/fasttext.py        | 113 ++++++++++++++++++
 tests/test_fasttext_inference.py              |  57 +++++++++
 5 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 sdp/processors/inference/nlp/fasttext/fasttext.py
 create mode 100644 tests/test_fasttext_inference.py

diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 4c8f5aff..2d326463 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -208,6 +208,9 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.AudioLid
    :annotation:
 
+.. autodata:: sdp.processors.FastTextLangIdClassifier
+   :annotation:
+
 Text-only processors
 ####################
 
diff --git a/requirements/main.txt b/requirements/main.txt
index 9b8858d4..13f3dbd8 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -28,4 +28,5 @@ datasets>=2.14.0,<3.0.0
 # for FasterWhisperInference processor is required: 
     # pip install pytorch-lightning nvidia-cublas-cu12 nvidia-cudnn-cu12==9.* faster_whisper
     # export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
-# for vLLMInference processor is required: pip install "optree>=0.13.0" vllm
\ No newline at end of file
+# for vLLMInference processor is required: pip install "optree>=0.13.0" vllm
+# for FastTextLangIdClassifier processor is required: pip install fasttext
\ No newline at end of file
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index b25ce04d..0100a1aa 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -148,6 +148,7 @@
 from sdp.processors.inference.asr.utils.whisper_hallucinations import DetectWhisperHallucinationFeatures
 from sdp.processors.inference.asr.utils.rttm import GetRttmSegments, SplitAudioFile
 from sdp.processors.inference.nlp.nemo.pc_inference import PCInference
+from sdp.processors.inference.nlp.fasttext.fasttext import FastTextLangIdClassifier
 from sdp.processors.inference.llm.vllm.vllm import vLLMInference
 from sdp.processors.inference.llm.utils.qwen_cleaning import CleanQwenGeneration
 
diff --git a/sdp/processors/inference/nlp/fasttext/fasttext.py b/sdp/processors/inference/nlp/fasttext/fasttext.py
new file mode 100644
index 00000000..c1256cb8
--- /dev/null
+++ b/sdp/processors/inference/nlp/fasttext/fasttext.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import requests
+import tempfile
+import wget
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class FastTextLangIdClassifier(BaseParallelProcessor):
+    """
+    This processor supports language identification using pretrained FastText models.
+    It classifies text and adds the predicted label and probability to the dataset entry.
+    If needed, it downloads the model, loads it into memory, and performs prediction on the 
+    specified input text field.
+
+    Args:
+        model_name_or_path (str): Path to a FastText model file or the name of a supported remote model ('lid.176.bin' or 'lid.176.ftz').
+        text_field (str): The name of the field in the dataset entry that contains the input text for classification.
+        output_field (str): The name of the field to store the predicted label. Defaults to "label".
+        cache_dir (str, optional): Directory to store the downloaded model file. If not provided, a temporary directory is used.
+        **kwargs: Additional keyword arguments passed to `BaseParallelProcessor`.
+
+    Returns:
+        A manifest where each entry contains the original data fields plus:
+            - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`).
+            - `<output_field>_prob`: The probability of the prediction.
+    
+    .. note::
+        Make sure to install `fasttext` before using this processor:
+            pip install fasttext
+
+    """
+
+    SUPPROTED_MODELS_URLS = {
+        'lid.176.bin' : 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin',
+        'lid.176.ftz' : 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
+    }
+    
+    def __init__(
+        self,
+        model_name_or_path: str,
+        text_field: str,
+        output_field: str = "label",
+        cache_dir: str = None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.model_name_or_path = model_name_or_path
+        self.text_field = text_field
+        self.output_field = output_field
+        self.cache_dir = cache_dir
+        self._model = None
+
+    def _download_model(self):
+        """Downloads the FastText model from a predefined URL and stores it in the cache directory."""
+        model_url = self.SUPPROTED_MODELS_URLS[self.model_name_or_path]
+        logger.info(f'Downloading {self.model_name_or_path}..')
+        response = requests.get(model_url)
+
+        if response.status_code != 200:
+            raise requests.exceptions.RequestException(
+            f"Failed to download model file. Status code: {response.status_code}"
+        )
+
+        if self.cache_dir is None:
+            self.cache_dir = tempfile.mkdtemp()
+        os.makedirs(self.cache_dir, exist_ok=True)
+
+        self.model_name_or_path = wget.download(model_url, out=self.cache_dir)
+        logger.info(f'Model `{self.model_name_or_path}` has been downloaded to {self.cache_dir}.')
+    
+    def prepare(self):
+        """
+        Prepares the model for classification:
+        - Checks if the model file exists locally.
+        - Downloads the model if only the name is given and it's known.
+        - Raises ValueError if the path or model name is invalid.
+        """
+        import fasttext
+
+        if not os.path.exists(self.model_name_or_path):
+            if self.cache_dir and os.path.exists(os.path.join(self.cache_dir, self.model_name_or_path)):
+                self.model_name_or_path = os.path.join(self.cache_dir, self.model_name_or_path)
+            elif self.model_name_or_path in self.SUPPROTED_MODELS_URLS:
+                self._download_model()
+            else:
+                raise ValueError(f'Current model is not supported or filepath is invalid: {self.model_name_or_path}.')
+        
+        self._model = fasttext.load_model(self.model_name_or_path)
+
+    def process_dataset_entry(self, data_entry: dict):
+        """Applies the classifier to a single dataset entry."""
+        text = data_entry[self.text_field].strip().replace("\n", " ")
+        label, prob = self._model.predict(text)
+        data_entry[self.output_field] = label[0].replace('__label__', '')
+        data_entry[f"{self.output_field}_prob"] = prob[0]
+
+        return [DataEntry(data=data_entry)]
\ No newline at end of file
diff --git a/tests/test_fasttext_inference.py b/tests/test_fasttext_inference.py
new file mode 100644
index 00000000..86e1e1f5
--- /dev/null
+++ b/tests/test_fasttext_inference.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from sdp.processors.inference.nlp.fasttext.fasttext import FastTextLangIdClassifier
+
+
+@pytest.fixture(scope="module")
+def classifier():
+    processor = FastTextLangIdClassifier(
+        model_name_or_path="lid.176.ftz",
+        text_field="text",
+        output_field="lang",
+        num_workers=1,
+        batch_size=1,
+    )
+    processor.prepare()
+    return processor
+
+
+@pytest.mark.parametrize("text,expected_lang", [
+    ("Hello, how are you?", "en"),
+    ("Bonjour tout le monde", "fr"),
+    ("Привет, как дела?", "ru"),
+    ("Hola, ¿cómo estás?", "es"),
+])
+def test_language_identification(classifier, text, expected_lang):
+    input_entry = {"text": text}
+    result = classifier.process_dataset_entry(input_entry)
+
+    assert isinstance(result, list)
+    assert len(result) == 1
+
+    output = result[0].data
+    assert "lang" in output
+    assert "lang_prob" in output
+
+    predicted_lang = output["lang"]
+    prob = output["lang_prob"]
+
+    assert isinstance(predicted_lang, str)
+    assert 0 <= prob <= 1.0
+
+    #Exact matching may depend on the model, so we compare based on presence in the top predictions.
+    assert predicted_lang == expected_lang, f"Expected: {expected_lang}, got: {predicted_lang}"
\ No newline at end of file

From 388dccdf28d92fb3ac526ed40b0650a4a59bb8fb Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Fri, 25 Jul 2025 08:27:04 -0700
Subject: [PATCH 2/5] Fix docs

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 .../inference/nlp/fasttext/fasttext.py          | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/sdp/processors/inference/nlp/fasttext/fasttext.py b/sdp/processors/inference/nlp/fasttext/fasttext.py
index c1256cb8..aa55b30b 100644
--- a/sdp/processors/inference/nlp/fasttext/fasttext.py
+++ b/sdp/processors/inference/nlp/fasttext/fasttext.py
@@ -29,21 +29,22 @@ class FastTextLangIdClassifier(BaseParallelProcessor):
     specified input text field.
 
     Args:
-        model_name_or_path (str): Path to a FastText model file or the name of a supported remote model ('lid.176.bin' or 'lid.176.ftz').
+        model_name_or_path (str): Path to a FastText model file or the name of a supported remote model 
+            ('lid.176.bin' or 'lid.176.ftz').
         text_field (str): The name of the field in the dataset entry that contains the input text for classification.
         output_field (str): The name of the field to store the predicted label. Defaults to "label".
-        cache_dir (str, optional): Directory to store the downloaded model file. If not provided, a temporary directory is used.
+        cache_dir (str, optional): Directory to store the downloaded model file. If not provided, a temporary 
+            directory is used.
         **kwargs: Additional keyword arguments passed to `BaseParallelProcessor`.
 
     Returns:
         A manifest where each entry contains the original data fields plus:
-            - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`).
-            - `<output_field>_prob`: The probability of the prediction.
-    
-    .. note::
-        Make sure to install `fasttext` before using this processor:
-            pip install fasttext
+        - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`).
+        - `<output_field>_prob`: The probability of the prediction.
 
+    Note:
+        Make sure to install `fasttext` before using this processor:
+        `pip install fasttext`
     """
 
     SUPPROTED_MODELS_URLS = {

From c1be2655f3dc454a2e3f8631b71a889d28910924 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Wed, 30 Jul 2025 05:09:06 -0700
Subject: [PATCH 3/5] Fix docs

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 sdp/processors/inference/nlp/fasttext/fasttext.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sdp/processors/inference/nlp/fasttext/fasttext.py b/sdp/processors/inference/nlp/fasttext/fasttext.py
index aa55b30b..bb439a79 100644
--- a/sdp/processors/inference/nlp/fasttext/fasttext.py
+++ b/sdp/processors/inference/nlp/fasttext/fasttext.py
@@ -38,9 +38,9 @@ class FastTextLangIdClassifier(BaseParallelProcessor):
         **kwargs: Additional keyword arguments passed to `BaseParallelProcessor`.
 
     Returns:
-        A manifest where each entry contains the original data fields plus:
-        - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`).
-        - `<output_field>_prob`: The probability of the prediction.
+        List[DataEntry]: A manifest where each entry contains the original data fields plus
+            - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`).
+            - `<output_field>_prob`: The probability of the prediction.
 
     Note:
         Make sure to install `fasttext` before using this processor:

From bf5175463198887a65900e806f01156363b891b9 Mon Sep 17 00:00:00 2001
From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com>
Date: Wed, 30 Jul 2025 16:20:33 +0400
Subject: [PATCH 4/5] Update fasttext.py

Fix docs
---
 sdp/processors/inference/nlp/fasttext/fasttext.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sdp/processors/inference/nlp/fasttext/fasttext.py b/sdp/processors/inference/nlp/fasttext/fasttext.py
index bb439a79..42d6e986 100644
--- a/sdp/processors/inference/nlp/fasttext/fasttext.py
+++ b/sdp/processors/inference/nlp/fasttext/fasttext.py
@@ -38,8 +38,8 @@ class FastTextLangIdClassifier(BaseParallelProcessor):
         **kwargs: Additional keyword arguments passed to `BaseParallelProcessor`.
 
     Returns:
-        List[DataEntry]: A manifest where each entry contains the original data fields plus
-            - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`).
+        A manifest where each entry contains the original data fields plus
+            - `<output_field>`: The predicted label (e.g., language code for `lid.176.bin`),
             - `<output_field>_prob`: The probability of the prediction.
 
     Note:
@@ -111,4 +111,4 @@ def process_dataset_entry(self, data_entry: dict):
         data_entry[self.output_field] = label[0].replace('__label__', '')
         data_entry[f"{self.output_field}_prob"] = prob[0]
 
-        return [DataEntry(data=data_entry)]
\ No newline at end of file
+        return [DataEntry(data=data_entry)]

From f7d66e8c247ea73f29707d22099bb24124562629 Mon Sep 17 00:00:00 2001
From: Sasha Meister <ameister@nvidia.com>
Date: Mon, 4 Aug 2025 02:25:45 -0700
Subject: [PATCH 5/5] Make returning best or all labels optional based on top_k
 value

Signed-off-by: Sasha Meister <ameister@nvidia.com>
---
 .../inference/nlp/fasttext/fasttext.py           | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/sdp/processors/inference/nlp/fasttext/fasttext.py b/sdp/processors/inference/nlp/fasttext/fasttext.py
index 42d6e986..2759ba9e 100644
--- a/sdp/processors/inference/nlp/fasttext/fasttext.py
+++ b/sdp/processors/inference/nlp/fasttext/fasttext.py
@@ -33,6 +33,7 @@ class FastTextLangIdClassifier(BaseParallelProcessor):
             ('lid.176.bin' or 'lid.176.ftz').
         text_field (str): The name of the field in the dataset entry that contains the input text for classification.
         output_field (str): The name of the field to store the predicted label. Defaults to "label".
+        top_k (int): The number of top predictions to return. Defaults to 1 (-1 for all).
         cache_dir (str, optional): Directory to store the downloaded model file. If not provided, a temporary 
             directory is used.
         **kwargs: Additional keyword arguments passed to `BaseParallelProcessor`.
@@ -57,6 +58,7 @@ def __init__(
         model_name_or_path: str,
         text_field: str,
         output_field: str = "label",
+        top_k: int = 1,
         cache_dir: str = None,
         **kwargs
     ):
@@ -65,6 +67,7 @@ def __init__(
         self.text_field = text_field
         self.output_field = output_field
         self.cache_dir = cache_dir
+        self.top_k = top_k
         self._model = None
 
     def _download_model(self):
@@ -108,7 +111,14 @@ def process_dataset_entry(self, data_entry: dict):
         """Applies the classifier to a single dataset entry."""
         text = data_entry[self.text_field].strip().replace("\n", " ")
         label, prob = self._model.predict(text)
-        data_entry[self.output_field] = label[0].replace('__label__', '')
-        data_entry[f"{self.output_field}_prob"] = prob[0]
+        if self.top_k == 1:
+            data_entry[self.output_field] = label[0].replace('__label__', '')
+            data_entry[f"{self.output_field}_prob"] = prob[0]
+        else:
+            max_k = len(label) if self.top_k == -1 else self.top_k
 
-        return [DataEntry(data=data_entry)]
+            for _label, _prob, top_i in zip(label, prob, range(1, max_k + 1)):
+                data_entry[f"{self.output_field}_{top_i}"] = _label.replace('__label__', '')
+                data_entry[f"{self.output_field}_prob_{top_i}"] = _prob
+        
+        return [DataEntry(data=data_entry)]
\ No newline at end of file