79 changes: 40 additions & 39 deletions docs/pages/tutorials/tutorials.md

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions langtest/data/config/hf_hub_config.yml
@@ -0,0 +1,11 @@
# this section is only required for LLM models
model_parameters:
  max_new_tokens: 128

tests:
  defaults:
    min_pass_rate: 1.0

  robustness:
    lowercase:
      min_pass_rate: 0.70
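For context, a hedged sketch of how this config could be wired into a harness run. The `Harness` call follows langtest's documented pattern; treat the exact arguments as assumptions, since this diff does not show them:

```python
# Hedged sketch: wiring hf_hub_config.yml into a langtest Harness.
# The constructor arguments follow langtest's documented pattern and are
# an assumption here, not something this diff confirms.
from langtest import Harness

harness = Harness(
    task="question-answering",
    model={"model": "gpt2", "hub": "huggingface"},
    config="langtest/data/config/hf_hub_config.yml",
)
harness.generate().run().report()  # build perturbations, run tests, report pass rates
```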
68 changes: 68 additions & 0 deletions langtest/datahandler/datasource.py
@@ -1385,6 +1385,50 @@ def load_data_summarization(
samples = [self._row_to_sample_summarization(example) for example in dataset]
return samples

def load_data_qa(
self,
question_column: str,
context_column: str,
target_column: str,
split: str,
subset: str = None,
) -> List[Sample]:
"""Load the specified split from the dataset for QA task.

Args:
feature_column (str):
Name of the column containing the input text or document.
target_column (str):
Name of the column containing the target summary.
split (str):
Name of the split to load (e.g., train, validation, test).
subset (str):
Name of the configuration or subset to load.

Returns:
List[Sample]:
Loaded split as a list of Sample objects for QA task.
"""
question_column = "question" if question_column is None else question_column
target_column = "answer" if target_column is None else target_column
split = "test" if split is None else split

if subset:
dataset = self.load_dataset(self.dataset_name, name=subset, split=split)
else:
dataset = self.load_dataset(self.dataset_name, split=split)

dataset = dataset.map(
lambda example: {
"question": example[question_column],
"context": example[context_column],
"answer": example[target_column],
}
)

samples = [self._row_to_sample_qa(example) for example in dataset]
return samples
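A hedged usage sketch of the new loader. The enclosing `HuggingFaceDataset` class and its constructor signature are inferred from context, and the dataset and column names are hypothetical:

```python
# Hedged sketch: loading a QA split through load_data_qa.
# `HuggingFaceDataset` and its constructor are assumptions drawn from
# context; the dataset name and column names are hypothetical.
from langtest.datahandler.datasource import HuggingFaceDataset

dataset = HuggingFaceDataset(dataset_name="my-org/my-qa-dataset")
samples = dataset.load_data_qa(
    question_column="question",
    context_column="context",
    target_column="answer",
    split="test",
)
print(samples[0].original_question, samples[0].original_context)
```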

def load_raw_data(
self,
split: str = "test",
@@ -1455,6 +1499,30 @@ def _row_to_sample_summarization(data_row: Dict[str, str]) -> Sample:

return SummarizationSample(original=original, expected_results=summary)

@staticmethod
def _row_to_sample_qa(data_row: Dict[str, str]) -> Sample:
"""Convert a row from the dataset into a Sample for summarization.

Args:
data_row (Dict[str, str]):
Single row of the dataset.

Returns:
Sample:
Row formatted into a Sample object for summarization.
"""
context = data_row.get("context", "")
question = data_row.get("question", "")
answer = data_row.get("answer", "")
if isinstance(answer, str):
answer = [answer]

return QASample(
original_question=question,
original_context=context,
            expected_results=answer,
)

def export_data(self, data: List[Sample], output_path: str):
"""Exports the data to the corresponding format and saves it to 'output_path'.

Expand Down
112 changes: 111 additions & 1 deletion langtest/modelhandler/transformers_modelhandler.py
@@ -1,4 +1,4 @@
from typing import Dict, List, Tuple
from typing import Dict, List, Tuple, Union

import numpy as np
from transformers import Pipeline, pipeline
@@ -11,6 +11,8 @@
TranslationOutput,
)

from langchain import PromptTemplate


class PretrainedModelForNER(_ModelHandler):
"""Transformers pretrained model for NER tasks
@@ -334,3 +336,111 @@ def predict(self, text: str, **kwargs) -> TranslationOutput:
def __call__(self, text: str, *args, **kwargs) -> TranslationOutput:
"""Alias of the 'predict' method"""
return self.predict(text=text, **kwargs)


class PretrainedModelForQA(_ModelHandler):
"""Transformers pretrained model for QA tasks

Args:
model (transformers.pipeline.Pipeline): Pretrained HuggingFace QA pipeline for predictions.
"""

def __init__(self, hub, model, **kwargs):
"""Constructor method

Args:
model (transformers.pipeline.Pipeline): Pretrained HuggingFace QA pipeline for predictions.
"""
assert isinstance(model, Pipeline), ValueError(
f"Invalid transformers pipeline! "
f"Pipeline should be '{Pipeline}', passed model is: '{type(model)}'"
)
self.model = model

@staticmethod
def load_model(hub: str, path: str, **kwargs) -> "Pipeline":
"""Load the QA model into the `model` attribute.

Args:
path (str):
path to model or model name

Returns:
'Pipeline':
"""

return pipeline(model=path, **kwargs)

def predict(self, text: Union[str, dict], prompt: dict, **kwargs) -> str:
"""Perform predictions on the input text.

Args:
text (str): Input text to perform QA on.
kwargs: Additional keyword arguments.


Returns:
str: Output model for QA tasks
"""
prompt_template = PromptTemplate(**prompt)
p = prompt_template.format(**text)
prediction = self.model(p, **kwargs)
return prediction[0]["generated_text"][len(p) :]

def __call__(self, text: Union[str, dict], prompt: dict, **kwargs) -> str:
"""Alias of the 'predict' method"""
return self.predict(text=text, prompt=prompt, **kwargs)
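A hedged sketch of how the new handler consumes the `text`/`prompt` pair; the model name, template, and generation arguments below are illustrative, not part of this diff:

```python
# Hedged sketch: exercising PretrainedModelForQA end to end.
# "gpt2", the template, and max_new_tokens are illustrative choices.
pipe = PretrainedModelForQA.load_model(hub="huggingface", path="gpt2")
qa_model = PretrainedModelForQA(hub="huggingface", model=pipe)

answer = qa_model(
    text={
        "context": "Hamlet is a tragedy written by William Shakespeare.",
        "question": "Who wrote Hamlet?",
    },
    prompt={
        "input_variables": ["context", "question"],
        "template": "Context: {context}\nQuestion: {question}\nAnswer:",
    },
    max_new_tokens=32,  # forwarded to the underlying pipeline call
)
print(answer)  # generated continuation with the prompt prefix stripped
```

Because `predict` strips the formatted prompt from the front of the generated text, the handler targets plain text-generation pipelines rather than extractive QA pipelines.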


class PretrainedModelForSummarization(_ModelHandler):
"""Transformers pretrained model for QA tasks

Args:
model (transformers.pipeline.Pipeline): Pretrained HuggingFace QA pipeline for predictions.
"""

def __init__(self, hub, model, **kwargs):
"""Constructor method

Args:
            model (transformers.pipeline.Pipeline): Pretrained HuggingFace summarization pipeline for predictions.
"""
assert isinstance(model, Pipeline), ValueError(
f"Invalid transformers pipeline! "
f"Pipeline should be '{Pipeline}', passed model is: '{type(model)}'"
)
self.model = model

@staticmethod
def load_model(hub: str, path: str, **kwargs) -> "Pipeline":
"""Load the QA model into the `model` attribute.

Args:
path (str):
path to model or model name

Returns:
'Pipeline':
"""

return pipeline(model=path, **kwargs)

def predict(self, text: Union[str, dict], prompt: dict, **kwargs) -> str:
"""Perform predictions on the input text.

Args:
text (str): Input text to perform QA on.
kwargs: Additional keyword arguments.


Returns:
str: Output model for QA tasks
"""
prompt_template = PromptTemplate(**prompt)
p = prompt_template.format(**text)
prediction = self.model(p, **kwargs)
return prediction[0]["generated_text"][len(p) :]

def __call__(self, text: Union[str, dict], prompt: dict, **kwargs) -> str:
"""Alias of the 'predict' method"""
return self.predict(text=text, prompt=prompt, **kwargs)
1 change: 0 additions & 1 deletion langtest/transform/__init__.py
@@ -305,7 +305,6 @@ def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) ->
tests Optional[Dict]:
A dictionary of test names and corresponding parameters (default is None).
"""

self.supported_tests = self.available_tests()
self._data_handler = data_handler
self.tests = tests
80 changes: 50 additions & 30 deletions langtest/utils/custom_types/sample.py
@@ -495,38 +495,58 @@ def is_pass(self) -> bool:
from ...transform.constants import qa_prompt_template
from langchain.prompts import PromptTemplate

        if "llm" in str(type(llm_model.model_class)):
            if self.dataset_name not in ["BoolQ", "TruthfulQA", "Quac", "BBQ"]:
                PROMPT = PromptTemplate(
                    input_variables=["query", "answer", "result"],
                    template=qa_prompt_template,
                )
                eval_chain = QAEvalChain.from_llm(
                    llm=llm_model.model_class.model, prompt=PROMPT
                )
                inputs = [
                    {"question": self.original_question, "answer": self.expected_results}
                ]

                predictions = [
                    {"question": self.perturbed_question, "text": self.actual_results}
                ]

                graded_outputs = eval_chain.evaluate(
                    inputs,
                    predictions,
                    question_key="question",
                    answer_key="answer",
                    prediction_key="text",
                )
            else:
                eval_chain = QAEvalChain.from_llm(llm=llm_model.model_class.model)
                graded_outputs = eval_chain.evaluate(
                    [
                        {
                            "question": self.original_question,
                            "answer": self.expected_results,
                        }
                    ],
                    [{"question": self.perturbed_question, "text": self.actual_results}],
                    question_key="question",
                    prediction_key="text",
                )

            return graded_outputs[0]["text"].strip() == "CORRECT"
        else:
            prediction = llm_model(
                text={
                    "query": self.perturbed_question,
                    "answer": self.expected_results,
                    "result": self.actual_results,
                },
                prompt={
                    "input_variables": ["query", "answer", "result"],
                    "template": qa_prompt_template,
                },
            )

            return prediction == "CORRECT"
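In the non-LLM branch, grading is delegated to the model handler itself: `qa_prompt_template` is filled with the perturbed question, expected answer, and actual result, and the pipeline is expected to emit a CORRECT/INCORRECT verdict. A minimal sketch of the equivalent call with illustrative values (`qa_model` is a `PretrainedModelForQA` handler as above):

```python
# Hedged sketch: the grading call made for transformer pipelines.
# Values are illustrative; qa_prompt_template comes from
# langtest.transform.constants and is assumed to request a
# CORRECT / INCORRECT verdict.
verdict = qa_model(
    text={
        "query": "What is the capital of France?",
        "answer": "Paris",
        "result": "Paris",
    },
    prompt={
        "input_variables": ["query", "answer", "result"],
        "template": qa_prompt_template,
    },
)
passed = verdict == "CORRECT"
```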


class MinScoreQASample(QASample):
14 changes: 13 additions & 1 deletion tests/test_huggingface_model.py
@@ -27,7 +27,7 @@ def setUp(self) -> None:

self.tasks = ["ner", "text-classifier"]

def test_transformers_models(self):
def test_transformers_ner_models(self):
"""
        Test loading a Hugging Face NER model.

@@ -39,6 +39,18 @@ def test_transformers_models(self):
)
self.assertIsInstance(model, ModelFactory)

def test_transformers_QA_models(self):
"""
        Test loading a Hugging Face question-answering model.

        This method tests the loading of a Hugging Face QA model using the `ModelFactory` class.
        It asserts that the loaded model is an instance of `ModelFactory`.
"""
model = ModelFactory.load_model(
task="question-answering", hub="huggingface", path="gpt2"
)
self.assertIsInstance(model, ModelFactory)

def test_unsupported_task(self):
"""
Test unsupported task.