diff --git a/langtest/transform/__init__.py b/langtest/transform/__init__.py
index c41a576c2..309d92617 100644
--- a/langtest/transform/__init__.py
+++ b/langtest/transform/__init__.py
@@ -1068,9 +1068,9 @@ def transform(self) -> List[Sample]:
                 )
                 y_true = y_true.dropna()
 
-                params["test_name"] = test_name
+
                 transformed_samples = self.supported_tests[test_name].transform(
-                    y_true, params
+                    test_name, y_true, params
                 )
 
                 for sample in transformed_samples:
diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py
index 3c12d5bbb..4734f2a4b 100644
--- a/langtest/transform/accuracy.py
+++ b/langtest/transform/accuracy.py
@@ -19,7 +19,7 @@ class BaseAccuracy(ABC):
     alias_name = None
     supported_tasks = ["ner", "text-classification"]
 
-    @staticmethod
+    @classmethod
     @abstractmethod
     def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
         """Abstract method that implements the accuracy measure.
@@ -77,10 +77,12 @@ class MinPrecisionScore(BaseAccuracy):
         transform(y_true, y_pred) -> Any: Creates accuracy test results.
     """
 
-    alias_name = "min_precision_score"
+    alias_name = ["min_precision_score"]
 
-    @staticmethod
-    def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
+    @classmethod
+    def transform(
+        cls, test: str, y_true: List[Any], params: Dict
+    ) -> List[MinScoreSample]:
         """Computes the minimum precision score for the given data.
 
         Args:
@@ -90,6 +92,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
         Returns:
             List[MinScoreSample]: Precision test results.
         """
+        assert (
+            test in cls.alias_name
+        ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"
+
         labels = set(y_true)  # .union(set(y_pred))
 
         if isinstance(params["min_score"], dict):
@@ -149,10 +155,12 @@ class MinRecallScore(BaseAccuracy):
         transform(y_true, y_pred) -> Any: Creates accuracy test results.
     """
 
-    alias_name = "min_recall_score"
+    alias_name = ["min_recall_score"]
 
-    @staticmethod
-    def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
+    @classmethod
+    def transform(
+        cls, test: str, y_true: List[Any], params: Dict
+    ) -> List[MinScoreSample]:
         """Computes the minimum recall score for the given data.
 
         Args:
@@ -162,6 +170,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
         Returns:
             List[MinScoreSample]: minimum recall results.
         """
+        assert (
+            test in cls.alias_name
+        ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"
+
         labels = set(y_true)  # .union(set(y_pred))
 
         if isinstance(params["min_score"], dict):
@@ -221,10 +233,12 @@ class MinF1Score(BaseAccuracy):
 
     """
 
-    alias_name = "min_f1_score"
+    alias_name = ["min_f1_score"]
 
-    @staticmethod
-    def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
+    @classmethod
+    def transform(
+        cls, test: str, y_true: List[Any], params: Dict
+    ) -> List[MinScoreSample]:
         """Computes the minimum F1 score for the given data.
 
         Args:
@@ -234,6 +248,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]:
         Returns:
             List[MinScoreSample]: F1 score test results.
         """
+        assert (
+            test in cls.alias_name
+        ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'"
+
         labels = set(y_true)
 
         if isinstance(params["min_score"], dict):
@@ -292,10 +310,12 @@ class MinMicroF1Score(BaseAccuracy):
         alias_name (str): The name for config.
""" - alias_name = "min_micro_f1_score" + alias_name = ["min_micro_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum micro F1 score for the given data. Args: @@ -305,6 +325,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum micro F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" + min_score = params["min_score"] sample = MinScoreSample( @@ -351,10 +375,12 @@ class MinMacroF1Score(BaseAccuracy): transform(y_true, params) -> Any: Creates accuracy test results. """ - alias_name = "min_macro_f1_score" + alias_name = ["min_macro_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum macro F1 score for the given data. Args: @@ -364,6 +390,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum macro F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -409,10 +438,12 @@ class MinWeightedF1Score(BaseAccuracy): transform(y_true, params) -> Any: Creates accuracy test results. """ - alias_name = "min_weighted_f1_score" + alias_name = ["min_weighted_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum weighted F1 score for the given data. Args: @@ -422,6 +453,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -466,11 +500,13 @@ class MinEMcore(BaseAccuracy): transform(y_true, y_pred) -> Any: Creates accuracy test results. """ - alias_name = "min_exact_match_score" + alias_name = ["min_exact_match_score"] supported_tasks = ["question-answering", "summarization"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -480,6 +516,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -528,11 +567,13 @@ class MinBLEUcore(BaseAccuracy): transform(y_true, y_pred) -> Any: Creates accuracy test results. 
""" - alias_name = "min_bleu_score" + alias_name = ["min_bleu_score"] supported_tasks = ["question-answering", "summarization"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -542,6 +583,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -598,8 +642,10 @@ class MinROUGEcore(BaseAccuracy): ] supported_tasks = ["question-answering", "summarization"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -610,11 +656,14 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( category="accuracy", - test_type=params["test_name"], + test_type=test, expected_results=MinScoreOutput(min_score=min_score), ) diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py index 1b2d19c8b..db1c2cc0a 100644 --- a/tests/test_accuracy.py +++ b/tests/test_accuracy.py @@ -1,51 +1,163 @@ -import unittest +import pytest import pandas as pd -from langtest import Harness +from langtest.transform.accuracy import ( + BaseAccuracy, + MinPrecisionScore, + MinF1Score, + MinMicroF1Score, + MinMacroF1Score, + MinWeightedF1Score, + MinEMcore, + MinBLEUcore, + MinROUGEcore, +) +from langtest.utils.custom_types import SequenceLabel, Span +from langtest.utils.custom_types.output import ( + NEROutput, + NERPrediction, + SequenceClassificationOutput, +) +from langtest.utils.custom_types.sample import ( + MinScoreSample, + NERSample, + QASample, + SequenceClassificationSample, + SummarizationSample, +) -class AccuracyTestCase(unittest.TestCase): +class TestAccuracy: + """ + A test suite for evaluating accuracy classes. """ - A test case for accuracy evaluation. - This test case performs accuracy evaluation using the `Harness` class from `langtest` package. - It tests the accuracy of a named entity recognition (NER) model using a sample dataset in CoNLL format. + accuracy_config = { + "min_precision_score": {"min_score": 0.66}, + "min_recall_score": {"min_score": 0.60}, + "min_f1_score": {"min_score": 0.60}, + "min_micro_f1_score": {"min_score": 0.60}, + "min_macro_f1_score": {"min_score": 0.60}, + "min_weighted_f1_score": {"min_score": 0.60}, + "min_bleu_score": {"min_score": 0.66}, + "min_exact_match_score": {"min_score": 0.60}, + "min_rouge1_score": {"min_score": 0.60}, + "min_rouge2_score": {"min_score": 0.60}, + "min_rougeL_score": {"min_score": 0.60}, + "min_rougeLsum_score": {"min_score": 0.60}, + } - Attributes: - h_spacy (Harness): The `Harness` instance for evaluating the model. - report (pd.DataFrame): The evaluation report generated by the `Harness`. - """ + @pytest.fixture + def sample_data(self): + """A fixture providing sample data for testing. 
 
-    def setUp(self) -> None:
+        Returns:
+            dict: A dictionary containing sample data for different tasks.
         """
-        Set up the test case.
+        return {
+            "text-classification": [
+                SequenceClassificationSample(
+                    original="The last good ernest movie, and the best at that. how can you not laugh at least once at this movie. the last line is a classic, as is ernest's gangster impressions, his best moment on film. this has his best lines and is a crowning achievement among the brainless screwball comedies.",
+                    expected_results=SequenceClassificationOutput(
+                        predictions=[SequenceLabel(label="Positive", score=1.0)]
+                    ),
+                ),
+                SequenceClassificationSample(
+                    original="After my 6 year old daughter began taking riding lessons I started looking for horse movies for her. I had always heard of National Velvet but had never seen it. Boy am I glad I bought it! It's become a favorite of mine, my 6 year old AND my 2 year old. It's a shame movies like this aren't made anymore.",
+                    expected_results=SequenceClassificationOutput(
+                        predictions=[SequenceLabel(label="Positive", score=1.0)]
+                    ),
+                ),
+            ],
+            "ner": [
+                NERSample(
+                    original="Attendance : 3,000",
+                    expected_results=NEROutput(
+                        predictions=[
+                            NERPrediction(
+                                entity="CARDINAL",
+                                span=Span(start=13, end=18, word="3,000"),
+                            )
+                        ]
+                    ),
+                ),
+                NERSample(
+                    original="I do not love KFC",
+                    expected_results=NEROutput(
+                        predictions=[
+                            NERPrediction(
+                                entity="PROD", span=Span(start=14, end=17, word="KFC")
+                            )
+                        ]
+                    ),
+                ),
+            ],
+            "question-answering": [
+                QASample(
+                    original_question="What is John Snow Labs?",
+                    original_context="John Snow Labs is a healthcare company specializing in accelerating progress in data science.",
+                    expected_results="A healthcare company specializing in accelerating progress in data science. ",
+                )
+            ],
+            "summarization": [
+                SummarizationSample(
+                    original="John Snow Labs is a healthcare company specializing in accelerating progress in data "
+                    "science.",
+                    expected_results="JSL is a data science company",
+                )
+            ],
+        }
 
-        This method initializes the `Harness` instance with the necessary configuration for evaluating the model.
-        It configures the test settings, such as the minimum pass rate and minimum F1 score, for accuracy evaluation.
-        """
+    @pytest.mark.parametrize(
+        "accuracy",
+        [
+            MinPrecisionScore,
+            MinF1Score,
+            MinMicroF1Score,
+            MinMacroF1Score,
+            MinWeightedF1Score,
+            MinEMcore,
+            MinBLEUcore,
+            MinROUGEcore,
+        ],
+    )
+    def test_transform(self, accuracy: BaseAccuracy, sample_data) -> None:
+        """Test the transform method of accuracy-related classes.
 
-        self.h_spacy = Harness(
-            task="ner",
-            model="en_core_web_sm",
-            data="langtest/data/conll/sample.conll",
-            hub="spacy",
-        )
-        self.h_spacy.configure(
-            {
-                "tests": {
-                    "defaults": {
-                        "min_pass_rate": 0.65,
-                    },
-                    "accuracy": {"min_f1_score": {"min_score": 0.65}},
-                }
-            }
-        )
-        self.report = self.h_spacy.generate().run().report()
-
-    def test_report(self):
-        """
-        Test the evaluation report.
+        Args:
+            accuracy (BaseAccuracy): An accuracy-related class to test.
+            sample_data (dict): Sample data for different tasks.
 
-        This method asserts that the evaluation report generated by the `Harness` is an instance of `pd.DataFrame`.
+        Returns:
+            None
         """
+        for alias in accuracy.alias_name:
+            for task in accuracy.supported_tasks:
+                if task == "text-classification":
+                    y_true = (
+                        pd.Series(sample_data["text-classification"])
+                        .apply(
+                            lambda x: [y.label for y in x.expected_results.predictions]
+                        )
+                        .explode()
+                    )
+                elif task == "ner":
+                    y_true = pd.Series(sample_data["ner"]).apply(
+                        lambda x: [y.entity for y in x.expected_results.predictions]
+                    )
+                    y_true = y_true.explode().apply(
+                        lambda x: x.split("-")[-1] if isinstance(x, str) else x
+                    )
+
+                else:
+                    y_true = (
+                        pd.Series(sample_data[task])
+                        .apply(lambda x: x.expected_results)
+                        .explode()
+                    )
+                transform_results = accuracy.transform(
+                    alias, y_true, self.accuracy_config[alias]
+                )
+                assert isinstance(transform_results, list)
 
-        self.assertIsInstance(self.report, pd.DataFrame)
+                for _, result in zip(y_true, transform_results):
+                    assert isinstance(result, MinScoreSample)
diff --git a/tests/test_robustness.py b/tests/test_robustness.py
index 8b40bb0e6..547954323 100644
--- a/tests/test_robustness.py
+++ b/tests/test_robustness.py
@@ -2,6 +2,8 @@
 from langtest.transform.robustness import *
 from langtest.transform.constants import A2B_DICT
 from langtest.utils.custom_types import SequenceClassificationSample
+from langtest.utils.custom_types.sample import QASample, SummarizationSample
+from langtest.transform import TestFactory
 
 
 class RobustnessTestCase(unittest.TestCase):
@@ -434,3 +436,104 @@ def test_random_age(self) -> None:
         self.assertIsInstance(transformed_samples, list)
         for sample in transformed_samples:
             self.assertNotEqual(sample.test_case, sample.original)
+
+
+class RobustnessTestCaseQaAndSummarization(unittest.TestCase):
+    """
+    A test case class for testing QA and summarization samples on robustness classes.
+    """
+
+    def available_test(self) -> dict:
+        """
+        Get a dictionary of available robustness tests.
+
+        Returns:
+            dict: A dictionary containing available robustness tests.
+        """
+        tests = {
+            j: i
+            for i in BaseRobustness.__subclasses__()
+            for j in (i.alias_name if isinstance(i.alias_name, list) else [i.alias_name])
+        }
+        return tests
+
+    def setUp(self) -> None:
+        """
+        Set up the test environment before each test.
+
+        Returns:
+            None
+        """
+        test_scenarios = TestFactory.test_scenarios()
+        self.available_tests = {
+            test: list(scenarios.keys()) for test, scenarios in test_scenarios.items()
+        }
+
+        self.perturbations_list = self.available_tests["robustness"]
+        self.supported_tests = self.available_test()
+        self.samples = {
+            "question-answering": [
+                QASample(
+                    original_question="What is John Snow Labs?",
+                    original_context="John Snow Labs is a healthcare company specializing in accelerating progress in data science.",
+                )
+            ],
+            "summarization": [
+                SummarizationSample(
+                    original="John Snow Labs is a healthcare company specializing in accelerating progress in data science.",
+                )
+            ],
+        }
+
+    def test(self) -> None:
+        """
+        Test QA and summarization sample for robustness classes.
+
+        Returns:
+            None
+        """
+        prob = 1.0
+        for test in self.perturbations_list:
+            for task in self.samples:
+                sample = self.samples[task][-1]
+                test_func = self.supported_tests[test].transform
+
+                if test not in [
+                    "swap_entities",
+                    "american_to_british",
+                    "british_to_american",
+                    "add_context",
+                    "multiple_perturbations",
+                ]:
+                    sample.transform(test_func, {}, prob)
+                elif test in ["american_to_british", "british_to_american"]:
+                    sample.transform(test_func, {"accent_map": A2B_DICT}, prob)
+                elif test == "add_context":
+                    sample.transform(
+                        test_func,
+                        {
+                            "ending_context": ["Bye", "Reported"],
+                            "starting_context": ["Hi", "Good morning", "hello"],
+                        },
+                        prob,
+                    )
+                elif test == "multiple_perturbations":
+                    sample.transform(
+                        test_func,
+                        {},
+                        prob,
+                        perturbations=[
+                            "lowercase",
+                            "add_ocr_typo",
+                            "titlecase",
+                            "number_to_word",
+                        ],
+                    )
+
+                if task == "question-answering":
+                    assert (
+                        sample.perturbed_question is not None
+                        and sample.perturbed_context is not None
+                    )
+                else:
+                    assert sample.test_case is not None
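
Reviewer note (not part of the patch): the sketch below illustrates the new call convention these changes introduce. Each accuracy `transform` is now a classmethod that takes the test name as its first argument and validates it against the class's `alias_name` list before reading `params`. The happy path mirrors what `tests/test_accuracy.py` exercises; `MinBLEUcore` is used here only as a representative subclass, and the exact contents of the returned samples beyond what the diff shows are an assumption.

```python
# Illustrative sketch only, not part of the patch; mirrors the call
# convention exercised in tests/test_accuracy.py.
import pandas as pd

from langtest.transform.accuracy import MinBLEUcore

# Expected results as a pandas Series, as the Harness passes them in.
y_true = pd.Series(["John Snow Labs is a healthcare company."])

# New signature: the test name comes first and must appear in alias_name.
samples = MinBLEUcore.transform("min_bleu_score", y_true, {"min_score": 0.66})
assert isinstance(samples, list)  # elements are MinScoreSample instances

# A name that is not in alias_name now fails fast with an AssertionError,
# instead of silently tagging samples with the wrong test_type.
try:
    MinBLEUcore.transform("min_f1_score", y_true, {"min_score": 0.66})
except AssertionError as err:
    print(err)  # Parameter 'test' should be in: ['min_bleu_score'], got 'min_f1_score'
```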