From d6156c6c202ad30b1806b440016eee3e95f48a63 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Tue, 8 Aug 2023 18:23:07 +0530 Subject: [PATCH 1/6] test(test_robustness): setup new test class --- tests/test_robustness.py | 50 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/test_robustness.py b/tests/test_robustness.py index 8b40bb0e6..11f91a3dd 100644 --- a/tests/test_robustness.py +++ b/tests/test_robustness.py @@ -2,6 +2,8 @@ from langtest.transform.robustness import * from langtest.transform.constants import A2B_DICT from langtest.utils.custom_types import SequenceClassificationSample +from langtest.utils.custom_types.sample import QASample, SummarizationSample +from langtest.transform import TestFactory class RobustnessTestCase(unittest.TestCase): @@ -434,3 +436,51 @@ def test_random_age(self) -> None: self.assertIsInstance(transformed_samples, list) for sample in transformed_samples: self.assertNotEqual(sample.test_case, sample.original) + + +class RobustnessTestCaseQaAndSummarization(unittest.TestCase): + """ + A test case class for testing QA and summarization samples on robustness classes. + """ + + def available_test(self) -> dict: + """ + Get a dictionary of available robustness tests. + + Returns: + dict: A dictionary containing available robustness tests. + """ + tests = { + j: i + for i in BaseRobustness.__subclasses__() + for j in (i.alias_name if isinstance(i.alias_name, list) else [i.alias_name]) + } + return tests + + def setUp(self) -> None: + """ + Set up the test environment before each test. + + Returns: + None + """ + test_scenarios = TestFactory.test_scenarios() + self.available_tests = { + test: list(scenarios.keys()) for test, scenarios in test_scenarios.items() + } + + self.perturbations_list = self.available_tests["robustness"] + self.supported_tests = self.available_test() + self.samples = { + "question-answering": [ + QASample( + original_question="What is John Snow Labs?", + original_context="John Snow Labs is a healthcare company specializing in accelerating progress in data science.", + ) + ], + "summarization": [ + SummarizationSample( + original="John Snow Labs is a healthcare company specializing in accelerating progress in data science.", + ) + ], + } From 4a0971a21f056c5ff6fe6721bdfec68d9b003ae9 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Tue, 8 Aug 2023 18:34:28 +0530 Subject: [PATCH 2/6] test(test_robustness.py): added unittest --- tests/test_robustness.py | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/test_robustness.py b/tests/test_robustness.py index 11f91a3dd..ff4b0e481 100644 --- a/tests/test_robustness.py +++ b/tests/test_robustness.py @@ -484,3 +484,56 @@ def setUp(self) -> None: ) ], } + + def test(self) -> None: + """ + Test QA and summarization sample for robustness classes. + + Returns: + None + """ + prob = 1.0 + for test in self.perturbations_list: + for s in self.samples: + sample = self.samples[s][-1] + test_func = self.supported_tests[test].transform + + if test not in [ + "swap_entities", + "american_to_british", + "british_to_american", + "add_context", + "multiple_perturbations", + ]: + sample.transform(test_func, {}, prob) + elif test in ["american_to_british", "british_to_american"]: + sample.transform(test_func, {"accent_map": A2B_DICT}, prob) + elif test == "add_context": + sample.transform( + test_func, + { + "ending_context": ["Bye", "Reported"], + "starting_context": ["Hi", "Good morning", "hello"], + }, + prob, + ) + elif test == "multiple_perturbations": + sample.transform( + test_func, + {}, + prob, + perturbations=[ + "lowercase", + "add_ocr_typo", + "titlecase", + "number_to_word", + ], + ) + + if s == "question-answering": + assert ( + sample.perturbed_question is not None + and sample.perturbed_context is not None + ) + else: + assert sample.test_case is not None From c93d40903679aeb64f078a7c6cf8b60098306ee3 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Tue, 8 Aug 2023 18:43:59 +0530 Subject: [PATCH 3/6] refacto(accuracy.py): Added checks --- langtest/transform/accuracy.py | 50 +++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index 3c12d5bbb..133ca238a 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -19,7 +19,7 @@ class BaseAccuracy(ABC): alias_name = None supported_tasks = ["ner", "text-classification"] - @staticmethod + @classmethod @abstractmethod def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: """Abstract method that implements the accuracy measure. @@ -77,10 +77,12 @@ class MinPrecisionScore(BaseAccuracy): transform(y_true, y_pred) -> Any: Creates accuracy test results. """ - alias_name = "min_precision_score" + alias_name = ["min_precision_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum precision score for the given data. Args: @@ -90,6 +92,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: Precision test results. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" + labels = set(y_true) # .union(set(y_pred)) if isinstance(params["min_score"], dict): @@ -149,10 +155,12 @@ class MinRecallScore(BaseAccuracy): transform(y_true, y_pred) -> Any: Creates accuracy test results. """ - alias_name = "min_recall_score" + alias_name = ["min_recall_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum recall score for the given data. Args: @@ -162,6 +170,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: minimum recall results. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" + labels = set(y_true) # .union(set(y_pred)) if isinstance(params["min_score"], dict): @@ -221,10 +233,12 @@ class MinF1Score(BaseAccuracy): """ - alias_name = "min_f1_score" + alias_name = ["min_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -234,6 +248,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: F1 score test results. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" + labels = set(y_true) if isinstance(params["min_score"], dict): @@ -292,10 +310,12 @@ class MinMicroF1Score(BaseAccuracy): alias_name (str): The name for config. """ - alias_name = "min_micro_f1_score" + alias_name = ["min_micro_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum micro F1 score for the given data. Args: @@ -305,6 +325,10 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum micro F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" + min_score = params["min_score"] sample = MinScoreSample( From 38f8251276a4730b00044835d20316bdab5663f4 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Tue, 8 Aug 2023 18:49:38 +0530 Subject: [PATCH 4/6] refacto accuracy.py --- langtest/transform/__init__.py | 4 +-- langtest/transform/accuracy.py | 55 ++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/langtest/transform/__init__.py b/langtest/transform/__init__.py index c41a576c2..309d92617 100644 --- a/langtest/transform/__init__.py +++ b/langtest/transform/__init__.py @@ -1068,9 +1068,9 @@ def transform(self) -> List[Sample]: ) y_true = y_true.dropna() - params["test_name"] = test_name + transformed_samples = self.supported_tests[test_name].transform( - y_true, params + test_name, y_true, params ) for sample in transformed_samples: diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index 133ca238a..4734f2a4b 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -375,10 +375,12 @@ class MinMacroF1Score(BaseAccuracy): transform(y_true, params) -> Any: Creates accuracy test results. """ - alias_name = "min_macro_f1_score" + alias_name = ["min_macro_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum macro F1 score for the given data. Args: @@ -388,6 +390,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum macro F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -433,10 +438,12 @@ class MinWeightedF1Score(BaseAccuracy): transform(y_true, params) -> Any: Creates accuracy test results. """ - alias_name = "min_weighted_f1_score" + alias_name = ["min_weighted_f1_score"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum weighted F1 score for the given data. Args: @@ -446,6 +453,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -490,11 +500,13 @@ class MinEMcore(BaseAccuracy): transform(y_true, y_pred) -> Any: Creates accuracy test results. """ - alias_name = "min_exact_match_score" + alias_name = ["min_exact_match_score"] supported_tasks = ["question-answering", "summarization"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -504,6 +516,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -552,11 +567,13 @@ class MinBLEUcore(BaseAccuracy): transform(y_true, y_pred) -> Any: Creates accuracy test results. """ - alias_name = "min_bleu_score" + alias_name = ["min_bleu_score"] supported_tasks = ["question-answering", "summarization"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -566,6 +583,9 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( @@ -622,8 +642,10 @@ class MinROUGEcore(BaseAccuracy): ] supported_tasks = ["question-answering", "summarization"] - @staticmethod - def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: + @classmethod + def transform( + cls, test: str, y_true: List[Any], params: Dict + ) -> List[MinScoreSample]: """Computes the minimum F1 score for the given data. Args: @@ -634,11 +656,14 @@ def transform(y_true: List[Any], params: Dict) -> List[MinScoreSample]: Returns: List[MinScoreSample]: The transformed data based on the minimum F1 score. """ + assert ( + test in cls.alias_name + ), f"Parameter 'test' should be in: {cls.alias_name}, got '{test}'" min_score = params["min_score"] sample = MinScoreSample( category="accuracy", - test_type=params["test_name"], + test_type=test, expected_results=MinScoreOutput(min_score=min_score), ) From 81e589fe8b464d13bbc33f280dabd5c4072670e4 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Tue, 8 Aug 2023 18:54:28 +0530 Subject: [PATCH 5/6] pytest: test_accuracy.py --- tests/test_accuracy.py | 129 +++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 42 deletions(-) diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py index 1b2d19c8b..ccb855ae3 100644 --- a/tests/test_accuracy.py +++ b/tests/test_accuracy.py @@ -1,51 +1,96 @@ -import unittest +import pytest import pandas as pd -from langtest import Harness +from langtest.utils.custom_types import SequenceLabel, Span +from langtest.utils.custom_types.output import ( + NEROutput, + NERPrediction, + SequenceClassificationOutput, +) +from langtest.utils.custom_types.sample import ( + NERSample, + QASample, + SequenceClassificationSample, + SummarizationSample, +) -class AccuracyTestCase(unittest.TestCase): +class TestAccuracy: """ - A test case for accuracy evaluation. - - This test case performs accuracy evaluation using the `Harness` class from `langtest` package. - It tests the accuracy of a named entity recognition (NER) model using a sample dataset in CoNLL format. - - Attributes: - h_spacy (Harness): The `Harness` instance for evaluating the model. - report (pd.DataFrame): The evaluation report generated by the `Harness`. + A test suite for evaluating accuracy classes. """ - def setUp(self) -> None: - """ - Set up the test case. - - This method initializes the `Harness` instance with the necessary configuration for evaluating the model. - It configures the test settings, such as the minimum pass rate and minimum F1 score, for accuracy evaluation. - """ - - self.h_spacy = Harness( - task="ner", - model="en_core_web_sm", - data="langtest/data/conll/sample.conll", - hub="spacy", - ) - self.h_spacy.configure( - { - "tests": { - "defaults": { - "min_pass_rate": 0.65, - }, - "accuracy": {"min_f1_score": {"min_score": 0.65}}, - } - } - ) - self.report = self.h_spacy.generate().run().report() + accuracy_config = { + "min_precision_score": {"min_score": 0.66}, + "min_recall_score": {"min_score": 0.60}, + "min_f1_score": {"min_score": 0.60}, + "min_micro_f1_score": {"min_score": 0.60}, + "min_macro_f1_score": {"min_score": 0.60}, + "min_weighted_f1_score": {"min_score": 0.60}, + "min_bleu_score": {"min_score": 0.66}, + "min_exact_match_score": {"min_score": 0.60}, + "min_rouge1_score": {"min_score": 0.60}, + "min_rouge2_score": {"min_score": 0.60}, + "min_rougeL_score": {"min_score": 0.60}, + "min_rougeLsum_score": {"min_score": 0.60}, + } - def test_report(self): - """ - Test the evaluation report. + @pytest.fixture + def sample_data(self): + """A fixture providing sample data for testing. - This method asserts that the evaluation report generated by the `Harness` is an instance of `pd.DataFrame`. + Returns: + dict: A dictionary containing sample data for different tasks. """ - - self.assertIsInstance(self.report, pd.DataFrame) + return { + "text-classification": [ + SequenceClassificationSample( + original="The last good ernest movie, and the best at that. how can you not laugh at least once at this movie. the last line is a classic, as is ernest's gangster impressions, his best moment on film. this has his best lines and is a crowning achievement among the brainless screwball comedies.", + expected_results=SequenceClassificationOutput( + predictions=[SequenceLabel(label="Positive", score=1.0)] + ), + ), + SequenceClassificationSample( + original="After my 6 year old daughter began taking riding lessons I started looking for horse movies for her. I had always heard of National Velvet but had never seen it. Boy am I glad I bought it! It's become a favorite of mine, my 6 year old AND my 2 year old. It's a shame movies like this aren't made anymore.", + expected_results=SequenceClassificationOutput( + predictions=[SequenceLabel(label="Positive", score=1.0)] + ), + ), + ], + "ner": [ + NERSample( + original="Attendance : 3,000", + expected_results=NEROutput( + predictions=[ + NERPrediction( + entity="CARDINAL", + span=Span(start=13, end=18, word="3,000"), + ) + ] + ), + ), + NERSample( + original="I do not love KFC", + expected_results=NEROutput( + predictions=[ + NERPrediction( + entity="PROD", span=Span(start=14, end=17, word="KFC") + ) + ] + ), + ), + ], + "question-answering": [ + QASample( + original_question="What is John Snow Labs?", + original_context="John Snow Labs is a healthcare company specializing in accelerating progress in data science.", + expected_results="A healthcare company specializing in accelerating progress in data science. ", + ) + ], + "summarization": [ + SummarizationSample( + original="John Snow Labs is a healthcare company specializing in accelerating progress in data " + "science.", + expected_results="JSL is a data science company", + ) + ], + } From 2ef2e7a7d6e51dcb525ede9c03ca96bf0de9c1b5 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Tue, 8 Aug 2023 19:04:28 +0530 Subject: [PATCH 6/6] test(test_accuracy.py):pytest for accuracy --- tests/test_accuracy.py | 67 ++++++++++++++++++++++++++++++++++++++++ tests/test_robustness.py | 6 ++-- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py index ccb855ae3..db1c2cc0a 100644 --- a/tests/test_accuracy.py +++ b/tests/test_accuracy.py @@ -1,5 +1,16 @@ import pytest import pandas as pd +from langtest.transform.accuracy import ( + BaseAccuracy, + MinPrecisionScore, + MinF1Score, + MinMicroF1Score, + MinMacroF1Score, + MinWeightedF1Score, + MinEMcore, + MinBLEUcore, + MinROUGEcore, +) from langtest.utils.custom_types import SequenceLabel, Span from langtest.utils.custom_types.output import ( NEROutput, @@ -7,6 +18,7 @@ SequenceClassificationOutput, ) from langtest.utils.custom_types.sample import ( + MinScoreSample, NERSample, QASample, SequenceClassificationSample, @@ -94,3 +106,58 @@ def sample_data(self): ) ], } + + @pytest.mark.parametrize( + "accuracy", + [ + MinPrecisionScore, + MinF1Score, + MinMicroF1Score, + MinMacroF1Score, + MinWeightedF1Score, + MinEMcore, + MinBLEUcore, + MinROUGEcore, + ], + ) + def test_transform(self, accuracy: BaseAccuracy, sample_data) -> None: + """Test the transform method of accuracy-related classes. + + Args: + accuracy (BaseAccuracy): An accuracy-related class to test. + sample_data (dict): Sample data for different tasks. + + Returns: + None + """ + for alias in accuracy.alias_name: + for task in accuracy.supported_tasks: + if task == "text-classification": + y_true = ( + pd.Series(sample_data["text-classification"]) + .apply( + lambda x: [y.label for y in x.expected_results.predictions] + ) + .explode() + ) + elif task == "ner": + y_true = pd.Series(sample_data["ner"]).apply( + lambda x: [y.entity for y in x.expected_results.predictions] + ) + y_true = y_true.explode().apply( + lambda x: x.split("-")[-1] if isinstance(x, str) else x + ) + + else: + y_true = ( + pd.Series(sample_data[task]) + .apply(lambda x: x.expected_results) + .explode() + ) + transform_results = accuracy.transform( + alias, y_true, self.accuracy_config[alias] + ) + assert isinstance(transform_results, list) + + for _, result in zip(y_true, transform_results): + assert isinstance(result, MinScoreSample) diff --git a/tests/test_robustness.py b/tests/test_robustness.py index ff4b0e481..547954323 100644 --- a/tests/test_robustness.py +++ b/tests/test_robustness.py @@ -494,8 +494,8 @@ def test(self) -> None: """ prob = 1.0 for test in self.perturbations_list: - for s in self.samples: - sample = self.samples[s][-1] + for task in self.samples: + sample = self.samples[task][-1] test_func = self.supported_tests[test].transform if test not in [ @@ -530,7 +530,7 @@ def test(self) -> None: ], ) - if s == "question-answering": + if task == "question-answering": assert ( sample.perturbed_question is not None and sample.perturbed_context is not None