Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
6b1d47f
updated model as dict
Prikshit7766 Aug 11, 2023
8ed0f49
Refacto(langtest.py): Updated Model as a list of dict
RakshitKhajuria Aug 11, 2023
a412253
test(translation): Updated model type
RakshitKhajuria Aug 11, 2023
e1e0a89
ner_pipeline.py: updated model type
Prikshit7766 Aug 11, 2023
82dfc86
test(augmentation): Updated model type
RakshitKhajuria Aug 11, 2023
33baf23
test(test_harness.py):updated params
Prikshit7766 Aug 11, 2023
feba035
Merge branch 'refacto/model-as-a-dict' of https://github.com/JohnSnow…
Prikshit7766 Aug 11, 2023
7884cc2
test(performance): Updated model as a dict
RakshitKhajuria Aug 11, 2023
ca8a12f
test(spacy): Updated model as a dict
RakshitKhajuria Aug 11, 2023
357e6d4
test(test_mlflow.py):updated params
Prikshit7766 Aug 11, 2023
a405d4b
test(test_sparknlp_model.py):updated params
Prikshit7766 Aug 11, 2023
ad5d06b
refacto(datasource) : Added data_source param
RakshitKhajuria Aug 11, 2023
762359a
langtest.py: updated data param as dict
Prikshit7766 Aug 11, 2023
34f4848
tests : updated data param as a dict
RakshitKhajuria Aug 12, 2023
de0aed4
ner_pipeline : updated data param as a dict
RakshitKhajuria Aug 12, 2023
270cb06
Test: updated data as a dict
Prikshit7766 Aug 12, 2023
f19cf82
test(test_harness.py): updated params
Prikshit7766 Aug 13, 2023
03bcb18
Merge branch 'release/1.3.0' of https://github.com/JohnSnowLabs/langt…
Prikshit7766 Aug 14, 2023
e727c43
fix : Lint
RakshitKhajuria Aug 14, 2023
2516435
fix: formatting
RakshitKhajuria Aug 14, 2023
e0e5344
Merge branch 'refacto/model-as-a-dict' of https://github.com/JohnSnow…
Prikshit7766 Aug 15, 2023
1895b3e
updated param
Prikshit7766 Aug 15, 2023
9450bc6
updated load method of Harness
Prikshit7766 Aug 15, 2023
b382322
updated param in langtest.py and augmentation/__init__.py
Prikshit7766 Aug 15, 2023
3b9051c
fix: Datafactory file_path
RakshitKhajuria Aug 15, 2023
0b64e52
fix: DataFactory file_path as a dict object
RakshitKhajuria Aug 15, 2023
ea591ec
ner_pipeline.py: param updated
Prikshit7766 Aug 15, 2023
491ac17
langtest.py: updated self.data as list
Prikshit7766 Aug 15, 2023
721aba5
langtest.py: updated the condition
Prikshit7766 Aug 15, 2023
57cd30f
fix: Bias datasets are working
RakshitKhajuria Aug 17, 2023
c343d79
Merge branch 'refacto/model-as-a-dict' of https://github.com/JohnSnow…
RakshitKhajuria Aug 17, 2023
5125d2a
added source param for HuggingFaceDataset
Prikshit7766 Aug 17, 2023
538063e
updated augmentation/__init__.py
Prikshit7766 Aug 17, 2023
2a9fea9
test(test_augmentation.py):added source param
Prikshit7766 Aug 17, 2023
1affd08
datasource.py: added ValueError
Prikshit7766 Aug 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions langtest/augmentation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ def fix(
Returns:
List[Dict[str, Any]]: A list of augmented data samples.
"""
if "." not in training_data["data_source"]:

if "source" in training_data and training_data["source"] == "huggingface":
self.df = HuggingFaceDataset(training_data["data_source"], self.task)
data = self.df.load_data(
feature_column=training_data.get("feature_column", "text"),
Expand All @@ -122,7 +123,7 @@ def fix(
subset=training_data.get("subset", None),
)
else:
self.df = DataFactory(training_data["data_source"], self.task)
self.df = DataFactory(training_data, self.task)
data = self.df.load()
TestFactory.is_augment = True
supported_tests = TestFactory.test_scenarios()
Expand Down Expand Up @@ -345,7 +346,7 @@ def fix(
Returns:
bool: Returns True upon successful completion of the method.
"""
df = DataFactory(training_data["data_source"], self.__task)
df = DataFactory(training_data, self.__task)
data = df.load()
new_data = []
self.__search_results = self.search_sample_results(data)
Expand Down
13 changes: 10 additions & 3 deletions langtest/datahandler/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,21 @@ class DataFactory:
correct Dataset type based on the file extension.
"""

def __init__(self, file_path: str, task: str, **kwargs) -> None:
def __init__(self, file_path: dict, task: str, **kwargs) -> None:
"""Initializes DataFactory object.

Args:
file_path (str): Path to the dataset.
file_path (dict): Dictionary containing 'data_source' key with the path to the dataset.
task (str): Task to be evaluated.
"""
self._file_path = file_path
if not isinstance(file_path, dict):
raise ValueError("'file_path' must be a dictionary.")

if "data_source" not in file_path:
raise ValueError(
"The 'data_source' key must be provided in the 'file_path' dictionary."
)
self._file_path = file_path.get("data_source")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If `data_source` is not provided, this should raise an error.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make these changes, @Prikshit7766 @RakshitKhajuria.

self._class_map = {
cls.__name__.replace("Dataset", "").lower(): cls
for cls in _IDataset.__subclasses__()
Expand Down
155 changes: 92 additions & 63 deletions langtest/langtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import pickle
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union
from typing import Dict, List, Optional, Union

import pandas as pd
import yaml
Expand Down Expand Up @@ -91,18 +91,18 @@ class Harness:
def __init__(
self,
task: str,
model: Optional[Union[str, Any]] = None,
hub: Optional[str] = None,
data: Optional[Union[str, dict]] = None,
model: Optional[Union[list, dict]] = None,
data: Optional[dict] = None,
config: Optional[Union[str, dict]] = None,
):
"""Initialize the Harness object.

Args:
task (str, optional): Task for which the model is to be evaluated.
model (str | ModelFactory): ModelFactory object or path to the model to be evaluated.
hub (str, optional): model hub to load from the path. Required if path is passed as 'model'.
data (str, optional): Path to the data to be used for evaluation.
model (list | dict, optional): Specifies the model to be evaluated.
If provided as a list, each element should be a dictionary with 'model' and 'hub' keys.
If provided as a dictionary, it must contain 'model' and 'hub' keys when specifying a path.
data (dict, optional): The data to be used for evaluation.
config (str | dict, optional): Configuration for the tests to be performed.

Raises:
Expand All @@ -111,8 +111,27 @@ def __init__(
super().__init__()

self.is_default = False
self._actual_model = model
self.hub = hub

if isinstance(model, list):
for item in model:
if not isinstance(item, dict):
raise ValueError("Each item in the list must be a dictionary")
if "model" not in item or "hub" not in item:
raise ValueError(
"Each dictionary in the list must have 'model' and 'hub' keys"
)
elif isinstance(model, dict):
if "model" not in model or "hub" not in model:
raise ValueError("The dictionary must have 'model' and 'hub' keys")
else:
raise ValueError("Invalid 'model' parameter type")

if isinstance(model, dict):
hub, model = model["hub"], model["model"]
self.hub = hub
self._actual_model = model
else:
hub = None

if task not in self.SUPPORTED_TASKS:
raise ValueError(
Expand All @@ -133,7 +152,7 @@ def __init__(

if data is None and (task, model, hub) in self.DEFAULTS_DATASET:
data_path = os.path.join("data", self.DEFAULTS_DATASET[(task, model, hub)])
data = resource_filename("langtest", data_path)
data = {"data_source": resource_filename("langtest", data_path)}
self.data = DataFactory(data, task=self.task).load()
if model == "textcat_imdb":
model = resource_filename("langtest", "data/textcat_imdb")
Expand All @@ -142,61 +161,60 @@ def __init__(

elif (
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_CLASSIFICATION
and task == "text-classification"
and "source" in data
and data["source"] == "huggingface"
):
self.data = (
HuggingFaceDataset(data["name"], task=task).load_data(
if (
task == "text-classification"
and hub in self.SUPPORTED_HUBS_HF_DATASET_CLASSIFICATION
):
self.data = HuggingFaceDataset(data["data_source"], task=task).load_data(
feature_column=data.get("feature_column", "text"),
target_column=data.get("target_column", "label"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)
if data is not None
else None
)

if hub == "spacy" and (model == "textcat_imdb" or model is None):
if model is None:
logging.warning(
"Using the default 'textcat_imdb' model for Spacy hub. Please provide a custom model path if desired."
)
model = resource_filename("langtest", "data/textcat_imdb")
if hub == "spacy" and (model == "textcat_imdb" or model is None):
if model is None:
logging.warning(
"Using the default 'textcat_imdb' model for Spacy hub. Please provide a custom model path if desired."
)
model = resource_filename("langtest", "data/textcat_imdb")

elif (
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_NER
and task == "ner"
):
self.data = HuggingFaceDataset(data["name"], task=task).load_data(
feature_column=data.get("feature_column", "tokens"),
target_column=data.get("target_column", "ner_tags"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)
elif task == "ner" and hub in self.SUPPORTED_HUBS_HF_DATASET_NER:
self.data = HuggingFaceDataset(data["data_source"], task=task).load_data(
feature_column=data.get("feature_column", "tokens"),
target_column=data.get("target_column", "ner_tags"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)

elif (
isinstance(data, dict)
and hub in self.SUPPORTED_HUBS_HF_DATASET_SUMMARIZATION
and task == "summarization"
):
self.data = HuggingFaceDataset(data["name"], task=task).load_data(
feature_column=data.get("feature_column", "document"),
target_column=data.get("target_column", "summary"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)
elif (
task == "summarization"
and hub in self.SUPPORTED_HUBS_HF_DATASET_SUMMARIZATION
):
self.data = HuggingFaceDataset(data["data_source"], task=task).load_data(
feature_column=data.get("feature_column", "document"),
target_column=data.get("target_column", "summary"),
split=data.get("split", "test"),
subset=data.get("subset", None),
)

elif data is None and (task, model, hub) not in self.DEFAULTS_DATASET.keys():
raise ValueError(
"You haven't specified any value for the parameter 'data' and the configuration you "
"passed is not among the default ones. You need to either specify the parameter 'data' "
"or use a default configuration."
)
elif isinstance(data, list):
self.data = data
elif isinstance(data["data_source"], list):
self.data = data["data_source"]
else:
self.file_path = data
if "data_source" not in data:
raise ValueError(
"The 'data_source' key must be provided in the 'data' parameter."
)
self.file_path = data["data_source"]
self.data = (
DataFactory(data, task=self.task).load() if data is not None else None
)
Expand All @@ -221,13 +239,19 @@ def __init__(
path=model, task=task, hub=hub, **self._config.get("model_parameters", {})
)

elif type(model) == dict:
elif isinstance(model, list):
model_dict = {}
for k, v in model.items():
model_dict[k] = ModelFactory.load_model(
task=task, path=k, hub=v, **self._config.get("model_parameters", {})
for i in model:
model = i["model"]
hub = i["hub"]

model_dict[model] = ModelFactory.load_model(
path=model,
task=task,
hub=hub,
**self._config.get("model_parameters", {}),
)
self.model = model_dict
self.model = model_dict

else:
self.model = ModelFactory(
Expand All @@ -241,7 +265,7 @@ def __init__(
print("Test Configuration : \n", formatted_config)

global GLOBAL_MODEL
if not isinstance(model, dict):
if not isinstance(model, list):
GLOBAL_MODEL = self.model

self._testcases = None
Expand Down Expand Up @@ -290,13 +314,17 @@ def configure(self, config: Union[str, dict]) -> dict:
**self._config.get("model_parameters", {}),
)

elif isinstance(model, dict):
elif isinstance(model, list):
model_dict = {}
for k, v in model.items():
model_dict[k] = ModelFactory.load_model(

for i in model:
model = i["model"]
hub = i["hub"]

model_dict[model] = ModelFactory.load_model(
path=model,
task=task,
path=k,
hub=v,
hub=hub,
**self._config.get("model_parameters", {}),
)
self.model = model_dict
Expand Down Expand Up @@ -1017,9 +1045,8 @@ def load(

harness = Harness(
task=task,
model=model,
data=data,
hub=hub,
model={"model": model, "hub": hub},
data={"data_source": data},
config=os.path.join(save_dir, "config.yaml"),
)
harness.generate()
Expand Down Expand Up @@ -1050,7 +1077,9 @@ def import_edited_testcases(self, input_path: str, **kwargs):
if sample.category not in ["robustness", "bias"]
]

self._testcases = DataFactory(input_path, task=self.task, is_import=True).load()
self._testcases = DataFactory(
{"data_source": input_path}, task=self.task, is_import=True
).load()
self._testcases.extend(temp_testcases)

return self
Expand Down
15 changes: 9 additions & 6 deletions langtest/pipelines/transformers/ner_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,12 @@ def setup(self):
self.hub = "huggingface"
self.output_dir = "checkpoints/"

self.train_datasource = DataFactory(file_path=self.train_data, task=self.task)
self.eval_datasource = DataFactory(file_path=self.eval_data, task=self.task)
self.train_datasource = DataFactory(
file_path={"data_source": self.train_data}, task=self.task
)
self.eval_datasource = DataFactory(
file_path={"data_source": self.eval_data}, task=self.task
)

self.next(self.train)

Expand Down Expand Up @@ -153,9 +157,8 @@ def test(self):
"""Performs the testing procedure of the model on a set of tests using langtest"""
self.harness = Harness(
task=self.task,
model=self.output_dir,
hub=self.hub,
data=self.train_data,
model={"model": self.output_dir, "hub": self.hub},
data={"data_source": self.train_data},
)
if self.config:
self.harness.configure(self.config)
Expand Down Expand Up @@ -184,7 +187,7 @@ def augment(self):
def retrain(self):
"""Performs the training procedure using the augmented data created by langtest"""
self.augmented_train_datasource = DataFactory(
file_path=self.path_augmented_file, task=self.task
file_path={"data_source": self.path_augmented_file}, task=self.task
)
samples = self.augmented_train_datasource.load_raw()

Expand Down
Loading