Merged

16 commits
c2531aa
fix: update OpenAI embedding initialization and response handling due…
chakravarthik27 Dec 27, 2025
6112375
fix: linting and formatting issues.
chakravarthik27 Dec 27, 2025
a85f122
Merge pull request #1229 from Pacific-AI-Corp/fix/embedding-based-eva…
chakravarthik27 Jan 20, 2026
65053c9
Update Python and mlflow dependencies in pyproject.toml
chakravarthik27 Mar 21, 2026
08851d3
fixed the lint issues.
chakravarthik27 Mar 21, 2026
f2cfc01
Update pyproject-flake8 dependency to version 7.0.0
chakravarthik27 Mar 21, 2026
50790c7
fix: add verbose flag to poetry install command for better logging
chakravarthik27 Mar 21, 2026
efd6362
updated the poetry.lock file
chakravarthik27 Mar 21, 2026
91fd6b8
fix: update scipy version to 1.17.1 for improved compatibility and se…
chakravarthik27 Mar 21, 2026
d2312a0
fix: update TestConfig structure to support in python 3.13 above vers…
chakravarthik27 Mar 21, 2026
97d58bd
fix: refactor TestConfig structure in fairness and robustness modules
chakravarthik27 Mar 21, 2026
d513d93
fix: refactor TypedDict definitions in representation classes
chakravarthik27 Mar 21, 2026
d3453bd
fix: update TypedDict definitions for parameters and TestConfig in ro…
chakravarthik27 Mar 21, 2026
f1e558e
fix: update TestConfig structure
chakravarthik27 Mar 21, 2026
8c27241
updated: dependencies and improved the tests time with xdist
chakravarthik27 Mar 23, 2026
61b4d5d
Merge pull request #1233 from PacificAI/fix/vulnerabilities-and-secur…
chakravarthik27 Mar 23, 2026
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.9","3.10", "3.11" ]
+        python-version: ["3.12", "3.13" ]

     steps:
       - name: Free up disk space at start
@@ -53,7 +53,7 @@ jobs:
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
         run: |
           poetry cache clear pypi --all -n > /dev/null
-          poetry install --with dev --all-extras --no-cache --quiet --no-interaction
+          poetry install --with dev --all-extras --no-cache --no-interaction
           source ./.venv/bin/activate && pip uninstall -y pyspark && rm -rf ./.venv/lib/python${{ matrix.python-version }}/site-packages/pyspark*/
           pip install pyspark==3.5.6
2 changes: 1 addition & 1 deletion .github/workflows/llm_tests_build.yml
@@ -10,7 +10,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.12", "3.13" ]

     steps:
       - uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: [3.12]
         poetry-version: [2.1.3]
         os: [ubuntu-latest]
     runs-on: ${{ matrix.os }}
2 changes: 1 addition & 1 deletion langtest/datahandler/datasource.py
@@ -808,7 +808,7 @@ def load_raw_data(self, standardize_columns: bool = False) -> List[Dict]:
             parsed CSV file into list of dicts
         """

-        if type(self._file_path) == dict:
+        if isinstance(self._file_path, dict):
             df = pd.read_csv(self._file_path["data_source"])

             if self.task == "text-classification":
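A note on the `type(...) == dict` → `isinstance` change here (and the matching edits in `langtest.py` and `transform/base.py` below): the exact-type comparison fails for `dict` subclasses and is what flake8 flags as E721, while `isinstance` honors subclassing. A minimal sketch of the difference:

```python
from collections import OrderedDict

path = OrderedDict(data_source="train.csv")  # OrderedDict subclasses dict

print(type(path) == dict)      # False: exact-type check ignores subclasses
print(isinstance(path, dict))  # True: isinstance follows the inheritance chain
```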
16 changes: 10 additions & 6 deletions langtest/embeddings/openai.py
@@ -10,15 +10,15 @@
 class OpenaiEmbeddings:
     LIB_NAME = "openai"

-    def __init__(self, model="text-embedding-ada-002"):
+    def __init__(self, model="text-embedding-3-small"):
         self.model = model
         self.api_key = os.environ.get("OPENAI_API_KEY")
         self.openai = None
         self._check_openai_package()
         if not self.api_key:
             raise ValueError(Errors.E032())

-        self.openai.api_key = self.api_key
+        # self.openai.api_key = self.api_key

     def _check_openai_package(self):
         """Check if the 'openai' package is installed and import the required functions.
@@ -44,13 +44,17 @@ def get_embedding(
             list[float]: A list of floating-point values representing the text's embedding.
         """
         if isinstance(text, list):
-            response = self.openai.Embedding.create(input=text, model=self.model)
+            response = self.openai.Client(api_key=self.api_key).embedding.create(
+                input=text, model=self.model
+            )
             embedding = [
-                np.array(response["data"][i]["embedding"]).reshape(1, -1)
+                np.array(response.data[i].embedding).reshape(1, -1)
                 for i in range(len(text))
             ]
             return embedding
         else:
-            response = self.openai.Embedding.create(input=[text], model=self.model)
-            embedding = np.array(response["data"][0]["embedding"]).reshape(1, -1)
+            response = self.openai.Client(api_key=self.api_key).embedding.create(
+                input=[text], model=self.model
+            )
+            embedding = np.array(response.data[0].embedding).reshape(1, -1)
             return embedding
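For context, the `openai` 1.x SDK replaced the module-level `openai.Embedding.create` with a client object and typed response objects. A minimal sketch of the 1.x call this change migrates toward (the model name is just an example; note that the 1.x client exposes the resource as `client.embeddings`, plural):

```python
import os

import numpy as np
from openai import OpenAI  # requires openai>=1.0

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
response = client.embeddings.create(
    input=["a sample sentence"],
    model="text-embedding-3-small",
)
# 1.x responses are typed objects, not dicts, hence attribute access
# (response.data[0].embedding) instead of response["data"][0]["embedding"].
vector = np.array(response.data[0].embedding).reshape(1, -1)
```

Constructing the client once in `__init__` rather than on every `get_embedding` call would also avoid rebuilding it per request.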
2 changes: 1 addition & 1 deletion langtest/langtest.py
@@ -286,7 +286,7 @@ def configure(self, config: Union[HarnessConfig, dict, str]) -> HarnessConfig:
         Returns:
             dict: Loaded configuration.
         """
-        if type(config) == dict:
+        if isinstance(config, dict):
             self._config = config
         else:
             with open(config, "r", encoding="utf-8") as yml:
35 changes: 16 additions & 19 deletions langtest/metrics/llm_eval.py
@@ -66,25 +66,22 @@ def build_prompt(
             f"""\n\nScore the student answer based on the following criteria:\n{eval_criteria}"""
         )

-        prompt += dedent(
-            f"""
-            Example Format:
-            QUESTION: question here
-            STUDENT ANSWER: student's answer here
-            TRUE ANSWER: true answer here
-            GRADE: {grade_list} here
-
-            {
-                ("Grade the student answers based ONLY on their factual accuracy. Ignore differences"
-                " in punctuation and phrasing between the student answer and true answer. It is OK "
-                "if the student answer contains more or relevant information than the true answer, as"
-                " long as it does not contain any conflicting statements. Begin!")
-            }
-
-            QUESTION: {{query}}
-            STUDENT ANSWER: {{result}}
-            TRUE ANSWER: {{answer}}
-            GRADE:"""
+        prompt += (
+            "Example Format:\n"
+            "QUESTION: question here\n"
+            "STUDENT ANSWER: student's answer here\n"
+            "TRUE ANSWER: true answer here\n"
+            f"GRADE: {grade_list} here"
+            "\n\n"
+            "Grade the student answers based ONLY on their factual accuracy. Ignore differences"
+            " in punctuation and phrasing between the student answer and true answer. It is OK "
+            "if the student answer contains more or relevant information than the true answer, as"
+            " long as it does not contain any conflicting statements. Begin!"
+            "\n\n"
+            "QUESTION: {{query}}\n"
+            "STUDENT ANSWER: {{result}}\n"
+            "TRUE ANSWER: {{answer}}\n"
+            "GRADE:\n"
         )
         return prompt
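One behavioral detail of dropping `dedent(f"""...""")` in favor of plain concatenation: doubled braces only collapse inside f-strings. In the old f-string, `{{query}}` rendered as `{query}`; in the new plain literals it stays `{{query}}` verbatim. Which form the downstream templating expects depends on the consumer (LangChain's `PromptTemplate`, for instance, uses single braces). A quick illustration:

```python
grade_list = "CORRECT or INCORRECT"

via_fstring = f"QUESTION: {{query}}\nGRADE: {grade_list} here"
via_plain = "QUESTION: {{query}}\n" + f"GRADE: {grade_list} here"

print(via_fstring)  # first line: QUESTION: {query}   -- braces collapsed
print(via_plain)    # first line: QUESTION: {{query}} -- doubled braces survive
```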
4 changes: 2 additions & 2 deletions langtest/modelhandler/__init__.py
@@ -33,15 +33,15 @@


 if "langchain" in INSTALLED_HUBS:
-    import langchain
+    import langchain_classic

     LANGCHAIN_HUBS = {
         (
             RENAME_HUBS.get(hub.lower(), hub.lower())
             if hub.lower() in RENAME_HUBS
             else hub.lower()
         ): hub
-        for hub in langchain.llms.__all__
+        for hub in langchain_classic.llms.__all__
     }
     LANGCHAIN_HUBS["openrouter"] = "openrouter"
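The comprehension itself is unchanged: it lowercases every exported LLM class name, applying any renames, to build the hub map. A self-contained sketch with stand-in values (the `RENAME_HUBS` entry and the `__all__` list below are invented for illustration):

```python
RENAME_HUBS = {"azureopenai": "azure-openai"}   # hypothetical rename entry
llms_all = ["OpenAI", "AzureOpenAI", "Cohere"]  # stand-in for langchain_classic.llms.__all__

LANGCHAIN_HUBS = {
    (
        RENAME_HUBS.get(hub.lower(), hub.lower())
        if hub.lower() in RENAME_HUBS
        else hub.lower()
    ): hub
    for hub in llms_all
}
print(LANGCHAIN_HUBS)
# {'openai': 'OpenAI', 'azure-openai': 'AzureOpenAI', 'cohere': 'Cohere'}
```

Incidentally, the `if hub.lower() in RENAME_HUBS` guard is redundant: `RENAME_HUBS.get(key, default)` already falls back to the default when the key is absent.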
4 changes: 2 additions & 2 deletions langtest/modelhandler/llm_modelhandler.py
@@ -3,9 +3,9 @@

 import os
 from typing import Any, List, Type, Union, TypeVar
-import langchain.llms as lc
+import langchain_classic.llms as lc
 import langchain.chat_models as chat_models
-from langchain.chains.llm import LLMChain
+from langchain_classic.chains.llm import LLMChain
 from langchain_core.prompts import PromptTemplate
 from langchain_core.language_models.base import BaseLanguageModel
 from langchain_core.exceptions import OutputParserException
6 changes: 3 additions & 3 deletions langtest/modelhandler/modelhandler.py
@@ -14,16 +14,16 @@
 }

 if try_import_lib("langchain"):
-    import langchain
-    import langchain.llms
+    import langchain_classic
+    import langchain_classic.llms

     LANGCHAIN_HUBS = {
         (
             RENAME_HUBS.get(hub.lower(), hub.lower())
             if hub.lower() in RENAME_HUBS
             else hub.lower()
         ): hub
-        for hub in langchain.llms.__all__
+        for hub in langchain_classic.llms.__all__
     }
     LANGCHAIN_HUBS["openrouter"] = "openrouter"
 else:
15 changes: 11 additions & 4 deletions langtest/transform/accuracy.py
@@ -276,7 +276,8 @@ class BaseAccuracy(ABC):

     TestConfig = TypedDict(
         "TestConfig",
-        min_score=Union[Dict[str, float], float],
+        # min_score=Union[Dict[str, float], float],
+        {"min_score": Union[Dict[str, float], float]},
     )

     @classmethod
@@ -1029,9 +1030,15 @@ class LLMEval(BaseAccuracy):

     TestConfig = TypedDict(
         "TestConfig",
-        model=str,
-        hub=str,
-        min_score=float,
+        # model=str,
+        # hub=str,
+        # min_score=float,
+        {
+            "model": str,
+            "hub": str,
+            "model_parameters": dict,
+            "min_score": float,
+        },
     )

     @classmethod
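Background on these TypedDict edits (mirrored below in `bias.py`, `clinical.py`, `disinformation.py`, and `factuality.py`): the keyword-argument flavor of the functional syntax was deprecated in Python 3.11 and removed in 3.13, so fields must now be supplied as a single dict. A minimal sketch:

```python
from typing import Dict, TypedDict, Union

# Fails at runtime on Python 3.13+:
#   TestConfig = TypedDict("TestConfig", min_score=float)

# Portable functional syntax: field names and types in one dict.
TestConfig = TypedDict(
    "TestConfig",
    {"min_score": Union[Dict[str, float], float]},
)

config: TestConfig = {"min_score": 0.75}
```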
4 changes: 2 additions & 2 deletions langtest/transform/base.py
@@ -201,7 +201,7 @@ async def async_run(
             category_output = all_categories[each].run(
                 values, model_handler, progress_bar=tests, **kwargs
             )
-            if type(category_output) == list:
+            if isinstance(category_output, list):
                 all_results.extend(category_output)
             else:
                 all_results.append(category_output)
@@ -264,7 +264,7 @@ def run(
             if len(test_name.split("-")) > 1:
                 test_name = "multiple_perturbations"
             test_output = supported_tests[test_name].async_run(samples, model, **kwargs)
-            if type(test_output) == list:
+            if isinstance(test_output, list):
                 tasks.extend(test_output)
             else:
                 tasks.append(test_output)
2 changes: 1 addition & 1 deletion langtest/transform/bias.py
@@ -267,7 +267,7 @@ class BaseBias(ABC):
     ]

     # Config Hint for the bias tests
-    TestConfig = TypedDict("TestConfig", min_pass_rate=float)
+    TestConfig = TypedDict("TestConfig", {"min_pass_rate": float})

     @abstractmethod
     def transform(self, sample_list: List[Sample], *args, **kwargs) -> List[Sample]:
3 changes: 2 additions & 1 deletion langtest/transform/clinical.py
@@ -119,7 +119,8 @@ class BaseClinical(ABC):

     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # min_pass_rate=float,
+        {"min_pass_rate": float},
     )

     @staticmethod
3 changes: 2 additions & 1 deletion langtest/transform/disinformation.py
@@ -17,7 +17,8 @@ class DisinformationTestFactory(ITests):

     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # min_pass_rate=float,
+        {"min_pass_rate": float},
     )

     def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:
3 changes: 2 additions & 1 deletion langtest/transform/factuality.py
@@ -14,7 +14,8 @@ class FactualityTestFactory(ITests):

     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # min_pass_rate=float,
+        {"min_pass_rate": float},
     )

     def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None: