From 5fbbfb219ec009037b9d3d3025bbb16ad114cd2d Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 12:46:09 -0400 Subject: [PATCH 01/16] Add overloads for __call__ methods that take query/response and conversation --- .../_evaluators/_coherence/_coherence.py | 53 +++++++++++++------ .../_content_safety/_content_safety.py | 50 ++++++++++++----- .../_content_safety/_hate_unfairness.py | 50 ++++++++++++----- .../_evaluators/_content_safety/_self_harm.py | 48 ++++++++++++----- .../_evaluators/_content_safety/_sexual.py | 48 ++++++++++++----- .../_evaluators/_content_safety/_violence.py | 48 ++++++++++++----- .../_groundedness/_groundedness.py | 50 ++++++++++++----- .../_protected_material.py | 48 ++++++++++++----- .../_evaluators/_relevance/_relevance.py | 53 +++++++++++++------ .../ai/evaluation/_evaluators/_xpia/xpia.py | 49 ++++++++++++----- 10 files changed, 360 insertions(+), 137 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index b6cb803021e8..9d04f3ac679a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -4,7 +4,7 @@ import os from typing import Optional -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -49,6 +49,42 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) + @overload + def __call__( + self, + *, + query: str, + response: str, + ): + """Evaluate coherence for given input of query, response + + :keyword query: The query to be evaluated. 
+ :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :return: The coherence score. + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate coherence for a conversation + + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The coherence score. + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ + ... + @override def __call__( self, @@ -58,19 +94,4 @@ def __call__( conversation=None, **kwargs, ): - """Evaluate coherence. Accepts either a query and response for a single evaluation, - or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of - turns, the evaluator will aggregate the results of each turn. - - :keyword response: The response to be evaluated. - :paramtype response: Optional[str] - :keyword context: The context to be evaluated. - :paramtype context: Optional[str] - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". - :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. 
- :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] - """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 3f56a7488684..9e3c15557b5b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -5,7 +5,7 @@ from typing import Callable, Dict, List, Optional, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._evaluators._common import EvaluatorBase @@ -78,29 +78,51 @@ def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, * HateUnfairnessEvaluator(credential, azure_ai_project), ] - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """Evaluate a collection of content safety metrics for the given query/response pair or conversation. - This inputs must supply either a query AND response, or a conversation, but not both. + """Evaluate a collection of content safety metrics for the given query/response pair + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :return: The content safety scores. + :rtype: Dict[str, Union[str, float]] + """ + ... - :keyword query: The query to evaluate. 
- :paramtype query: Optional[str] - :keyword response: The response to evaluate. - :paramtype response: Optional[str] + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a collection of content safety metrics for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The evaluation result. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The content safety scores. + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) @override diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index ae078563dbbd..638daf8dcb8a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -57,28 +57,50 @@ def __init__( 
credential=credential, eval_last_turn=eval_last_turn, ) - - @override + + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether hateful content is present in your AI system's response. - + """Evaluate the given query/response pair for hateful content + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The hate score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for hateful content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The hate score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... 
+ + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index ea1f057e2fcf..47023c1900c5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -58,27 +58,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether self-harm content is present in your AI system's response. - + """Evaluate a given query/response pair for self-harm content + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The self-harm score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for self-harm content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The self-harm score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 151874535a5e..35b802a4c1d6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -58,27 +58,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether sexual content is present in your AI system's response. - + """Evaluate a given query/response pair for sexual content + :keyword query: The query to be evaluated. 
:paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The sexual score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for sexual content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The sexual score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... 
+ + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 44a9b3abf0bf..27762d03d497 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -58,27 +58,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether violent content is present in your AI system's response. - + """Evaluate a given query/response pair for violent content + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The content safety score. + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for violent content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The violence score. + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 36cec30a080e..c951388c4657 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -4,7 +4,7 @@ import os from typing import Optional -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -50,28 +50,52 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) - @override + @overload def __call__( self, *, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, - **kwargs, + response: str, + context: str, ): - """Evaluate groundedless. Accepts either a response and context a single evaluation, - or a conversation for a multi-turn evaluation. 
If the conversation has more than one turn, - the evaluator will aggregate the results of each turn. - + """Evaluate groundedness for given input of response, context + + :keyword query: The query to be evaluated. + :paramtype query: str :keyword response: The response to be evaluated. - :paramtype response: Optional[str] + :paramtype response: str :keyword context: The context to be evaluated. - :paramtype context: Optional[str] + :paramtype context: str + :return: The groundedness score. + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate groundedness for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + ... 
+ + + @override + def __call__( + self, + *, + response: Optional[str] = None, + context: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 0ead00125c3d..8bc5de81d91d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -4,7 +4,7 @@ from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -59,27 +59,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate if protected material is present in your AI system's response. - + """Evaluate a given query/response pair for protected material + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The protected material score. + :rtype: Dict[str, Union[str, bool]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for protected material + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :return: The protected material score. + :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 918df8b8f0aa..1e7c18a04cd0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -5,7 +5,7 @@ import os from typing import Optional -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -53,31 +53,54 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, + context: str, ): - """Evaluate relevance. Accepts either a response and context a single evaluation, - or a conversation for a multi-turn evaluation. 
If the conversation has more than one turn, - the evaluator will aggregate the results of each turn. - + """Evaluate relevance for given input of query, response, context + :keyword query: The query to be evaluated. - :paramtype query: Optional[str] + :paramtype query: str :keyword response: The response to be evaluated. - :paramtype response: Optional[str] + :paramtype response: str :keyword context: The context to be evaluated. - :paramtype context: Optional[str] + :paramtype context: str + :return: The relevance score. + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate relevance for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + ... 
+ + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + context: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 8db3b777f23c..133f126e4963 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -4,7 +4,7 @@ import logging from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -65,27 +65,50 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether cross domain injected attacks are present in your AI system's response. - + """Evaluate whether cross domain injected attacks are present in given query/response + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The sexual score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate cross domain injected attacks are present in a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :return: The sexual score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) From 840d5286f628ca7b59ec56235348d89bdfc5b4c5 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 13:21:48 -0400 Subject: [PATCH 02/16] remove callable type hint --- .../_content_safety/_content_safety.py | 1 - .../azure/ai/evaluation/test.py | 56 +++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 9e3c15557b5b..b846193c3191 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -31,7 +31,6 @@ class ContentSafetyEvaluator(EvaluatorBase): :param kwargs: Additional arguments to pass to the evaluator. :type kwargs: Any :return: A function that evaluates content-safety metrics for "question-answering" scenario. 
- :rtype: Callable **Usage** diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py new file mode 100644 index 000000000000..3192d5bb07cb --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py @@ -0,0 +1,56 @@ +from azure.ai.evaluation import evaluate, RelevanceEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator +from azure.identity import AzureCliCredential +import os + +if __name__ == "__main__": + model_config = { + # OpenAI API key + "type": "azure_openai", + "azure_deployment": "gpt-35-turbo", + "azure_endpoint": "https://ai-neduvvurai952818858670.openai.azure.com", + "api_key": "dc2f807bc52448deafb28b23e2f146f6", + } + + + relevance_eval = RelevanceEvaluator(model_config) + + relevance_eval + + hate_unfairness_evaluator = ProtectedMaterialEvaluator( + azure_ai_project={ + "subscription_id": "b17253fa-f327-42d6-9686-f3e553e24763", + "resource_group_name": "rg-neduvvurai", + "project_name": "neduvvur-4217", + }, + credential=AzureCliCredential(), + ) + + print(hate_unfairness_evaluator( + query="Which tent is the most waterproof?", + response="The Alpine Explorer Tent is the most waterproof.", + )) + + """os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" + + datasets_folderpath = os.path.abspath(".") + input_path = os.path.join(datasets_folderpath, "data.jsonl") + + print(input_path) + + eval_result = evaluate( + evaluators={ + "relevance": relevance_eval, + "hate_unfairness": hate_unfairness_evaluator, + }, + data="C:/Users/neduvvur/azure-sdk-for-python/data.jsonl", + ) + + print(eval_result)""" + + + """print(relevance_eval( + query="Which tent is the most waterproof?", + response="The Alpine Explorer Tent is the most waterproof.", + context="From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", + ) + )""" \ No newline at end of file From fdd8bbdfaabe12c32d5b62da46d8f5d4bbd2ed65 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:28:26 -0400 Subject: [PATCH 03/16] add docstrings/type hints --- .../_evaluators/_coherence/_coherence.py | 21 +++++++-- .../_content_safety/_content_safety.py | 21 +++++++-- .../_content_safety/_hate_unfairness.py | 23 ++++++++-- .../_evaluators/_content_safety/_self_harm.py | 23 ++++++++-- .../_evaluators/_content_safety/_sexual.py | 20 ++++++-- .../_evaluators/_content_safety/_violence.py | 24 ++++++++-- .../_evaluators/_fluency/_fluency.py | 41 ++++++++++++++++- .../_groundedness/_groundedness.py | 24 ++++++++-- .../_protected_material.py | 20 ++++++-- .../_evaluators/_relevance/_relevance.py | 46 +++++++++---------- .../ai/evaluation/_evaluators/_xpia/xpia.py | 31 +++++++++---- 11 files changed, 232 insertions(+), 62 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 9d04f3ac679a..292b4e7b4f46 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, Union, List, Optional from typing_extensions import overload, override @@ -55,7 +55,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, float]: """Evaluate coherence for given input of query, response :keyword query: The query to be evaluated. 
@@ -73,7 +73,7 @@ def __call__( *, conversation, **kwargs, - ): + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -94,4 +94,19 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate coherence. Accepts either a query and response for a single evaluation, + or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of + turns, the evaluator will aggregate the results of each turn. + + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. 
+ :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index b846193c3191..9980586d756d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._evaluators._common import EvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation from ._hate_unfairness import HateUnfairnessEvaluator from ._self_harm import SelfHarmEvaluator @@ -83,7 +84,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a collection of content safety metrics for the given query/response pair :keyword query: The query to be evaluated. @@ -99,9 +100,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -122,6 +123,20 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate a collection of content safety metrics for the given query/response pair or conversation. + This inputs must supply either a query AND response, or a conversation, but not both. + + :keyword query: The query to evaluate. + :paramtype query: Optional[str] + :keyword response: The response to evaluate. 
+ :paramtype response: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The evaluation result. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) @override diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 638daf8dcb8a..7ddf9d7ae6a5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -1,13 +1,14 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -64,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate the given query/response pair for hateful content :keyword query: The query to be evaluated. 
@@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -103,4 +104,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether hateful content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 47023c1900c5..8a67cb69cd10 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -1,13 +1,14 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -64,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for self-harm content :keyword query: The query to be evaluated. @@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -103,4 +104,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether self-harm content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. 
+ :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 35b802a4c1d6..6bb20ccd0500 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override @@ -64,7 +64,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for sexual content :keyword query: The query to be evaluated. @@ -82,7 +82,7 @@ def __call__( *, conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """Evaluate a conversation for sexual content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -103,4 +103,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether sexual content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". 
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 27762d03d497..a6b791993354 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -1,13 +1,14 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -64,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for violent content :keyword query: The query to be evaluated. @@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content :keyword conversation: The conversation to evaluate. 
Expected to contain a list of conversation turns under the @@ -103,4 +104,19 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether violent content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ + return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 38cc2dcc9e9f..5ffb061a970c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -3,11 +3,12 @@ # --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, List, Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation class FluencyEvaluator(PromptyEvaluatorBase): @@ -50,6 +51,42 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) + @overload + def __call__( + self, 
+ *, + query: str, + response: str, + ) -> Dict[str, float]: + """Evaluate fleuncy in given query/response + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :return: The fluency score + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation: Conversation, + **kwargs, + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + """Evaluate cross domain injected attacks are present in a conversation + + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ + ... + @override def __call__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index c951388c4657..688cadeffe6f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -2,11 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation class GroundednessEvaluator(PromptyEvaluatorBase): @@ -56,7 +57,7 @@ def __call__( *, response: str, context: str, - ): + ) -> Dict[str, float]: """Evaluate groundedness for given input of response, context :keyword query: The query to be evaluated. @@ -74,9 +75,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate groundedness for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -98,4 +99,19 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate groundedless. Accepts either a response and context a single evaluation, + or a conversation for a multi-turn evaluation. If the conversation has more than one turn, + the evaluator will aggregate the results of each turn. + + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. 
+ :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + """ return super().__call__(response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 8bc5de81d91d..545c8c0c249a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override @@ -65,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, bool]]: """Evaluate a given query/response pair for protected material :keyword query: The query to be evaluated. @@ -83,7 +83,7 @@ def __call__( *, conversation, **kwargs, - ): + ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -104,4 +104,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate if protected material is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". 
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 1e7c18a04cd0..9da783ce1992 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -3,10 +3,11 @@ # --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, Union, List, Optional from typing_extensions import overload, override +from azure.ai.evaluation._model_configurations import Conversation from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -60,36 +61,16 @@ def __call__( query: str, response: str, context: str, - ): - """Evaluate relevance for given input of query, response, context - - :keyword query: The query to be evaluated. - :paramtype query: str - :keyword response: The response to be evaluated. - :paramtype response: str - :keyword context: The context to be evaluated. - :paramtype context: str - :return: The relevance score. - :rtype: Dict[str, float] - """ + ) -> Dict[str, float]: ... @overload def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): - """Evaluate relevance for a conversation - - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages", and potentially a global context under the key "context". Conversation turns are expected - to be dictionaries with keys "content", "role", and possibly "context". 
- :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. - :rtype: Dict[str, Union[float, Dict[str, List[float]]]] - """ + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: ... @@ -103,4 +84,21 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate relevance. Accepts either a response and context a single evaluation, + or a conversation for a multi-turn evaluation. If the conversation has more than one turn, + the evaluator will aggregate the results of each turn. + + :keyword query: The query to be evaluated. + :paramtype query: Optional[str] + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. + :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + """ return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 133f126e4963..c6f37e30d3fd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -2,13 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- import logging -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation logger = logging.getLogger(__name__) @@ -72,15 +73,15 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, bool]]: """Evaluate whether cross domain injected attacks are present in given query/response :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str - :return: The sexual score - :rtype: Dict[str, Union[str, float]] + :return: The cross domain injection attack score + :rtype: Dict[str, Union[str, bool]] """ ... @@ -88,17 +89,17 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate cross domain injected attacks are present in a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The sexual score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :return: The cross domain injection attack score + :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ ... 
@@ -111,4 +112,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether cross domain injected attacks are present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The cross domain injection attack score + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) From ee1a410d155ec50ad6c210d12a9924063f9f3a1b Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:32:38 -0400 Subject: [PATCH 04/16] fix a typo --- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 5ffb061a970c..22dd63f928b4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -58,7 +58,7 @@ def __call__( query: str, response: str, ) -> Dict[str, float]: - """Evaluate fleuncy in given query/response + """Evaluate fluency in given query/response :keyword query: The query to be evaluated. 
:paramtype query: str @@ -76,7 +76,7 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: - """Evaluate cross domain injected attacks are present in a conversation + """Evaluate fluency for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected From d04684c3616c728b226b416310f779ff934f5930 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:33:49 -0400 Subject: [PATCH 05/16] remove file --- .../_groundedness/_groundedness.py | 2 +- .../azure/ai/evaluation/test.py | 56 ------------------- 2 files changed, 1 insertion(+), 57 deletions(-) delete mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 688cadeffe6f..99463b0607e7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -84,7 +84,7 @@ def __call__( key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. + :return: The groundedness score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ ... 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py deleted file mode 100644 index 3192d5bb07cb..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py +++ /dev/null @@ -1,56 +0,0 @@ -from azure.ai.evaluation import evaluate, RelevanceEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator -from azure.identity import AzureCliCredential -import os - -if __name__ == "__main__": - model_config = { - # OpenAI API key - "type": "azure_openai", - "azure_deployment": "gpt-35-turbo", - "azure_endpoint": "https://ai-neduvvurai952818858670.openai.azure.com", - "api_key": "dc2f807bc52448deafb28b23e2f146f6", - } - - - relevance_eval = RelevanceEvaluator(model_config) - - relevance_eval - - hate_unfairness_evaluator = ProtectedMaterialEvaluator( - azure_ai_project={ - "subscription_id": "b17253fa-f327-42d6-9686-f3e553e24763", - "resource_group_name": "rg-neduvvurai", - "project_name": "neduvvur-4217", - }, - credential=AzureCliCredential(), - ) - - print(hate_unfairness_evaluator( - query="Which tent is the most waterproof?", - response="The Alpine Explorer Tent is the most waterproof.", - )) - - """os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" - - datasets_folderpath = os.path.abspath(".") - input_path = os.path.join(datasets_folderpath, "data.jsonl") - - print(input_path) - - eval_result = evaluate( - evaluators={ - "relevance": relevance_eval, - "hate_unfairness": hate_unfairness_evaluator, - }, - data="C:/Users/neduvvur/azure-sdk-for-python/data.jsonl", - ) - - print(eval_result)""" - - - """print(relevance_eval( - query="Which tent is the most waterproof?", - response="The Alpine Explorer Tent is the most waterproof.", - context="From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", - ) - )""" \ No newline at end of file From 7cbb85d24ef1e3279872513eae94a091c1c69458 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:37:58 -0400 Subject: [PATCH 06/16] remove a bad param --- .../ai/evaluation/_evaluators/_groundedness/_groundedness.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 99463b0607e7..6e2d536f1781 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -60,8 +60,6 @@ def __call__( ) -> Dict[str, float]: """Evaluate groundedness for given input of response, context - :keyword query: The query to be evaluated. - :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str :keyword context: The context to be evaluated. 
From ce01a37f27e3ba47f8a94d14db480bc8b8134a78 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:39:25 -0400 Subject: [PATCH 07/16] add docs for relevance --- .../_evaluators/_relevance/_relevance.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 9da783ce1992..c163843b37b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -62,6 +62,18 @@ def __call__( response: str, context: str, ) -> Dict[str, float]: + """Evaluate groundedness for given input of query, response, context + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword context: The context to be evaluated. + :paramtype context: str + :return: The relevance score. + :rtype: Dict[str, float] + """ + ... @overload @@ -71,6 +83,15 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + """Evaluate relevance for a conversation + + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ ... 
From ebc1425968e2f14897a40466acd12ef085c7fda1 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 15:00:44 -0400 Subject: [PATCH 08/16] fix some missing type hints --- .../azure/ai/evaluation/_evaluators/_coherence/_coherence.py | 3 ++- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 5 +++-- .../_evaluators/_protected_material/_protected_material.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 292b4e7b4f46..e1e357407956 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -7,6 +7,7 @@ from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation class CoherenceEvaluator(PromptyEvaluatorBase): @@ -71,7 +72,7 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 6bb20ccd0500..63744b2985a7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -8,6 +8,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import 
RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 545c8c0c249a..1bf2ae108292 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -81,7 +82,7 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material From ab5a3d80fef79eed44e449b3e4e5738a4f7080da Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 17:12:28 -0400 Subject: [PATCH 09/16] lint and run black --- .../azure/ai/evaluation/_common/_experimental.py | 6 ++++-- .../ai/evaluation/_evaluators/_coherence/_coherence.py | 4 ++-- .../_evaluators/_content_safety/_content_safety.py | 4 ++-- .../_evaluators/_content_safety/_hate_unfairness.py | 6 +++--- 
.../evaluation/_evaluators/_content_safety/_self_harm.py | 4 ++-- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 6 +++--- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 6 +++--- .../evaluation/_evaluators/_groundedness/_groundedness.py | 5 ++--- .../_evaluators/_protected_material/_protected_material.py | 4 ++-- .../ai/evaluation/_evaluators/_relevance/_relevance.py | 7 +++---- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 7 +++---- .../azure/ai/evaluation/simulator/_simulator.py | 1 - 13 files changed, 31 insertions(+), 33 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py index ca676c9bcdc9..6728a61649c6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py @@ -27,11 +27,13 @@ @overload -def experimental(wrapped: Type[T]) -> Type[T]: ... +def experimental(wrapped: Type[T]) -> Type[T]: + ... @overload -def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ... +def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: + ... 
def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index e1e357407956..6aefc1eecdfb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -58,7 +58,7 @@ def __call__( response: str, ) -> Dict[str, float]: """Evaluate coherence for given input of query, response - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -76,7 +76,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 7652fe70df6b..ae1325285266 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -87,7 +87,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a collection of content safety metrics for the given query/response pair - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. 
@@ -105,7 +105,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 16ff9237e394..7c3acf13ea17 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -58,7 +58,7 @@ def __init__( credential=credential, eval_last_turn=eval_last_turn, ) - + @overload def __call__( self, @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate the given query/response pair for hateful content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -85,7 +85,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 2c7efff8de5e..a613ae259eab 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for self-harm content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -85,7 +85,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 4ec50b1df3e3..6de8e76a70b8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for sexual content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. 
@@ -83,9 +83,9 @@ def __call__( *, conversation: Conversation, **kwargs, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 193441fc35a5..2c66a681c9c0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for violent content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -85,7 +85,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 22dd63f928b4..2bad778f2034 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -59,7 +59,7 @@ def __call__( response: str, ) -> Dict[str, float]: """Evaluate fluency in given query/response - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -76,8 +76,8 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: - """Evaluate fluency for a conversation - + """Evaluate fluency for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 3260ec536c4b..5d39fcc9c79f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -59,7 +59,7 @@ def __call__( context: str, ) -> Dict[str, float]: """Evaluate groundedness for given input of response, context - + :keyword response: The response to be evaluated. :paramtype response: str :keyword context: The context to be evaluated. 
@@ -77,7 +77,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate groundedness for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". @@ -86,7 +86,6 @@ def __call__( :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ ... - @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 2250fecf2dbb..379f0d47e5ac 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -68,7 +68,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, bool]]: """Evaluate a given query/response pair for protected material - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -86,7 +86,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index c163843b37b3..b871297dae2f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -63,7 +63,7 @@ def __call__( context: str, ) -> Dict[str, float]: """Evaluate relevance for given input of query, response, context - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -73,7 +73,7 @@ def __call__( :return: The relevance score. :rtype: Dict[str, float] """ - + ... @overload @@ -84,7 +84,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate relevance for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". @@ -93,7 +93,6 @@ def __call__( :return: The relevance score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ ... 
- @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 0b5935b71597..eb0874a1deff 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -66,7 +66,6 @@ def __init__( eval_last_turn=eval_last_turn, ) - @overload def __call__( self, @@ -75,7 +74,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, bool]]: """Evaluate whether cross domain injected attacks are present in given query/response - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -92,8 +91,8 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: - """Evaluate cross domain injected attacks are present in a conversation - + """Evaluate cross domain injected attacks are present in a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py index b6fcca19fb29..835f623612ed 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py @@ -226,7 +226,6 @@ async def _simulate_with_predefined_turns( semaphore = asyncio.Semaphore(concurrent_async_tasks) progress_bar_lock = asyncio.Lock() - async def run_simulation(simulation: List[Union[str, Dict[str, Any]]]) -> JsonLineChatProtocol: async with semaphore: current_simulation = ConversationHistory() From ee3a1db636d7753629f28ff00495b6aaba476079 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Mon, 28 Oct 2024 12:56:28 -0400 Subject: [PATCH 10/16] merge with main --- .../evaluation/_evaluators/_coherence/_coherence.py | 8 ++------ .../ai/evaluation/_evaluators/_common/_base_eval.py | 2 +- .../_evaluators/_common/_base_rai_svc_eval.py | 9 +++------ .../_evaluators/_content_safety/_content_safety.py | 8 ++------ .../_evaluators/_content_safety/_hate_unfairness.py | 8 ++------ .../_evaluators/_content_safety/_self_harm.py | 8 ++------ .../_evaluators/_content_safety/_sexual.py | 8 ++------ .../_evaluators/_content_safety/_violence.py | 8 ++------ .../ai/evaluation/_evaluators/_fluency/_fluency.py | 8 ++------ .../_evaluators/_groundedness/_groundedness.py | 8 ++------ .../_protected_material/_protected_material.py | 5 ++--- .../evaluation/_evaluators/_relevance/_relevance.py | 13 +++++-------- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 12 ++++-------- 13 files changed, 31 insertions(+), 74 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 6aefc1eecdfb..067f236b01e4 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -73,7 +73,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation @@ -89,10 +88,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate coherence. Accepts either a query and response for a single evaluation, @@ -110,4 +106,4 @@ def __call__( :return: The relevance score. :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index ef7a38e58e6f..1afb2eef668a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -87,7 +87,7 @@ def __init__( # This needs to be overridden just to change the function header into something more informative, # and to be able to add a more specific docstring. The actual function contents should just be # super().__call__() - def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: + def __call__(self, *args, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """Evaluate a given input. 
This method serves as a wrapper and is meant to be overridden by child classes for one main reason - to overwrite the method headers and docstring to include additional inputs as needed. The actual behavior of this function shouldn't change beyond adding more inputs to the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 28d00f7977b6..5d9a60d14e74 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -18,7 +18,7 @@ from . import EvaluatorBase -T = Union[str, float] +T = Union[str, float, bool] class RaiServiceEvaluatorBase(EvaluatorBase[T]): @@ -52,10 +52,7 @@ def __init__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate either a query and response or a conversation. 
Must supply either a query AND response, @@ -71,7 +68,7 @@ def __call__( :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) @override async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index ae1325285266..0e3e1a3bfc08 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -102,7 +102,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation @@ -118,10 +117,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate a collection of content safety metrics for the given query/response pair or conversation. @@ -138,7 +134,7 @@ def __call__( :return: The evaluation result. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 7c3acf13ea17..1fa3e55d7583 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -118,4 +114,4 @@ def __call__( :return: The fluency score. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index a613ae259eab..5f43c8874f4f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -118,4 +114,4 @@ def __call__( :return: The fluency score. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 6de8e76a70b8..d8d71fa38515 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -118,4 +114,4 @@ def __call__( :return: The fluency score. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 2c66a681c9c0..df16fab93ed0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -119,4 +115,4 @@ def __call__( :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 2bad778f2034..1fc7e6838b02 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -74,7 +74,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate fluency for a conversation @@ -90,10 +89,7 @@ def __call__( 
@override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -112,4 +108,4 @@ def __call__( :return: The fluency score. :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 5d39fcc9c79f..9df5861ea356 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -74,7 +74,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate groundedness for a conversation @@ -90,10 +89,7 @@ def __call__( @override def __call__( self, - *, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate groundedness. Accepts either a response and context a single evaluation, @@ -111,4 +107,4 @@ def __call__( :return: The relevance score. 
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(response=response, context=context, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 379f0d47e5ac..0c7a23d19a8c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -83,8 +83,7 @@ def __call__( self, *, conversation: Conversation, - **kwargs, - ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -117,6 +116,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. 
- :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]] """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index b871297dae2f..1fedc6d037e9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -57,10 +57,11 @@ def __init__(self, model_config): @overload def __call__( self, - *, + *args, query: str, response: str, context: str, + **kwargs, ) -> Dict[str, float]: """Evaluate groundedness for given input of query, response, context @@ -79,7 +80,7 @@ def __call__( @overload def __call__( self, - *, + *args, conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: @@ -97,11 +98,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate relevance. Accepts either a response and context a single evaluation, @@ -121,4 +118,4 @@ def __call__( :return: The relevance score. 
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index eb0874a1deff..1640cd6fa5ab 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -89,8 +89,7 @@ def __call__( self, *, conversation: Conversation, - **kwargs, - ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: """Evaluate cross domain injected attacks are present in a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -105,10 +104,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -123,6 +119,6 @@ def __call__( to be dictionaries with keys "content" and "role". 
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The cross domain injection attack score - :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) From 35ecd53aff927857cac52f78af102b7690d13622 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Mon, 28 Oct 2024 20:05:59 -0400 Subject: [PATCH 11/16] fix some mypy errors, not all pylint --- .../_evaluators/_coherence/_coherence.py | 10 ++-- .../_evaluators/_common/_base_prompty_eval.py | 6 ++- .../_evaluators/_common/_base_rai_svc_eval.py | 4 +- .../_content_safety/_content_safety.py | 12 ++--- .../_content_safety/_hate_unfairness.py | 14 +++--- .../_evaluators/_content_safety/_self_harm.py | 12 ++--- .../_evaluators/_content_safety/_sexual.py | 10 ++-- .../_evaluators/_content_safety/_violence.py | 12 ++--- .../_evaluators/_fluency/_fluency.py | 10 ++-- .../_groundedness/_groundedness.py | 10 ++-- .../_protected_material.py | 4 +- .../ai/evaluation/_evaluators/_qa/_qa.py | 6 +-- .../_evaluators/_relevance/_relevance.py | 17 +++---- .../_evaluators/_retrieval/_retrieval.py | 50 ++++++++++++++++--- .../_service_groundedness.py | 47 ++++++++++++++--- .../ai/evaluation/_evaluators/_xpia/xpia.py | 6 +-- 16 files changed, 139 insertions(+), 91 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 067f236b01e4..01d374704542 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -2,7 +2,7 @@ # 
Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import os -from typing import Dict, Union, List, Optional +from typing import Dict, Union, List from typing_extensions import overload, override @@ -10,7 +10,7 @@ from azure.ai.evaluation._model_configurations import Conversation -class CoherenceEvaluator(PromptyEvaluatorBase): +class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize a coherence evaluator configured for a specific Azure OpenAI model. @@ -56,7 +56,7 @@ def __call__( *, query: str, response: str, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate coherence for given input of query, response :keyword query: The query to be evaluated. @@ -66,14 +66,13 @@ def __call__( :return: The coherence score. :rtype: Dict[str, float] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate coherence for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -83,7 +82,6 @@ def __call__( :return: The coherence score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... 
@override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index e02f29ad0def..e851e8499260 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -4,7 +4,7 @@ import math import re -from typing import Dict, Union +from typing import Dict, TypeVar, Union from promptflow.core import AsyncPrompty from typing_extensions import override @@ -18,8 +18,10 @@ except ImportError: USER_AGENT = "None" +T = TypeVar("T") -class PromptyEvaluatorBase(EvaluatorBase[float]): + +class PromptyEvaluatorBase(EvaluatorBase[T]): """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators make use of a prompty file, and return their results as a dictionary, with a single key-value pair linking the result name to a float value (unless multi-turn evaluation occurs, in which case the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 5d9a60d14e74..4a82c1fef20a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, Optional, Union +from typing import Dict, TypeVar, Union from typing_extensions import override @@ -18,7 +18,7 @@ from . 
import EvaluatorBase -T = Union[str, float, bool] +T = TypeVar("T") class RaiServiceEvaluatorBase(EvaluatorBase[T]): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 0e3e1a3bfc08..597462f6952d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from concurrent.futures import as_completed -from typing import Callable, Dict, List, Optional, Union +from typing import Callable, Dict, List, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor from typing_extensions import overload, override @@ -18,7 +18,7 @@ @experimental -class ContentSafetyEvaluator(EvaluatorBase): +class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): """ Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario. @@ -95,14 +95,13 @@ def __call__( :return: The content safety scores. :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -110,9 +109,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The content safety scores. 
- :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -132,7 +130,7 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The evaluation result. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 1fa3e55d7583..b4f85d84d639 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class HateUnfairnessEvaluator(RaiServiceEvaluatorBase): +class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a hate-unfairness evaluator for hate unfairness score. @@ -75,14 +75,13 @@ def __call__( :return: The hate score :rtype: Dict[str, Union[str, float]] """ - ... 
@overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The hate score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -112,6 +110,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(*args,**kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 5f43c8874f4f..e9831a065c66 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class SelfHarmEvaluator(RaiServiceEvaluatorBase): +class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a self harm evaluator for self harm score. @@ -75,14 +75,13 @@ def __call__( :return: The self-harm score :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The self-harm score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -112,6 +110,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. 
- :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index d8d71fa38515..575450a15efa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class SexualEvaluator(RaiServiceEvaluatorBase): +class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a sexual evaluator for sexual score. @@ -75,14 +75,13 @@ def __call__( :return: The sexual score :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". 
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The sexual score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index df16fab93ed0..dfe5c1445d59 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class ViolenceEvaluator(RaiServiceEvaluatorBase): +class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a violence evaluator for violence score. @@ -75,14 +75,13 @@ def __call__( :return: The content safety score. :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The violence score. 
- :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -112,7 +110,7 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 1045ee9d0599..31553ca29780 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- import os -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -11,7 +11,7 @@ from azure.ai.evaluation._model_configurations import Conversation -class FluencyEvaluator(PromptyEvaluatorBase): +class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize a fluency evaluator configured for a specific Azure OpenAI model. @@ -54,7 +54,7 @@ def __call__( self, *, response: str, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate fluency in given query/response :keyword response: The response to be evaluated. @@ -62,14 +62,13 @@ def __call__( :return: The fluency score :rtype: Dict[str, float] """ - ... 
@overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate fluency for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -79,7 +78,6 @@ def __call__( :return: The fluency score :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index fbfba6450cf7..f20e2d5f8637 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -17,7 +17,7 @@ USER_AGENT = "None" -class GroundednessEvaluator(PromptyEvaluatorBase): +class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize a groundedness evaluator configured for a specific Azure OpenAI model. @@ -70,7 +70,7 @@ def __call__( response: str, context: str, query: Optional[str] = None, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate groundedness for given input of response, context :keyword response: The response to be evaluated. @@ -83,14 +83,13 @@ def __call__( :return: The groundedness score. :rtype: Dict[str, float] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate groundedness for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -100,7 +99,6 @@ def __call__( :return: The groundedness score. 
:rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... @override def __call__( @@ -125,7 +123,7 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ if kwargs.get("query", None): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 3c96970f3823..70835cf73d67 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -13,7 +13,7 @@ @experimental -class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase): +class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): """ Initialize a protected material evaluator to detect whether protected material is present in the AI system's response. The evaluator outputs a Boolean label (`True` or `False`) @@ -81,7 +81,6 @@ def __call__( :return: The protected material score. :rtype: Dict[str, Union[str, bool]] """ - ... @overload def __call__( @@ -98,7 +97,6 @@ def __call__( :return: The protected material score. :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ - ... 
@override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index e8198ff85e89..f27ce10c721c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from concurrent.futures import as_completed -from typing import Callable, Dict, List +from typing import Callable, Dict, List, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor @@ -58,7 +58,7 @@ class QAEvaluator: def __init__(self, model_config, parallel: bool = True): self._parallel = parallel - self._evaluators: List[Callable[..., Dict[str, float]]] = [ + self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [ GroundednessEvaluator(model_config), RelevanceEvaluator(model_config), CoherenceEvaluator(model_config), @@ -82,7 +82,7 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :keyword parallel: Whether to evaluate in parallel. Defaults to True. :paramtype parallel: bool :return: The scores for QA scenario. 
- :rtype: Dict[str, float] + :rtype: Dict[str, Union[str, float]] """ results: Dict[str, float] = {} if self._parallel: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 35f641b3e6f1..bb5a70fb06a0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- import os -from typing import Dict, Union, List, Optional +from typing import Dict, Union, List from typing_extensions import overload, override @@ -56,11 +56,10 @@ def __init__(self, model_config): @overload def __call__( self, - *args, + *, query: str, response: str, - **kwargs, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate groundedness for given input of query, response, context :keyword query: The query to be evaluated. @@ -71,15 +70,12 @@ def __call__( :rtype: Dict[str, float] """ - ... - @overload def __call__( self, - *args, + *, conversation: Conversation, - **kwargs, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate relevance for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -89,7 +85,6 @@ def __call__( :return: The relevance score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... @override def __call__( @@ -110,6 +105,6 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. 
- :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 748c4e4904b0..5ea83ce48c62 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -6,12 +6,15 @@ import logging import math import os -from typing import Optional +from typing import Dict, List, Union +from typing_extensions import overload from promptflow._utils.async_utils import async_run_allowing_running_loop from promptflow.core import AsyncPrompty +from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from azure.ai.evaluation._model_configurations import Conversation from ..._common.math import list_mean_nan_safe from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score @@ -107,7 +110,7 @@ async def __call__(self, *, query, context, conversation, **kwargs): } -class RetrievalEvaluator: +class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -152,10 +155,42 @@ class RetrievalEvaluator: however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. 
""" - def __init__(self, model_config): + def __init__(self, model_config): # pylint: disable=super-init-not-called self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config) - def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None, conversation=None, **kwargs): + @overload + def __call__( + self, + *, + query: str, + context: str, + ) -> Dict[str, Union[str, float]]: + """Evaluates retrieval for a given a query and context + + :keyword query: The query to be evaluated. Mutually exclusive with `conversation` parameter. + :paramtype query: Optional[str] + :keyword context: The context to be evaluated. Mutually exclusive with `conversation` parameter. + :paramtype context: Optional[str] + :return: The scores for Chat scenario. + :rtype: Dict[str, Union[str, float]] + """ + + @overload + def __call__( + self, + *, + conversation: Conversation, + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn, + the evaluator will aggregate the results of each turn. + + :keyword conversation: The conversation to be evaluated. + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The scores for Chat scenario. + :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ + + def __call__(self, *args, **kwargs): """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. @@ -169,6 +204,10 @@ def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None :return: The scores for Chat scenario. 
:rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + query = kwargs.get("query", None) + context = kwargs.get("context", None) + conversation = kwargs.get("conversation", None) + if (query is None or context is None) and conversation is None: msg = "Either a pair of 'query'/'context' or 'conversation' must be provided." raise EvaluationException( @@ -192,6 +231,3 @@ def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None return async_run_allowing_running_loop( self._async_evaluator, query=query, context=context, conversation=conversation, **kwargs ) - - def _to_async(self): - return self._async_evaluator diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index 83780f6506ef..c617c977acde 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -1,16 +1,17 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import Optional, Dict -from typing_extensions import override +from typing import List, Optional, Union, Dict +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental -class GroundednessProEvaluator(RaiServiceEvaluatorBase): +class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): """ Initialize a Groundedness Pro evaluator for determine if the response is grounded in the query and context. @@ -100,14 +101,48 @@ def __init__( **kwargs, ) - @override + @overload def __call__( self, *, query: Optional[str] = None, response: Optional[str] = None, context: Optional[str] = None, - conversation=None, + ) -> Dict[str, Union[str, bool]]: + """Evaluate groundedness for a given query/response/context + + :keyword query: The query to be evaluated. + :paramtype query: Optional[str] + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :return: The relevance score. + :rtype: Dict[str, Union[str, bool]] + """ + + @overload + def __call__( + self, + *, + conversation: Conversation, + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: + """Evaluate groundednessf for a conversation for a multi-turn evaluation. If the conversation has + more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results + available in the output under the "evaluation_per_turn" key. + + :keyword conversation: The conversation to evaluate. 
Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]] + """ + + @override + def __call__( + self, + *args, **kwargs, ): """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a @@ -128,7 +163,7 @@ def __call__( :return: The relevance score. :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]] """ - return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) @override async def _do_eval(self, eval_input: Dict): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 1640cd6fa5ab..efcc7b5032a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import logging -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -15,7 +15,7 @@ @experimental -class IndirectAttackEvaluator(RaiServiceEvaluatorBase): +class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator. Detect whether cross domain injected attacks are present in your AI system's response. 
@@ -82,7 +82,6 @@ def __call__( :return: The cross domain injection attack score :rtype: Dict[str, Union[str, bool]] """ - ... @overload def __call__( @@ -99,7 +98,6 @@ def __call__( :return: The cross domain injection attack score :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ - ... @override def __call__( From d3aacc191e3e79923b01ee7213f34a528f3e4c02 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 10:34:36 -0400 Subject: [PATCH 12/16] fix black errors --- .../azure/ai/evaluation/_common/_experimental.py | 6 ++---- .../ai/evaluation/_evaluators/_coherence/_coherence.py | 2 +- .../azure/ai/evaluation/_evaluators/_common/_base_eval.py | 6 +++++- .../ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py | 4 ++-- .../_evaluators/_content_safety/_content_safety.py | 2 +- .../_evaluators/_content_safety/_hate_unfairness.py | 2 +- .../ai/evaluation/_evaluators/_content_safety/_self_harm.py | 2 +- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 2 +- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 2 +- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 2 +- .../evaluation/_evaluators/_groundedness/_groundedness.py | 2 +- .../azure/ai/evaluation/_evaluators/_qa/_qa.py | 2 +- .../ai/evaluation/_evaluators/_relevance/_relevance.py | 2 +- .../ai/evaluation/_evaluators/_retrieval/_retrieval.py | 2 +- .../_service_groundedness/_service_groundedness.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 2 +- 16 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py index 6728a61649c6..ca676c9bcdc9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py @@ -27,13 +27,11 @@ @overload -def 
experimental(wrapped: Type[T]) -> Type[T]: - ... +def experimental(wrapped: Type[T]) -> Type[T]: ... @overload -def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: - ... +def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ... def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 01d374704542..a07754b69d56 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -84,7 +84,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index d5431045179d..71ade7fa33b9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -88,7 +88,11 @@ def __init__( # This needs to be overridden just to change the function header into something more informative, # and to be able to add a more specific docstring. The actual function contents should just be # super().__call__() - def __call__(self, *args, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """Evaluate a given input. 
This method serves as a wrapper and is meant to be overridden by child classes for one main reason - to overwrite the method headers and docstring to include additional inputs as needed. The actual behavior of this function shouldn't change beyond adding more inputs to the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 4a82c1fef20a..cb687d23e695 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -50,7 +50,7 @@ def __init__( self._credential = credential @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, @@ -105,7 +105,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: ) input_data["context"] = context - return await evaluate_with_rai_service( + return await evaluate_with_rai_service( # type: ignore metric_name=self._eval_metric, data=input_data, project_scope=self._azure_ai_project, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 597462f6952d..05ad3c7539d8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -113,7 +113,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index b4f85d84d639..aed7deb827f1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index e9831a065c66..9d7693018a28 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 575450a15efa..12c6b9ab2578 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index dfe5c1445d59..7867d563521b 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 31553ca29780..dd995f1cf367 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -80,7 +80,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index f20e2d5f8637..5215396aa9d2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -101,7 +101,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index f27ce10c721c..b5f3ac810eff 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -84,7 +84,7 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :return: The 
scores for QA scenario. :rtype: Dict[str, Union[str, float]] """ - results: Dict[str, float] = {} + results: Dict[str, Union[str, float]] = {} if self._parallel: with ThreadPoolExecutor() as executor: futures = { diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index bb5a70fb06a0..f5fb2d96360b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -87,7 +87,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 5ea83ce48c62..371453d682e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -190,7 +190,7 @@ def __call__( :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - def __call__(self, *args, **kwargs): + def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index c617c977acde..be0d249c99b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -127,7 +127,7 @@ def __call__( *, conversation: Conversation, ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: - """Evaluate groundednessf for a conversation for a multi-turn evaluation. If the conversation has + """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results available in the output under the "evaluation_per_turn" key. 
@@ -140,7 +140,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index efcc7b5032a9..079b035e7e93 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -100,7 +100,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, From 28524258cb65591bf02be16f5814949b717ae2dc Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 11:30:38 -0400 Subject: [PATCH 13/16] attempt to fix tests --- .../evaluation/_evaluators/_common/_base_eval.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 71ade7fa33b9..09fe859dd01f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -7,7 +7,7 @@ from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final from promptflow._utils.async_utils import async_run_allowing_running_loop -from typing_extensions import ParamSpec, TypeAlias +from typing_extensions import ParamSpec, TypeAlias, get_overloads from azure.ai.evaluation._common.math import list_mean from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException @@ -131,11 +131,18 @@ def _derive_singleton_inputs(self) -> List[str]: :rtype: List[str] """ + overloads = get_overloads(self.__call__) 
+ if not overloads: + call_signatures = [inspect.signature(self.__call__)] + else: + call_signatures = [inspect.signature(overload) for overload in overloads] call_signature = inspect.signature(self.__call__) singletons = [] - for param in call_signature.parameters: - if param not in self._not_singleton_inputs: - singletons.append(param) + for call_signature in call_signatures: + params = call_signature.parameters + if any([not_singleton_input in params for not_singleton_input in self._not_singleton_inputs]): + continue + singletons.extend(params) return singletons def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]: From 0d76ce6f6f1cdb40560fbad69d1ad92059a5857b Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 13:42:52 -0400 Subject: [PATCH 14/16] fix retrieval --- .../_evaluators/_common/_base_eval.py | 3 ++- .../ai/evaluation/_evaluators/_eci/_eci.py | 26 ++++++++++++++++++- .../_evaluators/_retrieval/_retrieval.py | 6 ++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 09fe859dd01f..cc6ff6e311df 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -142,7 +142,8 @@ def _derive_singleton_inputs(self) -> List[str]: params = call_signature.parameters if any([not_singleton_input in params for not_singleton_input in self._not_singleton_inputs]): continue - singletons.extend(params) + # exclude self since it is not a singleton input + singletons.extend([p for p in params if p != "self"]) return singletons def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]: diff --git 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index 3e6e420e9305..c89df72fb13a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -1,11 +1,12 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -62,3 +63,26 @@ def __init__( credential=credential, eval_last_turn=eval_last_turn, ) + + @overload + def __call__( + self, + *, + query: str, + response: str, + ): ... + + @overload + def __call__( + self, + *, + conversation: Conversation, + ): ... + + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 371453d682e3..b23cf62b10be 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -204,9 +204,9 @@ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param :return: The scores for Chat scenario. 
:rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - query = kwargs.get("query", None) - context = kwargs.get("context", None) - conversation = kwargs.get("conversation", None) + query = kwargs.pop("query", None) + context = kwargs.pop("context", None) + conversation = kwargs.pop("conversation", None) if (query is None or context is None) and conversation is None: msg = "Either a pair of 'query'/'context' or 'conversation' must be provided." From b8a1777cbef17b8848d676f684989600b6c2ce40 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 13:52:42 -0400 Subject: [PATCH 15/16] fix up tests and lint --- .../azure/ai/evaluation/_evaluators/_common/_base_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index cc6ff6e311df..8969d4dae9a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -140,7 +140,7 @@ def _derive_singleton_inputs(self) -> List[str]: singletons = [] for call_signature in call_signatures: params = call_signature.parameters - if any([not_singleton_input in params for not_singleton_input in self._not_singleton_inputs]): + if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs): continue # exclude self since it is not a singleton input singletons.extend([p for p in params if p != "self"]) From 5aa71430e11c9da793cd89aeee6f481438008e4c Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Wed, 30 Oct 2024 15:15:28 -0400 Subject: [PATCH 16/16] fix some docstrings to mark some things as optional --- .../_evaluators/_content_safety/_hate_unfairness.py | 4 ++-- .../ai/evaluation/_evaluators/_content_safety/_self_harm.py | 4 ++-- 
.../ai/evaluation/_evaluators/_content_safety/_sexual.py | 4 ++-- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 2 +- .../_evaluators/_protected_material/_protected_material.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index aed7deb827f1..afb645d45768 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether hateful content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 9d7693018a28..66cc70280737 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether self-harm content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 12c6b9ab2578..dbf7a2a0ae12 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether sexual content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". 
Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 7867d563521b..f43c08726dcd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether violent content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index dd995f1cf367..66c162a03993 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -91,7 +91,7 @@ def __call__( # pylint: disable=docstring-missing-param the evaluator will aggregate the results of each turn. :keyword response: The response to be evaluated. Mutually exclusive with the "conversation" parameter. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". 
Conversation turns are expected to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 70835cf73d67..fb7dc8aefcb3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -111,9 +111,9 @@ def __call__( Evaluate if protected material is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 079b035e7e93..9d591f8d75b7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -109,9 +109,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether cross domain injected attacks are present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. 
- :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".