From 5fbbfb219ec009037b9d3d3025bbb16ad114cd2d Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 12:46:09 -0400 Subject: [PATCH 01/16] Add overloads for __call__ methods that take query/response and conversation --- .../_evaluators/_coherence/_coherence.py | 53 +++++++++++++------ .../_content_safety/_content_safety.py | 50 ++++++++++++----- .../_content_safety/_hate_unfairness.py | 50 ++++++++++++----- .../_evaluators/_content_safety/_self_harm.py | 48 ++++++++++++----- .../_evaluators/_content_safety/_sexual.py | 48 ++++++++++++----- .../_evaluators/_content_safety/_violence.py | 48 ++++++++++++----- .../_groundedness/_groundedness.py | 50 ++++++++++++----- .../_protected_material.py | 48 ++++++++++++----- .../_evaluators/_relevance/_relevance.py | 53 +++++++++++++------ .../ai/evaluation/_evaluators/_xpia/xpia.py | 49 ++++++++++++----- 10 files changed, 360 insertions(+), 137 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index b6cb803021e8..9d04f3ac679a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -4,7 +4,7 @@ import os from typing import Optional -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -49,6 +49,42 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) + @overload + def __call__( + self, + *, + query: str, + response: str, + ): + """Evaluate coherence for given input of query, response + + :keyword query: The query to be evaluated. 
+ :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :return: The coherence score. + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate coherence for a conversation + + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The coherence score. + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ + ... + @override def __call__( self, @@ -58,19 +94,4 @@ def __call__( conversation=None, **kwargs, ): - """Evaluate coherence. Accepts either a query and response for a single evaluation, - or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of - turns, the evaluator will aggregate the results of each turn. - - :keyword response: The response to be evaluated. - :paramtype response: Optional[str] - :keyword context: The context to be evaluated. - :paramtype context: Optional[str] - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". - :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. 
- :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] - """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 3f56a7488684..9e3c15557b5b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -5,7 +5,7 @@ from typing import Callable, Dict, List, Optional, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._evaluators._common import EvaluatorBase @@ -78,29 +78,51 @@ def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, * HateUnfairnessEvaluator(credential, azure_ai_project), ] - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """Evaluate a collection of content safety metrics for the given query/response pair or conversation. - This inputs must supply either a query AND response, or a conversation, but not both. + """Evaluate a collection of content safety metrics for the given query/response pair + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :return: The content safety scores. + :rtype: Dict[str, Union[str, float]] + """ + ... - :keyword query: The query to evaluate. 
- :paramtype query: Optional[str] - :keyword response: The response to evaluate. - :paramtype response: Optional[str] + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a collection of content safety metrics for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The evaluation result. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The content safety scores. + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) @override diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index ae078563dbbd..638daf8dcb8a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -57,28 +57,50 @@ def __init__( 
credential=credential, eval_last_turn=eval_last_turn, ) - - @override + + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether hateful content is present in your AI system's response. - + """Evaluate the given query/response pair for hateful content + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The hate score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for hateful content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The hate score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... 
+ + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index ea1f057e2fcf..47023c1900c5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -58,27 +58,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether self-harm content is present in your AI system's response. - + """Evaluate a given query/response pair for self-harm content + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The self-harm score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for self-harm content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The self-harm score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 151874535a5e..35b802a4c1d6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -58,27 +58,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether sexual content is present in your AI system's response. - + """Evaluate a given query/response pair for sexual content + :keyword query: The query to be evaluated. 
:paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The sexual score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for sexual content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The sexual score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... 
+ + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 44a9b3abf0bf..27762d03d497 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -58,27 +58,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether violent content is present in your AI system's response. - + """Evaluate a given query/response pair for violent content + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The content safety score. + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for violent content + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :return: The violence score. + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 36cec30a080e..c951388c4657 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -4,7 +4,7 @@ import os from typing import Optional -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -50,28 +50,52 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) - @override + @overload def __call__( self, *, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, - **kwargs, + response: str, + context: str, ): - """Evaluate groundedless. Accepts either a response and context a single evaluation, - or a conversation for a multi-turn evaluation. 
If the conversation has more than one turn, - the evaluator will aggregate the results of each turn. - + """Evaluate groundedness for given input of response, context + + :keyword query: The query to be evaluated. + :paramtype query: str :keyword response: The response to be evaluated. - :paramtype response: Optional[str] + :paramtype response: str :keyword context: The context to be evaluated. - :paramtype context: Optional[str] + :paramtype context: str + :return: The groundedness score. + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate groundedness for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + ... 
+ + + @override + def __call__( + self, + *, + response: Optional[str] = None, + context: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 0ead00125c3d..8bc5de81d91d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -4,7 +4,7 @@ from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -59,27 +59,49 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate if protected material is present in your AI system's response. - + """Evaluate a given query/response pair for protected material + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The protected material score. + :rtype: Dict[str, Union[str, bool]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate a conversation for protected material + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :return: The protected material score. + :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 918df8b8f0aa..1e7c18a04cd0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -5,7 +5,7 @@ import os from typing import Optional -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -53,31 +53,54 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) - @override + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, + context: str, ): - """Evaluate relevance. Accepts either a response and context a single evaluation, - or a conversation for a multi-turn evaluation. 
If the conversation has more than one turn, - the evaluator will aggregate the results of each turn. - + """Evaluate relevance for given input of query, response, context + :keyword query: The query to be evaluated. - :paramtype query: Optional[str] + :paramtype query: str :keyword response: The response to be evaluated. - :paramtype response: Optional[str] + :paramtype response: str :keyword context: The context to be evaluated. - :paramtype context: Optional[str] + :paramtype context: str + :return: The relevance score. + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate relevance for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + ... 
+ + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + context: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 8db3b777f23c..133f126e4963 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -4,7 +4,7 @@ import logging from typing import Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics @@ -65,27 +65,50 @@ def __init__( eval_last_turn=eval_last_turn, ) - @override + + @overload def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, - **kwargs, + query: str, + response: str, ): - """ - Evaluate whether cross domain injected attacks are present in your AI system's response. - + """Evaluate whether cross domain injected attacks are present in given query/response + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str + :return: The sexual score + :rtype: Dict[str, Union[str, float]] + """ + ... + + @overload + def __call__( + self, + *, + conversation, + **kwargs, + ): + """Evaluate cross domain injected attacks are present in a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". 
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :return: The sexual score + :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """ + ... + + @override + def __call__( + self, + *, + query: Optional[str] = None, + response: Optional[str] = None, + conversation=None, + **kwargs, + ): return super().__call__(query=query, response=response, conversation=conversation, **kwargs) From 840d5286f628ca7b59ec56235348d89bdfc5b4c5 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 13:21:48 -0400 Subject: [PATCH 02/16] remove callable type hint --- .../_content_safety/_content_safety.py | 1 - .../azure/ai/evaluation/test.py | 56 +++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 9e3c15557b5b..b846193c3191 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -31,7 +31,6 @@ class ContentSafetyEvaluator(EvaluatorBase): :param kwargs: Additional arguments to pass to the evaluator. :type kwargs: Any :return: A function that evaluates content-safety metrics for "question-answering" scenario. 
- :rtype: Callable **Usage** diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py new file mode 100644 index 000000000000..3192d5bb07cb --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py @@ -0,0 +1,56 @@ +from azure.ai.evaluation import evaluate, RelevanceEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator +from azure.identity import AzureCliCredential +import os + +if __name__ == "__main__": + model_config = { + # OpenAI API key + "type": "azure_openai", + "azure_deployment": "gpt-35-turbo", + "azure_endpoint": "https://ai-neduvvurai952818858670.openai.azure.com", + "api_key": "dc2f807bc52448deafb28b23e2f146f6", + } + + + relevance_eval = RelevanceEvaluator(model_config) + + relevance_eval + + hate_unfairness_evaluator = ProtectedMaterialEvaluator( + azure_ai_project={ + "subscription_id": "b17253fa-f327-42d6-9686-f3e553e24763", + "resource_group_name": "rg-neduvvurai", + "project_name": "neduvvur-4217", + }, + credential=AzureCliCredential(), + ) + + print(hate_unfairness_evaluator( + query="Which tent is the most waterproof?", + response="The Alpine Explorer Tent is the most waterproof.", + )) + + """os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" + + datasets_folderpath = os.path.abspath(".") + input_path = os.path.join(datasets_folderpath, "data.jsonl") + + print(input_path) + + eval_result = evaluate( + evaluators={ + "relevance": relevance_eval, + "hate_unfairness": hate_unfairness_evaluator, + }, + data="C:/Users/neduvvur/azure-sdk-for-python/data.jsonl", + ) + + print(eval_result)""" + + + """print(relevance_eval( + query="Which tent is the most waterproof?", + response="The Alpine Explorer Tent is the most waterproof.", + context="From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", + ) + )""" \ No newline at end of file From fdd8bbdfaabe12c32d5b62da46d8f5d4bbd2ed65 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:28:26 -0400 Subject: [PATCH 03/16] add docstrings/type hints --- .../_evaluators/_coherence/_coherence.py | 21 +++++++-- .../_content_safety/_content_safety.py | 21 +++++++-- .../_content_safety/_hate_unfairness.py | 23 ++++++++-- .../_evaluators/_content_safety/_self_harm.py | 23 ++++++++-- .../_evaluators/_content_safety/_sexual.py | 20 ++++++-- .../_evaluators/_content_safety/_violence.py | 24 ++++++++-- .../_evaluators/_fluency/_fluency.py | 41 ++++++++++++++++- .../_groundedness/_groundedness.py | 24 ++++++++-- .../_protected_material.py | 20 ++++++-- .../_evaluators/_relevance/_relevance.py | 46 +++++++++---------- .../ai/evaluation/_evaluators/_xpia/xpia.py | 31 +++++++++---- 11 files changed, 232 insertions(+), 62 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 9d04f3ac679a..292b4e7b4f46 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, Union, List, Optional from typing_extensions import overload, override @@ -55,7 +55,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, float]: """Evaluate coherence for given input of query, response :keyword query: The query to be evaluated. 
@@ -73,7 +73,7 @@ def __call__( *, conversation, **kwargs, - ): + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -94,4 +94,19 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate coherence. Accepts either a query and response for a single evaluation, + or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of + turns, the evaluator will aggregate the results of each turn. + + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. 
+ :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index b846193c3191..9980586d756d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._evaluators._common import EvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation from ._hate_unfairness import HateUnfairnessEvaluator from ._self_harm import SelfHarmEvaluator @@ -83,7 +84,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a collection of content safety metrics for the given query/response pair :keyword query: The query to be evaluated. @@ -99,9 +100,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -122,6 +123,20 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate a collection of content safety metrics for the given query/response pair or conversation. + This inputs must supply either a query AND response, or a conversation, but not both. + + :keyword query: The query to evaluate. + :paramtype query: Optional[str] + :keyword response: The response to evaluate. 
+ :paramtype response: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The evaluation result. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) @override diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 638daf8dcb8a..7ddf9d7ae6a5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -1,13 +1,14 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -64,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate the given query/response pair for hateful content :keyword query: The query to be evaluated. 
@@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -103,4 +104,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether hateful content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 47023c1900c5..8a67cb69cd10 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -1,13 +1,14 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -64,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for self-harm content :keyword query: The query to be evaluated. @@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -103,4 +104,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether self-harm content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. 
+ :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 35b802a4c1d6..6bb20ccd0500 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override @@ -64,7 +64,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for sexual content :keyword query: The query to be evaluated. @@ -82,7 +82,7 @@ def __call__( *, conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] """Evaluate a conversation for sexual content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -103,4 +103,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether sexual content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". 
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 27762d03d497..a6b791993354 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -1,13 +1,14 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -64,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for violent content :keyword query: The query to be evaluated. @@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content :keyword conversation: The conversation to evaluate. 
Expected to contain a list of conversation turns under the @@ -103,4 +104,19 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether violent content is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + """ + return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 38cc2dcc9e9f..5ffb061a970c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -3,11 +3,12 @@ # --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, List, Optional, Union -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation class FluencyEvaluator(PromptyEvaluatorBase): @@ -50,6 +51,42 @@ def __init__(self, model_config): prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY) + @overload + def __call__( + self, 
+ *, + query: str, + response: str, + ) -> Dict[str, float]: + """Evaluate fleuncy in given query/response + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :return: The fluency score + :rtype: Dict[str, float] + """ + ... + + @overload + def __call__( + self, + *, + conversation: Conversation, + **kwargs, + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + """Evaluate cross domain injected attacks are present in a conversation + + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ + ... + @override def __call__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index c951388c4657..688cadeffe6f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -2,11 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation class GroundednessEvaluator(PromptyEvaluatorBase): @@ -56,7 +57,7 @@ def __call__( *, response: str, context: str, - ): + ) -> Dict[str, float]: """Evaluate groundedness for given input of response, context :keyword query: The query to be evaluated. @@ -74,9 +75,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate groundedness for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -98,4 +99,19 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate groundedless. Accepts either a response and context a single evaluation, + or a conversation for a multi-turn evaluation. If the conversation has more than one turn, + the evaluator will aggregate the results of each turn. + + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. 
+ :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + """ return super().__call__(response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 8bc5de81d91d..545c8c0c249a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override @@ -65,7 +65,7 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, bool]]: """Evaluate a given query/response pair for protected material :keyword query: The query to be evaluated. @@ -83,7 +83,7 @@ def __call__( *, conversation, **kwargs, - ): + ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -104,4 +104,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate if protected material is present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". 
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 1e7c18a04cd0..9da783ce1992 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -3,10 +3,11 @@ # --------------------------------------------------------- import os -from typing import Optional +from typing import Dict, Union, List, Optional from typing_extensions import overload, override +from azure.ai.evaluation._model_configurations import Conversation from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -60,36 +61,16 @@ def __call__( query: str, response: str, context: str, - ): - """Evaluate relevance for given input of query, response, context - - :keyword query: The query to be evaluated. - :paramtype query: str - :keyword response: The response to be evaluated. - :paramtype response: str - :keyword context: The context to be evaluated. - :paramtype context: str - :return: The relevance score. - :rtype: Dict[str, float] - """ + ) -> Dict[str, float]: ... @overload def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): - """Evaluate relevance for a conversation - - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages", and potentially a global context under the key "context". Conversation turns are expected - to be dictionaries with keys "content", "role", and possibly "context". 
- :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. - :rtype: Dict[str, Union[float, Dict[str, List[float]]]] - """ + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: ... @@ -103,4 +84,21 @@ def __call__( conversation=None, **kwargs, ): + """Evaluate relevance. Accepts either a response and context a single evaluation, + or a conversation for a multi-turn evaluation. If the conversation has more than one turn, + the evaluator will aggregate the results of each turn. + + :keyword query: The query to be evaluated. + :paramtype query: Optional[str] + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. + :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + """ return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 133f126e4963..c6f37e30d3fd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -2,13 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- import logging -from typing import Optional, Union +from typing import Dict, List, Optional, Union from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation logger = logging.getLogger(__name__) @@ -72,15 +73,15 @@ def __call__( *, query: str, response: str, - ): + ) -> Dict[str, Union[str, bool]]: """Evaluate whether cross domain injected attacks are present in given query/response :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str - :return: The sexual score - :rtype: Dict[str, Union[str, float]] + :return: The cross domain injection attack score + :rtype: Dict[str, Union[str, bool]] """ ... @@ -88,17 +89,17 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ): + ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate cross domain injected attacks are present in a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The sexual score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :return: The cross domain injection attack score + :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ ... 
@@ -111,4 +112,18 @@ def __call__( conversation=None, **kwargs, ): + """ + Evaluate whether cross domain injected attacks are present in your AI system's response. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages". Conversation turns are expected + to be dictionaries with keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The cross domain injection attack score + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) From ee1a410d155ec50ad6c210d12a9924063f9f3a1b Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:32:38 -0400 Subject: [PATCH 04/16] fix a typo --- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 5ffb061a970c..22dd63f928b4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -58,7 +58,7 @@ def __call__( query: str, response: str, ) -> Dict[str, float]: - """Evaluate fleuncy in given query/response + """Evaluate fluency in given query/response :keyword query: The query to be evaluated. 
:paramtype query: str @@ -76,7 +76,7 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: - """Evaluate cross domain injected attacks are present in a conversation + """Evaluate fluency for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected From d04684c3616c728b226b416310f779ff934f5930 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:33:49 -0400 Subject: [PATCH 05/16] remove file --- .../_groundedness/_groundedness.py | 2 +- .../azure/ai/evaluation/test.py | 56 ------------------- 2 files changed, 1 insertion(+), 57 deletions(-) delete mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 688cadeffe6f..99463b0607e7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -84,7 +84,7 @@ def __call__( key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. + :return: The groundedness score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ ... 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py deleted file mode 100644 index 3192d5bb07cb..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/test.py +++ /dev/null @@ -1,56 +0,0 @@ -from azure.ai.evaluation import evaluate, RelevanceEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator -from azure.identity import AzureCliCredential -import os - -if __name__ == "__main__": - model_config = { - # OpenAI API key - "type": "azure_openai", - "azure_deployment": "gpt-35-turbo", - "azure_endpoint": "https://ai-neduvvurai952818858670.openai.azure.com", - "api_key": "dc2f807bc52448deafb28b23e2f146f6", - } - - - relevance_eval = RelevanceEvaluator(model_config) - - relevance_eval - - hate_unfairness_evaluator = ProtectedMaterialEvaluator( - azure_ai_project={ - "subscription_id": "b17253fa-f327-42d6-9686-f3e553e24763", - "resource_group_name": "rg-neduvvurai", - "project_name": "neduvvur-4217", - }, - credential=AzureCliCredential(), - ) - - print(hate_unfairness_evaluator( - query="Which tent is the most waterproof?", - response="The Alpine Explorer Tent is the most waterproof.", - )) - - """os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" - - datasets_folderpath = os.path.abspath(".") - input_path = os.path.join(datasets_folderpath, "data.jsonl") - - print(input_path) - - eval_result = evaluate( - evaluators={ - "relevance": relevance_eval, - "hate_unfairness": hate_unfairness_evaluator, - }, - data="C:/Users/neduvvur/azure-sdk-for-python/data.jsonl", - ) - - print(eval_result)""" - - - """print(relevance_eval( - query="Which tent is the most waterproof?", - response="The Alpine Explorer Tent is the most waterproof.", - context="From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", - ) - )""" \ No newline at end of file From 7cbb85d24ef1e3279872513eae94a091c1c69458 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:37:58 -0400 Subject: [PATCH 06/16] remove a bad param --- .../ai/evaluation/_evaluators/_groundedness/_groundedness.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 99463b0607e7..6e2d536f1781 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -60,8 +60,6 @@ def __call__( ) -> Dict[str, float]: """Evaluate groundedness for given input of response, context - :keyword query: The query to be evaluated. - :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: str :keyword context: The context to be evaluated. 
From ce01a37f27e3ba47f8a94d14db480bc8b8134a78 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 14:39:25 -0400 Subject: [PATCH 07/16] add docs for relevance --- .../_evaluators/_relevance/_relevance.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 9da783ce1992..c163843b37b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -62,6 +62,18 @@ def __call__( response: str, context: str, ) -> Dict[str, float]: + """Evaluate groundedness for given input of query, response, context + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword context: The context to be evaluated. + :paramtype context: str + :return: The relevance score. + :rtype: Dict[str, float] + """ + ... @overload @@ -71,6 +83,15 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + """Evaluate relevance for a conversation + + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ ... 
From ebc1425968e2f14897a40466acd12ef085c7fda1 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 15:00:44 -0400 Subject: [PATCH 08/16] fix some missing type hints --- .../azure/ai/evaluation/_evaluators/_coherence/_coherence.py | 3 ++- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 5 +++-- .../_evaluators/_protected_material/_protected_material.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 292b4e7b4f46..e1e357407956 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -7,6 +7,7 @@ from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation class CoherenceEvaluator(PromptyEvaluatorBase): @@ -71,7 +72,7 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 6bb20ccd0500..63744b2985a7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -8,6 +8,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import 
RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -80,9 +81,9 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 545c8c0c249a..1bf2ae108292 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -81,7 +82,7 @@ def __call__( def __call__( self, *, - conversation, + conversation: Conversation, **kwargs, ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material From ab5a3d80fef79eed44e449b3e4e5738a4f7080da Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 25 Oct 2024 17:12:28 -0400 Subject: [PATCH 09/16] lint and run black --- .../azure/ai/evaluation/_common/_experimental.py | 6 ++++-- .../ai/evaluation/_evaluators/_coherence/_coherence.py | 4 ++-- .../_evaluators/_content_safety/_content_safety.py | 4 ++-- .../_evaluators/_content_safety/_hate_unfairness.py | 6 +++--- 
.../evaluation/_evaluators/_content_safety/_self_harm.py | 4 ++-- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 6 +++--- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 6 +++--- .../evaluation/_evaluators/_groundedness/_groundedness.py | 5 ++--- .../_evaluators/_protected_material/_protected_material.py | 4 ++-- .../ai/evaluation/_evaluators/_relevance/_relevance.py | 7 +++---- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 7 +++---- .../azure/ai/evaluation/simulator/_simulator.py | 1 - 13 files changed, 31 insertions(+), 33 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py index ca676c9bcdc9..6728a61649c6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py @@ -27,11 +27,13 @@ @overload -def experimental(wrapped: Type[T]) -> Type[T]: ... +def experimental(wrapped: Type[T]) -> Type[T]: + ... @overload -def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ... +def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: + ... 
def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index e1e357407956..6aefc1eecdfb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -58,7 +58,7 @@ def __call__( response: str, ) -> Dict[str, float]: """Evaluate coherence for given input of query, response - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -76,7 +76,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 7652fe70df6b..ae1325285266 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -87,7 +87,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a collection of content safety metrics for the given query/response pair - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. 
@@ -105,7 +105,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 16ff9237e394..7c3acf13ea17 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -58,7 +58,7 @@ def __init__( credential=credential, eval_last_turn=eval_last_turn, ) - + @overload def __call__( self, @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate the given query/response pair for hateful content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -85,7 +85,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 2c7efff8de5e..a613ae259eab 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for self-harm content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -85,7 +85,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 4ec50b1df3e3..6de8e76a70b8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for sexual content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. 
@@ -83,9 +83,9 @@ def __call__( *, conversation: Conversation, **kwargs, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 193441fc35a5..2c66a681c9c0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -67,7 +67,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, float]]: """Evaluate a given query/response pair for violent content - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -85,7 +85,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 22dd63f928b4..2bad778f2034 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -59,7 +59,7 @@ def __call__( response: str, ) -> Dict[str, float]: """Evaluate fluency in given query/response - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -76,8 +76,8 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: - """Evaluate fluency for a conversation - + """Evaluate fluency for a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 3260ec536c4b..5d39fcc9c79f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -59,7 +59,7 @@ def __call__( context: str, ) -> Dict[str, float]: """Evaluate groundedness for given input of response, context - + :keyword response: The response to be evaluated. :paramtype response: str :keyword context: The context to be evaluated. 
@@ -77,7 +77,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate groundedness for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". @@ -86,7 +86,6 @@ def __call__( :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ ... - @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 2250fecf2dbb..379f0d47e5ac 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -68,7 +68,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, bool]]: """Evaluate a given query/response pair for protected material - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -86,7 +86,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index c163843b37b3..b871297dae2f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -63,7 +63,7 @@ def __call__( context: str, ) -> Dict[str, float]: """Evaluate relevance for given input of query, response, context - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -73,7 +73,7 @@ def __call__( :return: The relevance score. :rtype: Dict[str, float] """ - + ... @overload @@ -84,7 +84,7 @@ def __call__( **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate relevance for a conversation - + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". @@ -93,7 +93,6 @@ def __call__( :return: The relevance score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ ... 
- @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 0b5935b71597..eb0874a1deff 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -66,7 +66,6 @@ def __init__( eval_last_turn=eval_last_turn, ) - @overload def __call__( self, @@ -75,7 +74,7 @@ def __call__( response: str, ) -> Dict[str, Union[str, bool]]: """Evaluate whether cross domain injected attacks are present in given query/response - + :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. @@ -92,8 +91,8 @@ def __call__( conversation: Conversation, **kwargs, ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: - """Evaluate cross domain injected attacks are present in a conversation - + """Evaluate cross domain injected attacks are present in a conversation + :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages", and potentially a global context under the key "context". Conversation turns are expected to be dictionaries with keys "content", "role", and possibly "context". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py index b6fcca19fb29..835f623612ed 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py @@ -226,7 +226,6 @@ async def _simulate_with_predefined_turns( semaphore = asyncio.Semaphore(concurrent_async_tasks) progress_bar_lock = asyncio.Lock() - async def run_simulation(simulation: List[Union[str, Dict[str, Any]]]) -> JsonLineChatProtocol: async with semaphore: current_simulation = ConversationHistory() From ee3a1db636d7753629f28ff00495b6aaba476079 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Mon, 28 Oct 2024 12:56:28 -0400 Subject: [PATCH 10/16] merge with main --- .../evaluation/_evaluators/_coherence/_coherence.py | 8 ++------ .../ai/evaluation/_evaluators/_common/_base_eval.py | 2 +- .../_evaluators/_common/_base_rai_svc_eval.py | 9 +++------ .../_evaluators/_content_safety/_content_safety.py | 8 ++------ .../_evaluators/_content_safety/_hate_unfairness.py | 8 ++------ .../_evaluators/_content_safety/_self_harm.py | 8 ++------ .../_evaluators/_content_safety/_sexual.py | 8 ++------ .../_evaluators/_content_safety/_violence.py | 8 ++------ .../ai/evaluation/_evaluators/_fluency/_fluency.py | 8 ++------ .../_evaluators/_groundedness/_groundedness.py | 8 ++------ .../_protected_material/_protected_material.py | 5 ++--- .../evaluation/_evaluators/_relevance/_relevance.py | 13 +++++-------- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 12 ++++-------- 13 files changed, 31 insertions(+), 74 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 6aefc1eecdfb..067f236b01e4 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -73,7 +73,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate coherence for a conversation @@ -89,10 +88,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate coherence. Accepts either a query and response for a single evaluation, @@ -110,4 +106,4 @@ def __call__( :return: The relevance score. :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index ef7a38e58e6f..1afb2eef668a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -87,7 +87,7 @@ def __init__( # This needs to be overridden just to change the function header into something more informative, # and to be able to add a more specific docstring. The actual function contents should just be # super().__call__() - def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: + def __call__(self, *args, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """Evaluate a given input. 
This method serves as a wrapper and is meant to be overridden by child classes for one main reason - to overwrite the method headers and docstring to include additional inputs as needed. The actual behavior of this function shouldn't change beyond adding more inputs to the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 28d00f7977b6..5d9a60d14e74 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -18,7 +18,7 @@ from . import EvaluatorBase -T = Union[str, float] +T = Union[str, float, bool] class RaiServiceEvaluatorBase(EvaluatorBase[T]): @@ -52,10 +52,7 @@ def __init__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate either a query and response or a conversation. 
Must supply either a query AND response, @@ -71,7 +68,7 @@ def __call__( :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) @override async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index ae1325285266..0e3e1a3bfc08 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -102,7 +102,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation @@ -118,10 +117,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate a collection of content safety metrics for the given query/response pair or conversation. @@ -138,7 +134,7 @@ def __call__( :return: The evaluation result. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 7c3acf13ea17..1fa3e55d7583 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -118,4 +114,4 @@ def __call__( :return: The fluency score. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index a613ae259eab..5f43c8874f4f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -118,4 +114,4 @@ def __call__( :return: The fluency score. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 6de8e76a70b8..d8d71fa38515 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -118,4 +114,4 @@ def __call__( :return: The fluency score. 
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 2c66a681c9c0..df16fab93ed0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -82,7 +82,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content @@ -98,10 +97,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -119,4 +115,4 @@ def __call__( :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 2bad778f2034..1fc7e6838b02 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -74,7 +74,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate fluency for a conversation @@ -90,10 +89,7 @@ def __call__( 
@override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -112,4 +108,4 @@ def __call__( :return: The fluency score. :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 5d39fcc9c79f..9df5861ea356 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -74,7 +74,6 @@ def __call__( self, *, conversation: Conversation, - **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: """Evaluate groundedness for a conversation @@ -90,10 +89,7 @@ def __call__( @override def __call__( self, - *, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate groundedness. Accepts either a response and context a single evaluation, @@ -111,4 +107,4 @@ def __call__( :return: The relevance score. 
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(response=response, context=context, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 379f0d47e5ac..0c7a23d19a8c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -83,8 +83,7 @@ def __call__( self, *, conversation: Conversation, - **kwargs, - ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: """Evaluate a conversation for protected material :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -117,6 +116,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. 
- :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]] """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index b871297dae2f..1fedc6d037e9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -57,10 +57,11 @@ def __init__(self, model_config): @overload def __call__( self, - *, + *args, query: str, response: str, context: str, + **kwargs, ) -> Dict[str, float]: """Evaluate groundedness for given input of query, response, context @@ -79,7 +80,7 @@ def __call__( @overload def __call__( self, - *, + *args, conversation: Conversation, **kwargs, ) -> Dict[str, Union[float, Dict[str, List[float]]]]: @@ -97,11 +98,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - context: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """Evaluate relevance. Accepts either a response and context a single evaluation, @@ -121,4 +118,4 @@ def __call__( :return: The relevance score. 
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] """ - return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index eb0874a1deff..1640cd6fa5ab 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -89,8 +89,7 @@ def __call__( self, *, conversation: Conversation, - **kwargs, - ) -> Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: """Evaluate cross domain injected attacks are present in a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -105,10 +104,7 @@ def __call__( @override def __call__( self, - *, - query: Optional[str] = None, - response: Optional[str] = None, - conversation=None, + *args, **kwargs, ): """ @@ -123,6 +119,6 @@ def __call__( to be dictionaries with keys "content" and "role". 
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The cross domain injection attack score - :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]] + :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]] """ - return super().__call__(query=query, response=response, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) From 35ecd53aff927857cac52f78af102b7690d13622 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Mon, 28 Oct 2024 20:05:59 -0400 Subject: [PATCH 11/16] fix some mypy errors, not all pylint --- .../_evaluators/_coherence/_coherence.py | 10 ++-- .../_evaluators/_common/_base_prompty_eval.py | 6 ++- .../_evaluators/_common/_base_rai_svc_eval.py | 4 +- .../_content_safety/_content_safety.py | 12 ++--- .../_content_safety/_hate_unfairness.py | 14 +++--- .../_evaluators/_content_safety/_self_harm.py | 12 ++--- .../_evaluators/_content_safety/_sexual.py | 10 ++-- .../_evaluators/_content_safety/_violence.py | 12 ++--- .../_evaluators/_fluency/_fluency.py | 10 ++-- .../_groundedness/_groundedness.py | 10 ++-- .../_protected_material.py | 4 +- .../ai/evaluation/_evaluators/_qa/_qa.py | 6 +-- .../_evaluators/_relevance/_relevance.py | 17 +++---- .../_evaluators/_retrieval/_retrieval.py | 50 ++++++++++++++++--- .../_service_groundedness.py | 47 ++++++++++++++--- .../ai/evaluation/_evaluators/_xpia/xpia.py | 6 +-- 16 files changed, 139 insertions(+), 91 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 067f236b01e4..01d374704542 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -2,7 +2,7 @@ # 
Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import os -from typing import Dict, Union, List, Optional +from typing import Dict, Union, List from typing_extensions import overload, override @@ -10,7 +10,7 @@ from azure.ai.evaluation._model_configurations import Conversation -class CoherenceEvaluator(PromptyEvaluatorBase): +class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize a coherence evaluator configured for a specific Azure OpenAI model. @@ -56,7 +56,7 @@ def __call__( *, query: str, response: str, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate coherence for given input of query, response :keyword query: The query to be evaluated. @@ -66,14 +66,13 @@ def __call__( :return: The coherence score. :rtype: Dict[str, float] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate coherence for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -83,7 +82,6 @@ def __call__( :return: The coherence score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... 
@override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index e02f29ad0def..e851e8499260 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -4,7 +4,7 @@ import math import re -from typing import Dict, Union +from typing import Dict, TypeVar, Union from promptflow.core import AsyncPrompty from typing_extensions import override @@ -18,8 +18,10 @@ except ImportError: USER_AGENT = "None" +T = TypeVar("T") -class PromptyEvaluatorBase(EvaluatorBase[float]): + +class PromptyEvaluatorBase(EvaluatorBase[T]): """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators make use of a prompty file, and return their results as a dictionary, with a single key-value pair linking the result name to a float value (unless multi-turn evaluation occurs, in which case the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 5d9a60d14e74..4a82c1fef20a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, Optional, Union +from typing import Dict, TypeVar, Union from typing_extensions import override @@ -18,7 +18,7 @@ from . 
import EvaluatorBase -T = Union[str, float, bool] +T = TypeVar("T") class RaiServiceEvaluatorBase(EvaluatorBase[T]): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 0e3e1a3bfc08..597462f6952d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from concurrent.futures import as_completed -from typing import Callable, Dict, List, Optional, Union +from typing import Callable, Dict, List, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor from typing_extensions import overload, override @@ -18,7 +18,7 @@ @experimental -class ContentSafetyEvaluator(EvaluatorBase): +class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): """ Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario. @@ -95,14 +95,13 @@ def __call__( :return: The content safety scores. :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a collection of content safety metrics for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -110,9 +109,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The content safety scores. 
- :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -132,7 +130,7 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The evaluation result. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 1fa3e55d7583..b4f85d84d639 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class HateUnfairnessEvaluator(RaiServiceEvaluatorBase): +class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a hate-unfairness evaluator for hate unfairness score. @@ -75,14 +75,13 @@ def __call__( :return: The hate score :rtype: Dict[str, Union[str, float]] """ - ... 
@overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for hateful content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The hate score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -112,6 +110,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ - return super().__call__(*args,**kwargs) + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 5f43c8874f4f..e9831a065c66 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class SelfHarmEvaluator(RaiServiceEvaluatorBase): +class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a self harm evaluator for self harm score. @@ -75,14 +75,13 @@ def __call__( :return: The self-harm score :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for self-harm content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The self-harm score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -112,6 +110,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. 
- :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index d8d71fa38515..575450a15efa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class SexualEvaluator(RaiServiceEvaluatorBase): +class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a sexual evaluator for sexual score. @@ -75,14 +75,13 @@ def __call__( :return: The sexual score :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for sexual content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". 
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The sexual score - :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index df16fab93ed0..dfe5c1445d59 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -12,7 +12,7 @@ @experimental -class ViolenceEvaluator(RaiServiceEvaluatorBase): +class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): """ Initialize a violence evaluator for violence score. @@ -75,14 +75,13 @@ def __call__( :return: The content safety score. :rtype: Dict[str, Union[str, float]] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate a conversation for violent content :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -90,9 +89,8 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The violence score. 
- :rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ - ... @override def __call__( @@ -112,7 +110,7 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score. - :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 1045ee9d0599..31553ca29780 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- import os -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -11,7 +11,7 @@ from azure.ai.evaluation._model_configurations import Conversation -class FluencyEvaluator(PromptyEvaluatorBase): +class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize a fluency evaluator configured for a specific Azure OpenAI model. @@ -54,7 +54,7 @@ def __call__( self, *, response: str, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate fluency in given query/response :keyword response: The response to be evaluated. @@ -62,14 +62,13 @@ def __call__( :return: The fluency score :rtype: Dict[str, float] """ - ... 
@overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate fluency for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -79,7 +78,6 @@ def __call__( :return: The fluency score :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... @override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index fbfba6450cf7..f20e2d5f8637 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -17,7 +17,7 @@ USER_AGENT = "None" -class GroundednessEvaluator(PromptyEvaluatorBase): +class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize a groundedness evaluator configured for a specific Azure OpenAI model. @@ -70,7 +70,7 @@ def __call__( response: str, context: str, query: Optional[str] = None, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate groundedness for given input of response, context :keyword response: The response to be evaluated. @@ -83,14 +83,13 @@ def __call__( :return: The groundedness score. :rtype: Dict[str, float] """ - ... @overload def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate groundedness for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -100,7 +99,6 @@ def __call__( :return: The groundedness score. 
:rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... @override def __call__( @@ -125,7 +123,7 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ if kwargs.get("query", None): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 3c96970f3823..70835cf73d67 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -13,7 +13,7 @@ @experimental -class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase): +class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): """ Initialize a protected material evaluator to detect whether protected material is present in the AI system's response. The evaluator outputs a Boolean label (`True` or `False`) @@ -81,7 +81,6 @@ def __call__( :return: The protected material score. :rtype: Dict[str, Union[str, bool]] """ - ... @overload def __call__( @@ -98,7 +97,6 @@ def __call__( :return: The protected material score. :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ - ... 
@override def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index e8198ff85e89..f27ce10c721c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- from concurrent.futures import as_completed -from typing import Callable, Dict, List +from typing import Callable, Dict, List, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor @@ -58,7 +58,7 @@ class QAEvaluator: def __init__(self, model_config, parallel: bool = True): self._parallel = parallel - self._evaluators: List[Callable[..., Dict[str, float]]] = [ + self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [ GroundednessEvaluator(model_config), RelevanceEvaluator(model_config), CoherenceEvaluator(model_config), @@ -82,7 +82,7 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :keyword parallel: Whether to evaluate in parallel. Defaults to True. :paramtype parallel: bool :return: The scores for QA scenario. 
- :rtype: Dict[str, float] + :rtype: Dict[str, Union[str, float]] """ results: Dict[str, float] = {} if self._parallel: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 35f641b3e6f1..bb5a70fb06a0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- import os -from typing import Dict, Union, List, Optional +from typing import Dict, Union, List from typing_extensions import overload, override @@ -56,11 +56,10 @@ def __init__(self, model_config): @overload def __call__( self, - *args, + *, query: str, response: str, - **kwargs, - ) -> Dict[str, float]: + ) -> Dict[str, Union[str, float]]: """Evaluate groundedness for given input of query, response, context :keyword query: The query to be evaluated. @@ -71,15 +70,12 @@ def __call__( :rtype: Dict[str, float] """ - ... - @overload def __call__( self, - *args, + *, conversation: Conversation, - **kwargs, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate relevance for a conversation :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the @@ -89,7 +85,6 @@ def __call__( :return: The relevance score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - ... @override def __call__( @@ -110,6 +105,6 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. 
- :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 748c4e4904b0..5ea83ce48c62 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -6,12 +6,15 @@ import logging import math import os -from typing import Optional +from typing import Dict, List, Union +from typing_extensions import overload from promptflow._utils.async_utils import async_run_allowing_running_loop from promptflow.core import AsyncPrompty +from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from azure.ai.evaluation._model_configurations import Conversation from ..._common.math import list_mean_nan_safe from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score @@ -107,7 +110,7 @@ async def __call__(self, *, query, context, conversation, **kwargs): } -class RetrievalEvaluator: +class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -152,10 +155,42 @@ class RetrievalEvaluator: however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. 
""" - def __init__(self, model_config): + def __init__(self, model_config): # pylint: disable=super-init-not-called self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config) - def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None, conversation=None, **kwargs): + @overload + def __call__( + self, + *, + query: str, + context: str, + ) -> Dict[str, Union[str, float]]: + """Evaluates retrieval for a given a query and context + + :keyword query: The query to be evaluated. Mutually exclusive with `conversation` parameter. + :paramtype query: Optional[str] + :keyword context: The context to be evaluated. Mutually exclusive with `conversation` parameter. + :paramtype context: Optional[str] + :return: The scores for Chat scenario. + :rtype: Dict[str, Union[str, float]] + """ + + @overload + def __call__( + self, + *, + conversation: Conversation, + ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn, + the evaluator will aggregate the results of each turn. + + :keyword conversation: The conversation to be evaluated. + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The scores for Chat scenario. + :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + """ + + def __call__(self, *args, **kwargs): """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. @@ -169,6 +204,10 @@ def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None :return: The scores for Chat scenario. 
:rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + query = kwargs.get("query", None) + context = kwargs.get("context", None) + conversation = kwargs.get("conversation", None) + if (query is None or context is None) and conversation is None: msg = "Either a pair of 'query'/'context' or 'conversation' must be provided." raise EvaluationException( @@ -192,6 +231,3 @@ def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None return async_run_allowing_running_loop( self._async_evaluator, query=query, context=context, conversation=conversation, **kwargs ) - - def _to_async(self): - return self._async_evaluator diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index 83780f6506ef..c617c977acde 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -1,16 +1,17 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import Optional, Dict -from typing_extensions import override +from typing import List, Optional, Union, Dict +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental -class GroundednessProEvaluator(RaiServiceEvaluatorBase): +class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): """ Initialize a Groundedness Pro evaluator for determine if the response is grounded in the query and context. @@ -100,14 +101,48 @@ def __init__( **kwargs, ) - @override + @overload def __call__( self, *, query: Optional[str] = None, response: Optional[str] = None, context: Optional[str] = None, - conversation=None, + ) -> Dict[str, Union[str, bool]]: + """Evaluate groundedness for a given query/response/context + + :keyword query: The query to be evaluated. + :paramtype query: Optional[str] + :keyword response: The response to be evaluated. + :paramtype response: Optional[str] + :keyword context: The context to be evaluated. + :paramtype context: Optional[str] + :return: The relevance score. + :rtype: Dict[str, Union[str, bool]] + """ + + @overload + def __call__( + self, + *, + conversation: Conversation, + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: + """Evaluate groundednessf for a conversation for a multi-turn evaluation. If the conversation has + more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results + available in the output under the "evaluation_per_turn" key. + + :keyword conversation: The conversation to evaluate. 
Expected to contain a list of conversation turns under the + key "messages", and potentially a global context under the key "context". Conversation turns are expected + to be dictionaries with keys "content", "role", and possibly "context". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The relevance score. + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]] + """ + + @override + def __call__( + self, + *args, **kwargs, ): """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a @@ -128,7 +163,7 @@ def __call__( :return: The relevance score. :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]] """ - return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) + return super().__call__(*args, **kwargs) @override async def _do_eval(self, eval_input: Dict): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 1640cd6fa5ab..efcc7b5032a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import logging -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from typing_extensions import overload, override @@ -15,7 +15,7 @@ @experimental -class IndirectAttackEvaluator(RaiServiceEvaluatorBase): +class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator. Detect whether cross domain injected attacks are present in your AI system's response. 
@@ -82,7 +82,6 @@ def __call__( :return: The cross domain injection attack score :rtype: Dict[str, Union[str, bool]] """ - ... @overload def __call__( @@ -99,7 +98,6 @@ def __call__( :return: The cross domain injection attack score :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]] """ - ... @override def __call__( From d3aacc191e3e79923b01ee7213f34a528f3e4c02 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 10:34:36 -0400 Subject: [PATCH 12/16] fix black errors --- .../azure/ai/evaluation/_common/_experimental.py | 6 ++---- .../ai/evaluation/_evaluators/_coherence/_coherence.py | 2 +- .../azure/ai/evaluation/_evaluators/_common/_base_eval.py | 6 +++++- .../ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py | 4 ++-- .../_evaluators/_content_safety/_content_safety.py | 2 +- .../_evaluators/_content_safety/_hate_unfairness.py | 2 +- .../ai/evaluation/_evaluators/_content_safety/_self_harm.py | 2 +- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 2 +- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 2 +- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 2 +- .../evaluation/_evaluators/_groundedness/_groundedness.py | 2 +- .../azure/ai/evaluation/_evaluators/_qa/_qa.py | 2 +- .../ai/evaluation/_evaluators/_relevance/_relevance.py | 2 +- .../ai/evaluation/_evaluators/_retrieval/_retrieval.py | 2 +- .../_service_groundedness/_service_groundedness.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 2 +- 16 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py index 6728a61649c6..ca676c9bcdc9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py @@ -27,13 +27,11 @@ @overload -def 
experimental(wrapped: Type[T]) -> Type[T]: - ... +def experimental(wrapped: Type[T]) -> Type[T]: ... @overload -def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: - ... +def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ... def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 01d374704542..a07754b69d56 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -84,7 +84,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index d5431045179d..71ade7fa33b9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -88,7 +88,11 @@ def __init__( # This needs to be overridden just to change the function header into something more informative, # and to be able to add a more specific docstring. The actual function contents should just be # super().__call__() - def __call__(self, *args, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """Evaluate a given input. 
This method serves as a wrapper and is meant to be overridden by child classes for one main reason - to overwrite the method headers and docstring to include additional inputs as needed. The actual behavior of this function shouldn't change beyond adding more inputs to the diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 4a82c1fef20a..cb687d23e695 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -50,7 +50,7 @@ def __init__( self._credential = credential @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, @@ -105,7 +105,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: ) input_data["context"] = context - return await evaluate_with_rai_service( + return await evaluate_with_rai_service( # type: ignore metric_name=self._eval_metric, data=input_data, project_scope=self._azure_ai_project, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 597462f6952d..05ad3c7539d8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -113,7 +113,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index b4f85d84d639..aed7deb827f1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index e9831a065c66..9d7693018a28 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 575450a15efa..12c6b9ab2578 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index dfe5c1445d59..7867d563521b 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -93,7 +93,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 31553ca29780..dd995f1cf367 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -80,7 +80,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index f20e2d5f8637..5215396aa9d2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -101,7 +101,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index f27ce10c721c..b5f3ac810eff 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -84,7 +84,7 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :return: The 
scores for QA scenario. :rtype: Dict[str, Union[str, float]] """ - results: Dict[str, float] = {} + results: Dict[str, Union[str, float]] = {} if self._parallel: with ThreadPoolExecutor() as executor: futures = { diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index bb5a70fb06a0..f5fb2d96360b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -87,7 +87,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 5ea83ce48c62..371453d682e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -190,7 +190,7 @@ def __call__( :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - def __call__(self, *args, **kwargs): + def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index c617c977acde..be0d249c99b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -127,7 +127,7 @@ def __call__( *, conversation: Conversation, ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: - """Evaluate groundednessf for a conversation for a multi-turn evaluation. If the conversation has + """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results available in the output under the "evaluation_per_turn" key. 
@@ -140,7 +140,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index efcc7b5032a9..079b035e7e93 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -100,7 +100,7 @@ def __call__( """ @override - def __call__( + def __call__( # pylint: disable=docstring-missing-param self, *args, **kwargs, From 28524258cb65591bf02be16f5814949b717ae2dc Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 11:30:38 -0400 Subject: [PATCH 13/16] attempt to fix tests --- .../evaluation/_evaluators/_common/_base_eval.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 71ade7fa33b9..09fe859dd01f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -7,7 +7,7 @@ from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final from promptflow._utils.async_utils import async_run_allowing_running_loop -from typing_extensions import ParamSpec, TypeAlias +from typing_extensions import ParamSpec, TypeAlias, get_overloads from azure.ai.evaluation._common.math import list_mean from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException @@ -131,11 +131,18 @@ def _derive_singleton_inputs(self) -> List[str]: :rtype: List[str] """ + overloads = get_overloads(self.__call__) 
+ if not overloads: + call_signatures = [inspect.signature(self.__call__)] + else: + call_signatures = [inspect.signature(overload) for overload in overloads] call_signature = inspect.signature(self.__call__) singletons = [] - for param in call_signature.parameters: - if param not in self._not_singleton_inputs: - singletons.append(param) + for call_signature in call_signatures: + params = call_signature.parameters + if any([not_singleton_input in params for not_singleton_input in self._not_singleton_inputs]): + continue + singletons.extend(params) return singletons def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]: From 0d76ce6f6f1cdb40560fbad69d1ad92059a5857b Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 13:42:52 -0400 Subject: [PATCH 14/16] fix retrieval --- .../_evaluators/_common/_base_eval.py | 3 ++- .../ai/evaluation/_evaluators/_eci/_eci.py | 26 ++++++++++++++++++- .../_evaluators/_retrieval/_retrieval.py | 6 ++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 09fe859dd01f..cc6ff6e311df 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -142,7 +142,8 @@ def _derive_singleton_inputs(self) -> List[str]: params = call_signature.parameters if any([not_singleton_input in params for not_singleton_input in self._not_singleton_inputs]): continue - singletons.extend(params) + # exclude self since it is not a singleton input + singletons.extend([p for p in params if p != "self"]) return singletons def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]: diff --git 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index 3e6e420e9305..c89df72fb13a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -1,11 +1,12 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing_extensions import override +from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase +from azure.ai.evaluation._model_configurations import Conversation @experimental @@ -62,3 +63,26 @@ def __init__( credential=credential, eval_last_turn=eval_last_turn, ) + + @overload + def __call__( + self, + *, + query: str, + response: str, + ): ... + + @overload + def __call__( + self, + *, + conversation: Conversation, + ): ... + + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 371453d682e3..b23cf62b10be 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -204,9 +204,9 @@ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param :return: The scores for Chat scenario. 
:rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ - query = kwargs.get("query", None) - context = kwargs.get("context", None) - conversation = kwargs.get("conversation", None) + query = kwargs.pop("query", None) + context = kwargs.pop("context", None) + conversation = kwargs.pop("conversation", None) if (query is None or context is None) and conversation is None: msg = "Either a pair of 'query'/'context' or 'conversation' must be provided." From b8a1777cbef17b8848d676f684989600b6c2ce40 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Tue, 29 Oct 2024 13:52:42 -0400 Subject: [PATCH 15/16] fix up tests and lint --- .../azure/ai/evaluation/_evaluators/_common/_base_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index cc6ff6e311df..8969d4dae9a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -140,7 +140,7 @@ def _derive_singleton_inputs(self) -> List[str]: singletons = [] for call_signature in call_signatures: params = call_signature.parameters - if any([not_singleton_input in params for not_singleton_input in self._not_singleton_inputs]): + if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs): continue # exclude self since it is not a singleton input singletons.extend([p for p in params if p != "self"]) From 5aa71430e11c9da793cd89aeee6f481438008e4c Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Wed, 30 Oct 2024 15:15:28 -0400 Subject: [PATCH 16/16] fix some docstrings to mark some things as optional --- .../_evaluators/_content_safety/_hate_unfairness.py | 4 ++-- .../ai/evaluation/_evaluators/_content_safety/_self_harm.py | 4 ++-- 
.../ai/evaluation/_evaluators/_content_safety/_sexual.py | 4 ++-- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_fluency/_fluency.py | 2 +- .../_evaluators/_protected_material/_protected_material.py | 4 ++-- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index aed7deb827f1..afb645d45768 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether hateful content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 9d7693018a28..66cc70280737 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether self-harm content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 12c6b9ab2578..dbf7a2a0ae12 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether sexual content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". 
Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 7867d563521b..f43c08726dcd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -102,9 +102,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether violent content is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index dd995f1cf367..66c162a03993 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -91,7 +91,7 @@ def __call__( # pylint: disable=docstring-missing-param the evaluator will aggregate the results of each turn. :keyword response: The response to be evaluated. Mutually exclusive with the "conversation" parameter. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". 
Conversation turns are expected to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 70835cf73d67..fb7dc8aefcb3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -111,9 +111,9 @@ def __call__( Evaluate if protected material is present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 079b035e7e93..9d591f8d75b7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -109,9 +109,9 @@ def __call__( # pylint: disable=docstring-missing-param Evaluate whether cross domain injected attacks are present in your AI system's response. :keyword query: The query to be evaluated. - :paramtype query: str + :paramtype query: Optional[str] :keyword response: The response to be evaluated. 
- :paramtype response: str + :paramtype response: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".